# Original Code

**The following code runs MICE (Multiple Imputation by Chained Equations) with its default settings on the mtcars_missing dataset:**

In [None]:
# Load libraries
import numpy as np
import pandas as pd
import io




# Read file
from google.colab import files
uploaded = files.upload()
dat = pd.read_csv(io.BytesIO(uploaded['mtcars_missing.csv']), sep = ",")


# Select numerical variables
numerical_vars = list(set(dat.columns) - set(['model']))


# Import MICE method
!pip install fancyimpute
from fancyimpute import IterativeImputer as MICE


# 3) Define "model"
model = MICE()


# 4) Train "model"
model.fit(dat[numerical_vars])


# 5) "Predict"
dat[numerical_vars] = model.transform(dat[numerical_vars])

Saving mtcars_missing.csv to mtcars_missing.csv
Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting knnimpute>=0.1.0 (from fancyimpute)
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nose (from fancyimpute)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29880 sha256=6fdcec08e3eaa2e0f2b3f17b190673f38cf63eaaba84a52b15f8c169eb82cffd
  Stored in directory: /root/.cache/pip/wheels/7b/0c/d3/ee82d1fbdcc0858d96434af108608d01703505d453720c84ed
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel fo

Let's see the results.

This is the original dataset.

In [None]:
# Parse the uploaded CSV bytes again into a separate frame, so the values
# from before imputation can be inspected next to the imputed `dat`.
raw_csv = io.BytesIO(uploaded['mtcars_missing.csv'])
old_dat = pd.read_csv(raw_csv, sep = ",")
old_dat.head()

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1.0,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1.0,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0.0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0.0,3,2


This is the dataset after running the code and applying MICE.

In [None]:
# First five rows of `dat` after the numerical columns were overwritten
# with the imputer's output.
dat.head(n=5)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110.0,3.9,2.62,16.46,0.0,0.610321,4.0,4.0
1,Mazda RX4 Wag,21.0,6.0,160.0,110.0,3.9,2.875,17.02,0.0,1.0,4.0,4.0
2,Datsun 710,22.8,4.0,108.0,93.0,3.85,2.32,18.61,1.0,1.0,4.0,1.0
3,Hornet 4 Drive,21.4,6.0,258.0,110.0,3.08,3.215,19.44,1.0,0.0,3.0,1.0
4,Hornet Sportabout,18.7,8.0,360.0,175.0,3.15,3.44,17.02,0.0,0.0,3.0,2.0


We can see that in the first row, the original missing value of *am* has been imputed as 0.610321.

# Exercise

**Modify the code so that MICE uses a Decision Tree model to impute the missing values. (Hint: change the line `model = MICE()` to select the model that MICE should apply.)**

In [None]:
# Read file
# Re-parses the bytes uploaded earlier, so `dat` starts again from the
# original data with its missing values.
dat = pd.read_csv(io.BytesIO(uploaded['mtcars_missing.csv']), sep = ",")


# Select numerical variables: every column except the car name.
# Built in the file's own column order instead of through a set difference,
# because the iteration order of a set of strings can differ between kernel
# sessions and would change the column order given to the imputer.
numerical_vars = [col for col in dat.columns if col != 'model']


# Import MICE method and the estimator it will use
from fancyimpute import IterativeImputer as MICE
from sklearn.tree import DecisionTreeRegressor # ADDED!!!


# 3) Define "model"
# A fixed seed is passed to both the tree and the imputer so that repeated
# runs produce the same imputed values (per the scikit-learn documentation,
# tree fitting draws on a random state by default).
RANDOM_STATE = 0
model = MICE(
    estimator=DecisionTreeRegressor(random_state=RANDOM_STATE),
    random_state=RANDOM_STATE,
) # MODIFIED!!!


# 4) Train "model"
model.fit(dat[numerical_vars])


# 5) "Predict"
# Overwrites the numerical columns of `dat` in place with the imputed values;
# the 'model' column is left untouched.
dat[numerical_vars] = model.transform(dat[numerical_vars])

**Check the differences in the imputed values. Which one looks like the better approach?**

Let's analyze the new dataset.

In [None]:
# First five rows of `dat` after imputation with the decision-tree model.
dat.head(n=5)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6.0,160.0,110.0,3.9,2.62,16.46,0.0,1.0,4.0,4.0
1,Mazda RX4 Wag,21.0,6.0,160.0,110.0,3.9,2.875,17.02,0.0,1.0,4.0,4.0
2,Datsun 710,22.8,4.0,108.0,93.0,3.85,2.32,18.61,1.0,1.0,4.0,1.0
3,Hornet 4 Drive,21.4,6.0,258.0,110.0,3.08,3.215,19.44,1.0,0.0,3.0,1.0
4,Hornet Sportabout,18.7,8.0,360.0,175.0,3.15,3.44,17.02,0.0,0.0,3.0,2.0


We can see that now in the first row the original missing value of *am* has been imputed as 1.0. This may seem more reasonable, as *am* is a binary dummy variable.