<a href="https://colab.research.google.com/github/gulabpatel/Feature_Engineering/blob/master/Feature_Engineering_5.1_Missing_Values_MICE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Video for Iterative Imputer: 
https://www.youtube.com/watch?v=WPiYOS3qK70 <br>
https://www.youtube.com/watch?v=1n7ld38PjEc

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
df = pd.DataFrame({
    'age': [25,27,29,31,33,np.nan],
    'experience': [np.nan, 3,5,7,9,11],
    'salary': [50, np.nan, 110,140,170,200],
    'purchased' : [0,1,1,0,1,0]
})
df

Unnamed: 0,age,experience,salary,purchased
0,25.0,,50.0,0
1,27.0,3.0,,1
2,29.0,5.0,110.0,1
3,31.0,7.0,140.0,0
4,33.0,9.0,170.0,1
5,,11.0,200.0,0


In [3]:
X = df.drop('purchased', 1)
y = df['purchased']

  """Entry point for launching an IPython kernel.


In [4]:
X.corr()

Unnamed: 0,age,experience,salary
age,1.0,1.0,1.0
experience,1.0,1.0,1.0
salary,1.0,1.0,1.0


In [5]:
lr = LinearRegression()
# I'm using Linear Regression because all the features are highly correlated with each other
# In most real life datasets, they will be not, and the need will arise to use other regressors

imp = IterativeImputer(estimator=lr, verbose=2, max_iter=30, tol=1e-10, imputation_order='roman')
# play around with the max_iter and tol parameters to get a better feel of how it is working

imp.fit_transform(X)

[IterativeImputer] Completing matrix with shape (6, 3)
[IterativeImputer] Ending imputation round 1/30, elapsed time 0.03
[IterativeImputer] Change: 61.22518987714511, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 2/30, elapsed time 0.03
[IterativeImputer] Change: 7.963767891095614, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 3/30, elapsed time 0.04
[IterativeImputer] Change: 0.7509179143103637, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 4/30, elapsed time 0.04
[IterativeImputer] Change: 0.01311646776996156, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 5/30, elapsed time 0.05
[IterativeImputer] Change: 0.000814252648837055, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 6/30, elapsed time 0.05
[IterativeImputer] Change: 3.9720598692838394e-05, scaled tolerance: 2e-08 
[IterativeImputer] Ending imputation round 7/30, elapsed time 0.05
[IterativeImputer] Change: 2.1412899116

array([[ 25.,   1.,  50.],
       [ 27.,   3.,  80.],
       [ 29.,   5., 110.],
       [ 31.,   7., 140.],
       [ 33.,   9., 170.],
       [ 35.,  11., 200.]])

In [6]:
# imagine we have a dataset with 6 features, one of which is age.
# Now we want to predict the NaNs in age
# so we find the absolute correlation coefficient between age and all other features
corr_values = [0.9, 0.5, 0.8, 0.4, 0.1]
corr_values

[0.9, 0.5, 0.8, 0.4, 0.1]

In [7]:
np.sum(corr_values)

2.7

In [8]:
0.9/2.7

0.3333333333333333

In [9]:
from sklearn.preprocessing import normalize

In [10]:
# we'll normalize the absolute correlation coefficients to have a sum of one
probs = normalize([corr_values], norm='l1')
probs = probs.ravel()
probs

array([0.33333333, 0.18518519, 0.2962963 , 0.14814815, 0.03703704])

In [11]:
probs.sum()

1.0

In [12]:
# now we'll pick the number of neighbors we want (2 in this case) and set
# the weight/probabilites parameter as the 'probs' we calculated above
# so that numpy assigns the proportional weight to each feature according
# to the correlation of that feature with the target feature (age in this case)

np.random.choice([1,2,3,4,5], 2, replace=False, p=probs)

array([2, 3])

##### to demonstrate working of Iterative Imputer in the case of training and testing sets

In [13]:
df = pd.DataFrame({
    'age': [25,27,29,31,33,np.nan,37,39,41,np.nan,45],
    'experience': [np.nan, 3,5,7,9,11,13,16,np.nan,19,21],
    'salary': [50, np.nan, 110,140,170,200,230,260,np.nan,320,350],
    'purchased' : [0,1,1,0,1,0,0,1,1,0,0]
})
df

Unnamed: 0,age,experience,salary,purchased
0,25.0,,50.0,0
1,27.0,3.0,,1
2,29.0,5.0,110.0,1
3,31.0,7.0,140.0,0
4,33.0,9.0,170.0,1
5,,11.0,200.0,0
6,37.0,13.0,230.0,0
7,39.0,16.0,260.0,1
8,41.0,,,1
9,,19.0,320.0,0


In [14]:
X = df.drop('purchased', 1)
y = df['purchased']

  """Entry point for launching an IPython kernel.


In [15]:
X

Unnamed: 0,age,experience,salary
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,,11.0,200.0
6,37.0,13.0,230.0
7,39.0,16.0,260.0
8,41.0,,
9,,19.0,320.0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [17]:
X_train

Unnamed: 0,age,experience,salary
0,25.0,,50.0
1,27.0,3.0,
2,29.0,5.0,110.0
3,31.0,7.0,140.0
4,33.0,9.0,170.0
5,,11.0,200.0
6,37.0,13.0,230.0
7,39.0,16.0,260.0


In [18]:
lr = LinearRegression()
imp = IterativeImputer(estimator=lr, tol=1e-10, max_iter=1, verbose=2, imputation_order='roman')
imp.fit_transform(X_train)

[IterativeImputer] Completing matrix with shape (8, 3)
[IterativeImputer] Ending imputation round 1/1, elapsed time 0.00
[IterativeImputer] Change: 83.718083137517, scaled tolerance: 2.6e-08 




array([[ 25.        ,   0.53793922,  50.        ],
       [ 27.        ,   3.        ,  81.99620258],
       [ 29.        ,   5.        , 110.        ],
       [ 31.        ,   7.        , 140.        ],
       [ 33.        ,   9.        , 170.        ],
       [ 34.42116416,  11.        , 200.        ],
       [ 37.        ,  13.        , 230.        ],
       [ 39.        ,  16.        , 260.        ]])

In [19]:
X_train.mean()

age            31.571429
experience      9.142857
salary        165.714286
dtype: float64

In [20]:
# For the inital imputation of test set, the missing values in test set will be filled 
# with the means of the respective columns in the train set
X_test

Unnamed: 0,age,experience,salary
8,41.0,,
9,,19.0,320.0
10,45.0,21.0,350.0


In [21]:
# this is the sequence of steps that will be followed for doing the imputations.
# remember, we are using the "roman" imputation order, hence the order of
# imputations is 0, 1, 2 etc in that order.
imp.imputation_sequence_

[_ImputerTriplet(feat_idx=0, neighbor_feat_idx=array([1, 2]), estimator=LinearRegression()),
 _ImputerTriplet(feat_idx=1, neighbor_feat_idx=array([0, 2]), estimator=LinearRegression()),
 _ImputerTriplet(feat_idx=2, neighbor_feat_idx=array([0, 1]), estimator=LinearRegression())]

In [22]:
# to estimate the age missing value in first iteration (test set)
print(imp.imputation_sequence_[0][2].coef_)
print(imp.imputation_sequence_[0][2].intercept_)

[0.46068289 0.04777397]
19.798858287238648


In [23]:
19*0.46068289 + 320*0.04777397 + 19.798858287238644

43.83950359723865

In [24]:
imp.transform(X_test)

[IterativeImputer] Completing matrix with shape (3, 3)
[IterativeImputer] Ending imputation round 1/1, elapsed time 0.00


array([[ 41.        ,  17.34984572, 289.89202243],
       [ 43.83950373,  19.        , 320.        ],
       [ 45.        ,  21.        , 350.        ]])

In [25]:
# to estimate the experience missing value in first iteration (test set)
print(imp.imputation_sequence_[1][2].coef_)
print(imp.imputation_sequence_[1][2].intercept_)

[1.02317467 0.00381208]
-25.232031420072186


In [26]:
1.02317467*41 + 0.00381208 *165.71 - 25.23203142007218

17.349829826727817

In [27]:
# to estimate the salary missing value in first iteration (test set)
print(imp.imputation_sequence_[2][2].coef_)
print(imp.imputation_sequence_[2][2].intercept_)

[9.91419532 4.81517967]
-200.1326100334473


In [28]:
9.91419532 *41 + 4.81517967*17.349 - 200.1326100334477

289.8879501813823