**Concept** - For every column that contains missing data, create a new indicator column that holds `True` where the value in the original column is missing and `False` where it is present.

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import MissingIndicator, SimpleImputer

In [6]:
# Load only the three columns this walkthrough needs from the training CSV.
df = pd.read_csv(
    'train.csv',
    usecols=['Fare', 'Age', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [8]:
# Target vector: the 'Survived' label.
y = df['Survived']
# Feature matrix: every column except the label.
X = df.drop(columns=['Survived'])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [10]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


## **Without Applying Missing Indicator**

In [11]:
# Baseline: an imputer with default settings, fitted on the training split only
# and then applied unchanged to both splits.
si = SimpleImputer()
si.fit(X_train)
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [12]:
# Inspect the imputed training array returned by the imputer.
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train on the imputed features (no missing-indicator column yet).
clf = LogisticRegression().fit(X_train_trf, y_train)

# Score the held-out split; the accuracy is the cell's displayed result.
y_pred = clf.predict(X_test_trf)
accuracy_score(y_test, y_pred)

0.6145251396648045

## **After Applying Missing Indicator**

In [14]:
# Fit the indicator on the training split so it records which columns contain missing values.
mi = MissingIndicator()
mi.fit(X_train)

MissingIndicator()

In [18]:
mi.features_

# The result shows that only column 0 of X_train (the Age column) has missing values.

array([0])

In [None]:
# Build the indicator values for the training split: True where Age is missing, False otherwise.

X_train_missing = mi.transform(X_train)  

In [20]:
# Display the boolean indicator array computed for the training split.
X_train_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [23]:
# Apply the indicator fitted on the training split to the test split.
X_test_missing = mi.transform(X_test)

In [24]:
# Display the boolean indicator array computed for the test split.
X_test_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [25]:
# Attach the missing-indicator values (X_train_missing) to the original training features as a new 'Age_NA' column.

X_train['Age_NA'] = X_train_missing

In [26]:
# Attach the missing-indicator values (X_test_missing) to the original test features as a new 'Age_NA' column.

X_test['Age_NA'] = X_test_missing

In [28]:
# Wherever the value in the Age column is missing (NaN), the corresponding Age_NA value is True.

X_train.tail()

Unnamed: 0,Age,Fare,Age_NA
534,30.0,8.6625,False
584,,8.7125,True
493,71.0,49.5042,False
527,,221.7792,True
168,,25.925,True


In [29]:
# Fresh imputer, fitted on the training features that now also carry the Age_NA column.
si = SimpleImputer()
si.fit(X_train)

# Apply the same fitted imputer to both splits.
X_train_trf2 = si.transform(X_train)
X_test_trf2 = si.transform(X_test)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train on the imputed features that include the Age_NA indicator column.
clf = LogisticRegression().fit(X_train_trf2, y_train)

# Score the held-out split; the accuracy is the cell's displayed result.
y_pred = clf.predict(X_test_trf2)
accuracy_score(y_test, y_pred)

0.6312849162011173

**`SimpleImputer` has an `add_indicator` parameter. Setting `add_indicator=True` makes the imputer append the missing-indicator column(s) itself, so `MissingIndicator` does not need to be used separately. An example is shown below.**

In [32]:
# Re-create the train/test split from the untouched X and y (same 20% hold-out, same seed)
# so this section starts from features without the Age_NA column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [32]:
# Imputer configured to also emit missing-value indicator column(s) alongside the imputed features.
si = SimpleImputer(add_indicator=True)

In [33]:
# Fit on the training split, then overwrite X_train with the transformed array.
si.fit(X_train)
X_train = si.transform(X_train)

In [None]:
# Transform the test split with the imputer fitted on the training split; X_test is overwritten with the result.
X_test = si.transform(X_test)

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train and evaluate on the arrays produced by SimpleImputer(add_indicator=True).
# X_train / X_test were overwritten with the transformed arrays in the cells above;
# the X_train_trf2 / X_test_trf2 arrays belong to the earlier manual
# MissingIndicator section and must not be reused here.
clf = LogisticRegression()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Displayed result: accuracy on the held-out split.
# NOTE(review): presumably this matches the manual MissingIndicator score, since the
# split and features should be the same — re-run to confirm.
accuracy_score(y_test, y_pred)

0.6312849162011173