In [54]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, MissingIndicator

In [55]:
df = pd.read_csv('train.csv', usecols=['Survived', 'Age', 'Fare'])

In [56]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [57]:
X = df.drop(columns=['Survived'])
y = df['Survived']


In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
X_train.head(5)

Unnamed: 0,Age,Fare
331,45.5,28.5
733,23.0,13.0
382,32.0,7.925
704,26.0,7.8542
813,6.0,31.275


In [60]:
X_train.isnull().sum()

Age     140
Fare      0
dtype: int64

In [61]:
si = SimpleImputer()
X_train_trf = si.fit_transform(X_train) 
X_test_trf = si.fit_transform(X_test)

In [62]:
X_train_trf

array([[ 45.5   ,  28.5   ],
       [ 23.    ,  13.    ],
       [ 32.    ,   7.925 ],
       ...,
       [ 41.    ,  14.1083],
       [ 14.    , 120.    ],
       [ 21.    ,  77.2875]])

In [63]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6480446927374302

In [64]:
mi = MissingIndicator()
mi.fit(X_train)

In [65]:
mi.features_

array([0])

In [66]:
X_train_missing = mi.transform(X_train)

In [67]:
X_train_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [68]:
X_test_missing = mi.transform(X_test)

In [69]:
X_test_missing

array([[ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [70]:
X_train['Age_NA'] = X_train_missing

In [71]:
X_train

Unnamed: 0,Age,Fare,Age_NA
331,45.5,28.5000,False
733,23.0,13.0000,False
382,32.0,7.9250,False
704,26.0,7.8542,False
813,6.0,31.2750,False
...,...,...,...
106,21.0,7.6500,False
270,,31.0000,True
860,41.0,14.1083,False
435,14.0,120.0000,False


In [72]:
X_test['Age_NA'] = X_test_missing

In [82]:
X_train_trf

array([[ 45.5   ,  28.5   ],
       [ 23.    ,  13.    ],
       [ 32.    ,   7.925 ],
       ...,
       [ 41.    ,  14.1083],
       [ 14.    , 120.    ],
       [ 21.    ,  77.2875]])

In [75]:
X_test

Unnamed: 0,Age,Fare,Age_NA
709,,15.2458,True
439,31.0,10.5000,False
840,20.0,7.9250,False
720,6.0,33.0000,False
39,14.0,11.2417,False
...,...,...,...
433,17.0,7.1250,False
773,,7.2250,True
25,38.0,31.3875,False
84,17.0,10.5000,False


## Using Scikit-learn for Missing Indicator

In [76]:
si = SimpleImputer()

X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

In [77]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_trf2, y_train)
y_pred2 = clf.predict(X_test_trf2)  

accuracy_score(y_test, y_pred2)

0.6368715083798883

In [78]:
si = SimpleImputer(add_indicator=True)


In [79]:
X_train = si.fit_transform(X_train)

In [80]:
X_test = si.transform(X_test)

In [81]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train_trf2, y_train)
y_pred2 = clf.predict(X_test_trf2)

accuracy_score(y_test, y_pred2)

0.6368715083798883