In this approach, an entirely new column is created to flag which values were originally missing.

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Read only the target and the two numeric features used in this demo.
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [5]:
# Fraction of missing values in each column.
df.isna().mean()

Unnamed: 0,0
Survived,0.0
Age,0.198653
Fare,0.0


In [6]:
# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop('Survived', axis='columns')


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [8]:
# First five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


In [9]:
# Last five rows of the training features (several have a missing Age).
X_train.tail(n=5)

Unnamed: 0,Age,Fare
534,30.0,8.6625
584,,8.7125
493,71.0,49.5042
527,,221.7792
168,,25.925


In [10]:
# Baseline: learn the per-column mean on the training split only, then use
# those same means to fill missing values in both splits.
si = SimpleImputer(strategy='mean')
X_train_trf = si.fit(X_train).transform(X_train)
X_test_trf = si.transform(X_test)

In [11]:
# Imputed training features; the rows that had no Age now carry the fitted mean.
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [12]:
# Fill value learned for each column during fit (the training-split means).
si.statistics_

array([29.78590426, 32.28905604])

In [13]:
# Baseline model: logistic regression on the mean-imputed features,
# without any missing-value indicator column.
clf = LogisticRegression()
clf.fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)


In [14]:
# Accuracy of the baseline (mean imputation only) on the held-out split.
accuracy_score(y_test, y_pred)

0.6145251396648045

In [15]:
# Learn, from the training split, which columns contain missing values.
# 'missing-only' (the default) flags only columns that had NaNs during fit.
mi = MissingIndicator(features='missing-only')
mi.fit(X_train)

In [16]:
# Column names the indicator saw when it was fitted.
mi.feature_names_in_

array(['Age', 'Fare'], dtype=object)

In [17]:
# Positions of the columns that will get an indicator; index 0 is 'Age'.
mi.features_

array([0])

*`features_` lists the positions of the columns that had missing values during `fit`; here that is only index 0, the 'Age' column.*

*For each such column, `transform` produces a completely new boolean column: True where 'Age' is NaN and False where 'Age' is present.*

*This lets us (and the model) tell whether a value was originally missing.*

In [18]:
# Boolean indicator array for the training split (one column per flagged feature).
X_train_missing = mi.transform(X_train)

In [22]:
# Indicator value for the row at position 9 of the training split.
X_train_missing[9]

array([ True])

In [25]:
# First ten rows, so that the row at position 9 is visible.
X_train.iloc[:10]

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458
213,30.0,13.0
157,30.0,8.05
780,13.0,7.2292
572,36.0,26.3875
77,,8.05


We can see that the row at position 9 (index label 77), whose 'Age' is NaN, is marked True in the indicator array.

In [26]:
# Apply the indicator fitted on the training split to the test split.
X_test_missing = mi.transform(X_test)

In [27]:
# Show the indicator array for the test split (True marks a missing Age).
X_test_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [28]:
# Attach the missing-Age flag as a new column. `.assign` builds a new frame
# instead of writing into the frame returned by train_test_split (avoids
# SettingWithCopyWarning), and `[:, 0]` passes the single indicator column
# as a 1-D array rather than an (n, 1) matrix.
X_train = X_train.assign(Age_NA=X_train_missing[:, 0])

In [29]:
# Test features before the indicator column is added.
X_test

Unnamed: 0,Age,Fare
707,42.0,26.2875
37,21.0,8.0500
615,24.0,65.0000
169,28.0,56.4958
68,17.0,7.9250
...,...,...
89,24.0,8.0500
80,22.0,9.0000
846,,69.5500
870,26.0,7.8958


In [30]:
# Attach the missing-Age flag to the test split. `.assign` builds a new frame
# instead of writing into the frame returned by train_test_split (avoids
# SettingWithCopyWarning), and `[:, 0]` passes the single indicator column
# as a 1-D array rather than an (n, 1) matrix.
X_test = X_test.assign(Age_NA=X_test_missing[:, 0])

In [31]:
# Training features, now including the Age_NA indicator column.
X_train

Unnamed: 0,Age,Fare,Age_NA
30,40.0,27.7208,False
10,4.0,16.7000,False
873,47.0,9.0000,False
182,9.0,31.3875,False
876,20.0,9.8458,False
...,...,...,...
534,30.0,8.6625,False
584,,8.7125,True
493,71.0,49.5042,False
527,,221.7792,True


*Since the ML model can now tell whether a value was originally missing, its performance may improve.*

*Now pass X_train and X_test, which include the new 'Age_NA' column, to the imputer.*

In [32]:
# Mean-impute again, this time on frames that also carry the Age_NA flag.
si = SimpleImputer(strategy='mean')
X_train_trf2 = si.fit(X_train).transform(X_train)
X_test_trf2 = si.transform(X_test)

*Note the order: the 'Age_NA' column is built with MissingIndicator before imputation, because once the mean is filled in there is no way to tell which rows were missing. The missing 'Age' values are then imputed with the mean, and the model is trained on the imputed 'Age', 'Fare' and the 'Age_NA' flag.*

In [33]:
# Refit the same model on the features that now include the Age_NA flag and
# score it on the held-out split; the accuracy is the cell's displayed output.
clf = LogisticRegression()
clf.fit(X_train_trf2, y_train)
y_pred = clf.predict(X_test_trf2)
accuracy_score(y_test, y_pred)

0.6312849162011173

*We can see the accuracy has improved from 0.6145251396648045 to 0.6312849162011173 (about 1.7 percentage points on this single train/test split).*

**easy method**

*Instead of creating a separate MissingIndicator object and adding the column by hand, we can pass `add_indicator=True` to SimpleImputer, which imputes the values and appends the indicator column in one step.*

In [34]:
# Mean imputer that also appends a missing-value indicator column.
si = SimpleImputer(strategy='mean', add_indicator=True)

In [35]:
# Rebuild the features and target from the original frame so the split below
# starts from the plain Age/Fare columns.
y = df['Survived']
X = df.drop('Survived', axis='columns')

In [39]:
# Same 80/20 split and seed as before, so the two approaches are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)


In [40]:
# Fit on the training split, then impute and append the indicator for both
# splits. X_train and X_test are rebound to the transformed output.
X_train = si.fit(X_train).transform(X_train)
X_test = si.transform(X_test)

In [42]:
# Train and score on the output of SimpleImputer(add_indicator=True);
# the accuracy is the cell's displayed output.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6312849162011173