In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Read only the three columns needed for this demonstration from the
# Titanic CSV, then preview the first five rows.

data = pd.read_csv(
    'titanic.csv',
    usecols=['age', 'fare', 'survived'],
)
data.head(5)

Unnamed: 0,survived,age,fare
0,1,29.0,211.3375
1,1,0.9167,151.55
2,0,2.0,151.55
3,0,30.0,151.55
4,0,25.0,151.55


In [3]:
# Fraction of missing values in each column: the mean of the boolean
# "is missing" mask is the share of rows flagged as missing.

data.isna().mean()

survived    0.000000
age         0.200917
fare        0.000764
dtype: float64

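To add a binary missing indicator, we don't necessarily need to learn anything from the training set, so in principle we could add it to the original dataset and then split into train and test sets. However, I do not recommend this practice: it is safer to always split first, so that every preprocessing step is applied in the same way and nothing is accidentally learned from the test set.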
In addition, if you are using scikit-learn to add the missing indicator, the indicator, as it is designed, needs to learn from the train set which features contain missing values, that is, the features for which a binary variable needs to be added. We will see more about different implementations of missing indicators in future notebooks. For now, let's see how to create a binary missing indicator manually.

In [4]:
# Split the data into a training set and a test set.
# - predictors: the 'age' and 'fare' columns
# - target: the 'survived' column
# - test_size=0.3 sends 30% of the observations to the test set
# - random_state=0 fixes the seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    data.loc[:, ['age', 'fare']],
    data['survived'],
    test_size=0.3,
    random_state=0,
)

# show the (rows, columns) shape of each predictor set
(X_train.shape, X_test.shape)

((916, 2), (393, 2))

In [5]:
# Fraction of missing values per predictor, computed on the train set only.

X_train.isna().mean()

age     0.191048
fare    0.000000
dtype: float64

In [6]:
# Add a binary indicator column 'Age_NA' to both sets:
# 1 where 'age' is missing, 0 where it is present.
# The boolean mask is cast to integers to obtain the 0/1 flag.

X_train['Age_NA'] = X_train['age'].isna().astype(int)
X_test['Age_NA'] = X_test['age'].isna().astype(int)

# preview the train set with the new column
X_train.head(5)

Unnamed: 0,age,fare,Age_NA
501,13.0,19.5,0
588,4.0,23.0,0
402,30.0,13.8583,0
1193,,7.725,1
686,22.0,7.725,0


In [7]:
# The mean of the binary indicator coincides with the fraction of missing
# values in the original variable (compare with the 'age' value above).

X_train.loc[:, 'Age_NA'].mean()

0.19104803493449782

In [8]:
# Adding the indicator does not impute anything: the original 'age'
# column still contains its missing values.
X_train.isna().mean()

age       0.191048
fare      0.000000
Age_NA    0.000000
dtype: float64

In [9]:
# The missing values still have to be replaced, for example with median
# imputation. The median is computed on the train set only and then used
# to fill both the train and the test set.

median = X_train.loc[:, 'age'].median()

# keep observed ages, substitute the train-set median where age is missing
X_train['age'] = X_train['age'].where(X_train['age'].notna(), median)
X_test['age'] = X_test['age'].where(X_test['age'].notna(), median)

# confirm that no missing values remain in the train set
X_train.isna().mean()

age       0.0
fare      0.0
Age_NA    0.0
dtype: float64