## Random Sample Imputation

In this recipe, we will perform random sample imputation with Feature-engine, using pandas to load and inspect the data.

In [1]:
import pandas as pd
import numpy as np

# to split the data sets
from sklearn.model_selection import train_test_split

# to impute missing data with feature-engine
from feature_engine.missing_data_imputers import RandomSampleImputer

In [2]:
# load the data set from a local CSV file
data = pd.read_csv('creditApprovalUCI.csv')
# preview the first rows
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [3]:
# split four predictors and the target (A16) into a train set (70%)
# and a test set (30%)

X_train, X_test, y_train, y_test = train_test_split(
    data[['A2', 'A3', 'A8', 'A14']],  # predictors
    data['A16'],  # target
    test_size=0.3,
    random_state=0,  # fixed seed for the split
)

# (rows, columns) of each set
X_train.shape, X_test.shape

((483, 4), (207, 4))

In [4]:
# check the data types of the selected variables
X_train.dtypes

A2     float64
A3     float64
A8     float64
A14    float64
dtype: object

In [5]:
# find the fraction of missing data within those variables
# (isnull().mean() returns a value between 0 and 1, not a percentage)

X_train.isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A14    0.0
dtype: float64

## Random Sample Imputation with Feature-engine

In [6]:
# let's create a random sample imputer, with a fixed integer as random_state

imputer = RandomSampleImputer(random_state=10)

# fit the imputer on the train set only
imputer.fit(X_train)

RandomSampleImputer(random_state=10, seed='general', seeding_method='add',
                    variables=['A2', 'A3', 'A8', 'A14'])

In [7]:
# the imputer stores the train set in its X attribute;
# let's look at the first rows

imputer.X.head()

Unnamed: 0,A2,A3,A8,A14
596,46.08,3.0,2.375,396.0
303,15.92,2.875,0.085,120.0
204,36.33,2.125,0.085,50.0
351,22.17,0.585,0.0,100.0
118,57.83,7.04,14.0,360.0


In [8]:
# transform the data - replace the missing values
# (the imputer that was fitted on the train set is applied to both sets)

X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

In [9]:
# check that null values were replaced:
# the fraction of missing values per variable should be 0
X_train_t.isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A14    0.0
dtype: float64

In [10]:
# second imputer, seeded per observation instead of with a single integer:
# with seed='observation', random_state takes variable names, and
# seeding_method='add' presumably combines the values of A8 and A3 by
# addition to seed each observation
# NOTE(review): confirm against the docs of the installed feature-engine version
imputer_obs = RandomSampleImputer(random_state=['A8', 'A3'], seed='observation', seeding_method='add')

In [11]:
# fit the observation-seeded imputer on the train set
imputer_obs.fit(X_train)

RandomSampleImputer(random_state=['A8', 'A3'], seed='observation',
                    seeding_method='add', variables=['A2', 'A3', 'A8', 'A14'])

In [12]:
# replace the missing values in both sets with the observation-seeded imputer
X_train_tt = imputer_obs.transform(X_train)
X_test_tt = imputer_obs.transform(X_test)

In [13]:
# check that null values were replaced:
# the fraction of missing values per variable should be 0
X_train_tt.isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A14    0.0
dtype: float64

In [14]:
# For the observations where A2 was missing in the original train set, show
# side by side:
#   - A2, A3 and A8 after imputation with imputer_obs (A3 and A8 are the
#     variables passed to it as random_state)
#   - A2 after imputation with imputer
# Note: the result is empty when A2 has no missing values in X_train.

# compute the row mask once and select rows and columns in a single .loc
# call, rather than chaining two indexing operations
a2_was_missing = X_train['A2'].isnull()

pd.concat(
    [
        X_train_tt.loc[a2_was_missing, ['A2', 'A3', 'A8']],
        X_train_t.loc[a2_was_missing, 'A2'],
    ],
    axis=1,
)

Unnamed: 0,A2,A3,A8,A2.1


In [15]:
# a single observation (index label 1) with missing values in A2 and A14,
# to try the imputers on
tmp = pd.DataFrame(
    data={
        'A2': [np.nan],
        'A3': [3],
        'A8': [20],
        'A14': [np.nan],
    },
    index=[1],
)
tmp

Unnamed: 0,A2,A3,A8,A14
1,,3,20,


In [16]:
# impute the same observation 10 times with the integer-seeded imputer,
# to compare the values returned by each call
for repetition in range(10):
    print(imputer.transform(tmp))

      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0
      A2  A3  A8    A14
1  25.25   3  20  200.0


In [17]:
# impute the same observation 10 times with the observation-seeded imputer,
# to compare the values returned by each call
for repetition in range(10):
    print(imputer_obs.transform(tmp))

      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0


In [18]:
# same check once more, run from a separate cell: impute the observation
# 10 times with the observation-seeded imputer
for repetition in range(10):
    print(imputer_obs.transform(tmp))

      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
      A2  A3  A8    A14
1  23.58   3  20  208.0
