In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from feature_engine.imputation import MeanMedianImputer

In [3]:
data = pd.read_csv('creditApprovalUCI.csv')

In [4]:
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,,u,g,q,h,,,,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      598 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      598 non-null    float64
 8   A9      598 non-null    object 
 9   A10     598 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  A16     690 non-null    int64  
dtypes: float64(4), int64(3), object(9)
memory usage: 86.4+ KB


In [6]:
data.isnull().sum()

A1     12
A2     12
A3     92
A4      6
A5      6
A6      9
A7      9
A8     92
A9     92
A10    92
A11     0
A12     0
A13     0
A14    13
A15     0
A16     0
dtype: int64

##### In mean and median imputation, the mean or median values should be calculated using the variables in the train set; therefore, let's separate the data into train and test sets and their respective targets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [9]:
X_train.shape

(483, 15)

In [10]:
X_test.shape

(207, 15)

##### check the percentage of missing values in the train set:

In [11]:
X_train.isnull().mean()

A1     0.008282
A2     0.022774
A3     0.140787
A4     0.008282
A5     0.008282
A6     0.008282
A7     0.008282
A8     0.140787
A9     0.140787
A10    0.140787
A11    0.000000
A12    0.000000
A13    0.000000
A14    0.014493
A15    0.000000
dtype: float64

##### replace the missing values with the median in five numerical variables

In [13]:
for var in ['A2','A3','A8','A11','A15']:
    value = X_train[var].median()
    X_train[var] = X_train[var].fillna(value)
    X_test[var] = X_test[var].fillna(value)

In [15]:
X_train.isnull().sum()

A1      4
A2      0
A3      0
A4      4
A5      4
A6      4
A7      4
A8      0
A9     68
A10    68
A11     0
A12     0
A13     0
A14     7
A15     0
dtype: int64

In [16]:
X_test.isnull().sum()

A1      8
A2      0
A3      0
A4      2
A5      2
A6      5
A7      5
A8      0
A9     24
A10    24
A11     0
A12     0
A13     0
A14     6
A15     0
dtype: int64

##### using Simple Imputer 

In [20]:
X_train,X_test,y_train,y_test = train_test_split(
    data[['A2','A3','A8','A11','A15']],
    data['A16'], test_size = 0.3, random_state=0)

In [21]:
## create a median imputation object with SimpleImputer
imputer = SimpleImputer(strategy = 'mean')

## Let's fit the imputer to the train set
## the imputer will learn the median of all variables
imputer.fit(X_train)

## we can look at the learnt medians
imputer.statistics_

array([ 31.89019068,   4.84148193,   2.36901205,   2.51759834,
       966.25258799])

In [24]:
## and now we impute train and test sets
## NOTE: the data is returned as a numpy array!!

X_train = imputer.transform(X_train)
X_test = imputer.transform(X_test)

In [25]:
### check if missing values were removed
pd.DataFrame(X_train).isnull().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64

##### Using Feature-engine

In [26]:
# let's separate into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(
    data.drop('A16', axis=1), data['A16'], test_size=0.3, random_state=0)

In [27]:
### Let's create a median mputer

median_imputer = MeanMedianImputer(imputation_method = 'median', variables=['A2','A3','A8','A11','A15'])

median_imputer.fit(X_train)

In [28]:
# let's inspect the dictionary with the mappings for each variable
median_imputer.imputer_dict_

{'A2': 28.835, 'A3': 2.75, 'A8': 1.0, 'A11': 0.0, 'A15': 6.0}

In [30]:
## transform the data
X_train = median_imputer.transform(X_train)
X_test = median_imputer.transform(X_test)

In [31]:
X_train[['A2', 'A3', 'A8', 'A11', 'A15']].isnull().mean()

A2     0.0
A3     0.0
A8     0.0
A11    0.0
A15    0.0
dtype: float64

##### Mean / median imputation with Sklearn selecting features to impute

In [32]:
import pandas as pd

# to impute missinf data with sklearn
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# to split the data sets
from sklearn.model_selection import train_test_split

In [33]:
# load data
data = pd.read_csv('creditApprovalUCI.csv')

# seperate data to train and test
X_train,X_test,y_train,y_test = train_test_split(data.drop('A16',axis=1),data['A16'],test_size=0.3,random_state=0)

In [34]:
# first we need to make a list with the numerical vars
numeric_features_mean = ['A2', 'A3', 'A8', 'A11', 'A15']

In [35]:
# then we instantiate the imputer within a pipeline
numeric_mean_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

In [36]:
# then we put the features list and the imputer in the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('mean_imputer', numeric_mean_imputer, numeric_features_mean)
],remainder='passthrough')

Imagine you have a bunch of columns in your dataset, and you want to do different kinds of fixing or changing to some of them while leaving others untouched.

The ColumnTransformer is like a tool that helps you do this. The remainder parameter with 'passthrough' is like saying, "For the columns that I don't specifically mention in my fixing plan, just keep them as they are, don't do anything special to them."

In [37]:
### now we fit the preprocessor
preprocessor.fit(X_train)

In [38]:
### and now we impute the data
X_train = preprocessor.transform(X_train)
X_train = preprocessor.transform(X_test)

In [42]:
# Note that Scikit-Learn transformers return NumPy arrays!!
X_train

array([[45.83, 10.5, 5.0, ..., 't', 'g', 0.0],
       [64.08, 20.0, 17.5, ..., 't', 'g', 0.0],
       [31.25, 3.75, 0.625, ..., 't', 'g', 181.0],
       ...,
       [21.42, 4.841481927710842, 2.3690120481927712, ..., 't', 'g',
        132.0],
       [26.83, 4.841481927710842, 2.3690120481927712, ..., 'f', 'g',
        100.0],
       [62.5, 12.75, 5.0, ..., 'f', 'g', 112.0]], dtype=object)