In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.categorical_encoders import RareLabelCategoricalEncoder

In [2]:
# load the house prices dataset and preview its first rows
data = pd.read_csv('houseprice.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# split into a training and a testing set: 30% of the rows are held out,
# and the fixed seed keeps the split reproducible.
# 'Id' and the target 'SalePrice' are not used as predictors.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

(X_train.shape, X_test.shape)

((1022, 79), (438, 79))

In [4]:
# fraction of missing values in the two numerical variables
X_train[['LotFrontage', 'MasVnrArea']].isna().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
dtype: float64

## SimpleImputer

### Mean imputation

In [5]:
# wrap the sklearn imputer so that it only acts on the listed variables
imputer = SklearnTransformerWrapper(
    transformer=SimpleImputer(strategy='mean'),
    variables=['LotFrontage', 'MasVnrArea'],
)

# learn the mean of each variable from the train set
imputer.fit(X_train)

SklearnTransformerWrapper(transformer=SimpleImputer(add_indicator=False,
                                                    copy=True, fill_value=None,
                                                    missing_values=nan,
                                                    strategy='mean',
                                                    verbose=0),
                          variables=['LotFrontage', 'MasVnrArea'])

In [6]:
# the wrapped sklearn imputer is exposed through the `transformer`
# attribute; its `statistics_` holds the learnt mean of each variable,
# in the order in which the variables were passed to the wrapper

imputer.transformer.statistics_

array([ 69.66866747, 103.55358899])

In [7]:
# fill the missing values in both sets with the means learnt from the train set
X_train, X_test = map(imputer.transform, (X_train, X_test))

# check: no missing values should remain in the imputed variables
X_train[['LotFrontage', 'MasVnrArea']].isna().mean()

LotFrontage    0.0
MasVnrArea     0.0
dtype: float64

### Frequent category imputation

In [8]:
# categorical (object-typed) variables that contain at least one missing value
cols = [
    name
    for name, column in data.items()
    if column.dtype == 'O' and column.isnull().any()
]
data[cols].head()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,,Attchd,RFn,TA,TA,,,
1,,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,
2,,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,
3,,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,,,
4,,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,,,


In [9]:
# same wrapper as before, this time replacing missing values
# by the most frequent category of each variable
imputer = SklearnTransformerWrapper(
    transformer=SimpleImputer(strategy='most_frequent'),
    variables=cols,
)

# learn the most frequent category of each variable from the train set
imputer.fit(X_train)

SklearnTransformerWrapper(transformer=SimpleImputer(add_indicator=False,
                                                    copy=True, fill_value=None,
                                                    missing_values=nan,
                                                    strategy='most_frequent',
                                                    verbose=0),
                          variables=['Alley', 'MasVnrType', 'BsmtQual',
                                     'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'Electrical',
                                     'FireplaceQu', 'GarageType',
                                     'GarageFinish', 'GarageQual', 'GarageCond',
                                     'PoolQC', 'Fence', 'MiscFeature'])

In [10]:
# the most frequent category learnt for each variable is stored in the
# `statistics_` attribute of the wrapped sklearn imputer, with one entry
# per variable, in the same order as `cols`

imputer.transformer.statistics_

array(['Pave', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd',
       'Attchd', 'Unf', 'TA', 'TA', 'Gd', 'MnPrv', 'Shed'], dtype=object)

In [11]:
# fill the missing categories in both sets with the values learnt from the train set
X_train, X_test = map(imputer.transform, (X_train, X_test))

# check: no missing values should remain in the imputed variables
X_train[cols].isna().mean()

Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
GarageCond      0.0
PoolQC          0.0
Fence           0.0
MiscFeature     0.0
dtype: float64

## OrdinalEncoder

In [12]:
# categorical variables that will be encoded as integers
cols = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual',
]

In [13]:
# group infrequent categories under a common label first, so that the
# ordinal encoder does not fail on categories it has not seen

rare_label_enc = RareLabelCategoricalEncoder(variables=cols, n_categories=2)

# the frequent labels are learnt from the train set only,
# then the same grouping is applied to the test set
X_train, X_test = (
    rare_label_enc.fit_transform(X_train),
    rare_label_enc.transform(X_test),
)



In [14]:
# next, map every category to an integer

encoder = SklearnTransformerWrapper(
    transformer=OrdinalEncoder(),
    variables=cols,
)

# learn the categories of each variable from the train set
encoder.fit(X_train)

SklearnTransformerWrapper(transformer=OrdinalEncoder(categories='auto',
                                                     dtype=<class 'numpy.float64'>),
                          variables=['Alley', 'MasVnrType', 'BsmtQual',
                                     'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'Electrical',
                                     'FireplaceQu', 'GarageType',
                                     'GarageFinish', 'GarageQual'])

In [15]:
# we can navigate to the parameters of the sklearn transformer
# like this; `categories_` holds one array per encoded variable, and
# the position of a category in its array is the integer it is mapped to

encoder.transformer.categories_

[array(['Grvl', 'Pave'], dtype=object),
 array(['BrkFace', 'None', 'Rare', 'Stone'], dtype=object),
 array(['Ex', 'Gd', 'Rare', 'TA'], dtype=object),
 array(['Rare', 'TA'], dtype=object),
 array(['Av', 'Gd', 'Mn', 'No'], dtype=object),
 array(['ALQ', 'BLQ', 'GLQ', 'Rare', 'Rec', 'Unf'], dtype=object),
 array(['Rare', 'Unf'], dtype=object),
 array(['FuseA', 'Rare', 'SBrkr'], dtype=object),
 array(['Gd', 'Rare', 'TA'], dtype=object),
 array(['Attchd', 'BuiltIn', 'Detchd', 'Rare'], dtype=object),
 array(['Fin', 'RFn', 'Unf'], dtype=object),
 array(['Rare', 'TA'], dtype=object)]

In [16]:
# replace each category by its integer code in both sets
X_train, X_test = map(encoder.transform, (X_train, X_test))

# check: the encoding should not introduce missing values
X_train[cols].isna().mean()

Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
dtype: float64

In [17]:
# preview the encoded variables in the test set: categories are now numbers
X_test[cols].head()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual
529,1.0,1.0,3.0,1.0,3.0,4.0,1.0,2.0,2.0,0.0,1.0,1.0
491,1.0,1.0,3.0,1.0,3.0,1.0,0.0,0.0,2.0,0.0,2.0,1.0
459,1.0,2.0,3.0,1.0,3.0,3.0,1.0,2.0,2.0,2.0,2.0,1.0
279,1.0,0.0,1.0,1.0,3.0,1.0,1.0,2.0,2.0,0.0,0.0,1.0
655,1.0,0.0,3.0,1.0,3.0,5.0,1.0,2.0,0.0,2.0,2.0,1.0


## Scaling

In [18]:
# variables to scale: the two imputed numerical variables
# followed by the integer-encoded categorical ones
cols = [
    'LotFrontage', 'MasVnrArea',
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual',
]

In [19]:
# standardise the variables listed above with sklearn's StandardScaler

scaler = SklearnTransformerWrapper(
    transformer=StandardScaler(),
    variables=cols,
)

# learn the mean and standard deviation of each variable from the train set
scaler.fit(X_train)

SklearnTransformerWrapper(transformer=StandardScaler(copy=True, with_mean=True,
                                                     with_std=True),
                          variables=['LotFrontage', 'MasVnrArea', 'Alley',
                                     'MasVnrType', 'BsmtQual', 'BsmtCond',
                                     'BsmtExposure', 'BsmtFinType1',
                                     'BsmtFinType2', 'Electrical',
                                     'FireplaceQu', 'GarageType',
                                     'GarageFinish', 'GarageQual'])

In [20]:
# scale both sets with the parameters learnt from the train set
X_train, X_test = map(scaler.transform, (X_train, X_test))

# check: scaling should not introduce missing values
X_train[cols].isna().mean()

LotFrontage     0.0
MasVnrArea      0.0
Alley           0.0
MasVnrType      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageFinish    0.0
GarageQual      0.0
dtype: float64

In [21]:
# mean values, learnt by the StandardScaler from the train set:
# one per variable, in the same order as `cols`
scaler.transformer.mean_

array([ 69.66866747, 103.55358899,   0.97064579,   0.90508806,
         1.89530333,   0.92074364,   2.30332681,   2.79158513,
         0.87866928,   1.84148728,   0.50684932,   0.66046967,
         1.21917808,   0.95205479])

In [22]:
# std values, learnt by the StandardScaler from the train set:
# one per variable, in the same order as `cols`
scaler.transformer.scale_

array([2.08224380e+01, 1.81079810e+02, 1.68797328e-01, 8.21550216e-01,
       1.09874393e+00, 2.70138464e-01, 1.12347687e+00, 1.83694623e+00,
       3.26511224e-01, 5.19973787e-01, 8.35520190e-01, 9.44229034e-01,
       8.09014833e-01, 2.13650328e-01])

In [23]:
# the mean of the scaled variables is 0 (up to floating point error)
X_train[cols].mean()

LotFrontage     1.792434e-16
MasVnrArea     -2.575674e-16
Alley           5.051406e-16
MasVnrType     -5.561978e-17
BsmtQual        3.693501e-18
BsmtCond       -4.032434e-16
BsmtExposure    6.116004e-17
BsmtFinType1   -9.863821e-17
BsmtFinType2   -3.946615e-16
Electrical     -1.980368e-16
FireplaceQu    -1.746809e-16
GarageType      4.284461e-16
GarageFinish    1.442638e-16
GarageQual     -1.251988e-16
dtype: float64

In [24]:
# the std of the scaled variables is ~1
# it is not exactly 1 because pandas' std() is the sample estimate (ddof=1),
# while the StandardScaler divides by the population std (ddof=0): with the
# 1022 rows of the train set the ratio is sqrt(1022 / 1021) ~= 1.00049

X_train[cols].std()

LotFrontage     1.00049
MasVnrArea      1.00049
Alley           1.00049
MasVnrType      1.00049
BsmtQual        1.00049
BsmtCond        1.00049
BsmtExposure    1.00049
BsmtFinType1    1.00049
BsmtFinType2    1.00049
Electrical      1.00049
FireplaceQu     1.00049
GarageType      1.00049
GarageFinish    1.00049
GarageQual      1.00049
dtype: float64