In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from feature_engine.wrappers import SklearnTransformerWrapper

In [2]:
# Load the data set from a CSV file in the current working directory
# and preview its first rows.
data = pd.read_csv('houseprice.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# Split the data into a training set and a test set (30% of the rows).
# 'Id' and the target 'SalePrice' are removed from the predictors, and the
# fixed random_state makes the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['Id', 'SalePrice']),
    data['SalePrice'],
    test_size=0.3,
    random_state=0,
)

# (rows, columns) of each predictor set
X_train.shape, X_test.shape

((1022, 79), (438, 79))

## Scaling

In [4]:
# Names of the numerical variables, i.e. the ones we are going to scale.
# Columns are selected by being numeric rather than by "not object": the
# latter would also let through bool, datetime, category or dedicated string
# dtypes, none of which the StandardScaler can handle.
cols = X_train.select_dtypes(include='number').columns.tolist()

cols

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [5]:
# Set up a StandardScaler that is applied to the variables listed in `cols`.
scaler = SklearnTransformerWrapper(
    transformer=StandardScaler(),
    variables=cols,
)

# Learn the scaling parameters from the training set only. Missing values
# are replaced by 0 beforehand (in every column of the frame).
# NOTE(review): filling with 0 presumably pulls the learnt mean / std of
# variables with missing data (e.g. year or area columns) away from their
# observed values -- consider a dedicated imputation step instead.
scaler.fit(X_train.fillna(value=0))

SklearnTransformerWrapper(transformer=StandardScaler(),
                          variables=['MSSubClass', 'LotFrontage', 'LotArea',
                                     'OverallQual', 'OverallCond', 'YearBuilt',
                                     'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
                                     'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                                     '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
                                     'GrLivArea', 'BsmtFullBath',
                                     'BsmtHalfBath', 'FullBath', 'HalfBath',
                                     'BedroomAbvGr', 'KitchenAbvGr',
                                     'TotRmsAbvGrd', 'Fireplaces',
                                     'GarageYrBlt', 'GarageCars', 'GarageArea',
                                     'WoodDeckSF', 'OpenPorchSF',
                                     'EnclosedPorch', ...])

In [6]:
# Scale both sets with the parameters learnt from the training set, applying
# the same fill-with-0 treatment of missing values that was used when fitting.
# Careful: this overwrites X_train / X_test, so running the cell a second
# time would scale data that has already been scaled.
X_train = scaler.transform(X_train.fillna(value=0))
X_test = scaler.transform(X_test.fillna(value=0))

In [7]:
# mean values, learnt by the StandardScaler (one per variable in `cols`)
# Newer feature_engine releases keep the fitted estimator in `transformer_`
# and leave `transformer` unfitted; older ones fit `transformer` itself.
# Prefer the fitted attribute and fall back so the cell runs on both.
getattr(scaler, 'transformer_', scaler.transformer).mean_

array([5.66144814e+01, 5.67847358e+01, 1.05679667e+04, 6.07925636e+00,
       5.56262231e+00, 1.97094031e+03, 1.98469863e+03, 1.03046967e+02,
       4.42224070e+02, 4.71272016e+01, 5.65992172e+02, 1.05534344e+03,
       1.16172211e+03, 3.54725049e+02, 5.69080235e+00, 1.52213796e+03,
       4.18786693e-01, 5.67514677e-02, 1.57632094e+00, 3.82583170e-01,
       2.89432485e+00, 1.04500978e+00, 6.54892368e+00, 6.12524462e-01,
       1.87349902e+03, 1.76418787e+00, 4.69398239e+02, 9.48522505e+01,
       4.73786693e+01, 2.36076321e+01, 3.32583170e+00, 1.56467710e+01,
       1.78669276e+00, 5.58649706e+01, 6.30039139e+00, 2.00783953e+03])

In [8]:
# std values, learnt by the StandardScaler (one per variable in `cols`)
# Newer feature_engine releases keep the fitted estimator in `transformer_`
# and leave `transformer` unfitted; older ones fit `transformer` itself.
# Prefer the fitted attribute and fall back so the cell runs on both.
getattr(scaler, 'transformer_', scaler.transformer).scale_

array([4.21398764e+01, 3.41348001e+01, 1.02862405e+04, 1.35722328e+00,
       1.11803056e+00, 3.02275350e+01, 2.04957916e+01, 1.81223904e+02,
       4.29715456e+02, 1.57876396e+02, 4.32828598e+02, 4.10034673e+02,
       3.73803122e+02, 4.40622334e+02, 4.77726012e+01, 5.17557055e+02,
       5.14714307e-01, 2.35558243e-01, 5.41385993e-01, 4.97950772e-01,
       8.02751557e-01, 2.21031096e-01, 1.61483273e+00, 6.35357186e-01,
       4.43166351e+02, 7.33707095e-01, 2.08708471e+02, 1.28334288e+02,
       6.75958872e+01, 6.42442872e+01, 2.87183246e+01, 5.62094688e+01,
       3.33296043e+01, 5.87014705e+02, 2.70839455e+00, 1.34094155e+00])

In [9]:
# the mean of the scaled variables is 0
# (values of the order of 1e-16 are floating point rounding error, i.e. zero)
X_train[cols].mean()

MSSubClass       9.038215e-17
LotFrontage      5.040543e-17
LotArea         -4.953637e-17
OverallQual     -3.024326e-16
OverallCond     -3.476236e-17
YearBuilt        1.360947e-15
YearRemodAdd    -3.199007e-15
MasVnrArea      -2.954801e-17
BsmtFinSF1       5.996508e-17
BsmtFinSF2      -1.738118e-18
BsmtUnfSF       -6.952473e-18
TotalBsmtSF     -2.780989e-17
1stFlrSF         1.355732e-16
2ndFlrSF        -5.561978e-17
LowQualFinSF    -4.953637e-17
GrLivArea       -7.126285e-17
BsmtFullBath     3.823860e-17
BsmtHalfBath     5.561978e-17
FullBath        -1.199302e-16
HalfBath         5.214355e-18
BedroomAbvGr     1.112396e-16
KitchenAbvGr    -3.319806e-16
TotRmsAbvGrd    -2.468128e-16
Fireplaces      -2.433366e-17
GarageYrBlt     -2.085742e-17
GarageCars      -9.038215e-17
GarageArea      -1.738118e-17
WoodDeckSF      -8.690591e-19
OpenPorchSF     -2.259554e-17
EnclosedPorch   -2.172648e-17
3SsnPorch        1.738118e-18
ScreenPorch      1.738118e-17
PoolArea        -3.302425e-17
MiscVal   

In [10]:
# the std of the scaled variables is ~1
# It is not exactly 1 because pandas' .std() uses the sample estimate
# (ddof=1) by default, whereas the StandardScaler divides by the population
# standard deviation (ddof=0): sqrt(1022 / 1021) ~= 1.00049.

X_train[cols].std()

MSSubClass       1.00049
LotFrontage      1.00049
LotArea          1.00049
OverallQual      1.00049
OverallCond      1.00049
YearBuilt        1.00049
YearRemodAdd     1.00049
MasVnrArea       1.00049
BsmtFinSF1       1.00049
BsmtFinSF2       1.00049
BsmtUnfSF        1.00049
TotalBsmtSF      1.00049
1stFlrSF         1.00049
2ndFlrSF         1.00049
LowQualFinSF     1.00049
GrLivArea        1.00049
BsmtFullBath     1.00049
BsmtHalfBath     1.00049
FullBath         1.00049
HalfBath         1.00049
BedroomAbvGr     1.00049
KitchenAbvGr     1.00049
TotRmsAbvGrd     1.00049
Fireplaces       1.00049
GarageYrBlt      1.00049
GarageCars       1.00049
GarageArea       1.00049
WoodDeckSF       1.00049
OpenPorchSF      1.00049
EnclosedPorch    1.00049
3SsnPorch        1.00049
ScreenPorch      1.00049
PoolArea         1.00049
MiscVal          1.00049
MoSold           1.00049
YrSold           1.00049
dtype: float64