In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for saving the pipeline
import joblib

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import(
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import(
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# show all the columns when a dataframe is displayed
# (uses the public top-level pd.set_option rather than going through pd.pandas)
pd.set_option('display.max_columns', None)

In [2]:
# read the training data from the input directory
input_dir = './input'
train_csv = f'{input_dir}/train.csv'
data = pd.read_csv(train_csv)

# report (rows, columns)
print(data.shape)

# preview the first rows
data.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [3]:
## Separate dataset into train and test

# predictors: everything except the row identifier and the target
predictors = data.drop(columns=['Id', 'SalePrice'])
target = data['SalePrice']

# test_size=0.1 holds out a tenth of the rows; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.1, random_state=42
)

X_train.shape, X_test.shape

((1314, 79), (146, 79))

## Feature Engineering

Let's engineer the features of the House Price Dataset including:

1. Missing Values
2. Temporal Values
3. Non-Gaussian distributed variables (e.g. Target `SalePrice`)
4. Categorical variables: remove rare labels
5. Categorical variables: convert strings to numbers
6. Scale the values of the variables to the same range

### Target: `SalePrice`

In [4]:
# Log-transform the target in BOTH partitions so they are on the same scale.
# The train result must be assigned back to `y_train`: a misspelled name here
# leaves y_train on the raw price scale while y_test is logged.
y_train = np.log(y_train)
y_test = np.log(y_test)

## Missing Values

There are two ways to handle missing values in the categorical variables.

1. For variables with a large proportion of missing values, we replace the missing values with the string "Missing";
2. For variables with few missing values, we fill them with the most frequent category.

In [5]:
# categorical variables: the object-dtype columns, plus MSSubClass, which is
# categorical by definition despite its numeric values
cat_vars = [col for col in data.columns if data[col].dtype == 'O'] + ['MSSubClass']

# cast all categorical variables to object in both partitions
for frame in (X_train, X_test):
    frame[cat_vars] = frame[cat_vars].astype('O')

# number of categorical variables
len(cat_vars)

44

In [6]:
# categorical variables that have at least one missing value in the train set
cat_vars_with_na = [col for col in cat_vars if X_train[col].isnull().any()]

# fraction of missing values per variable, largest first
X_train[cat_vars_with_na].isnull().mean().sort_values(ascending=False)

PoolQC          0.994673
MiscFeature     0.961948
Alley           0.934551
Fence           0.808219
FireplaceQu     0.468798
GarageType      0.055556
GarageFinish    0.055556
GarageQual      0.055556
GarageCond      0.055556
BsmtExposure    0.025114
BsmtQual        0.024353
BsmtCond        0.024353
BsmtFinType1    0.024353
BsmtFinType2    0.024353
MasVnrType      0.006088
Electrical      0.000761
dtype: float64

In [8]:
# fraction of missing values above which NA is replaced by a dedicated label
# instead of the most frequent category
MISSING_THRESHOLD = 0.1

# variables to impute with the string missing
with_string_missing = [
    var for var in cat_vars_with_na
    if X_train[var].isnull().mean() > MISSING_THRESHOLD
]

# every other variable with NA gets the most frequent category; defined as the
# complement so a variable sitting exactly on the threshold is not left out
# of both lists (and therefore left un-imputed)
with_frequent_category = [
    var for var in cat_vars_with_na if var not in with_string_missing
]

cat_imputer_missing = CategoricalImputer(imputation_method='missing', variables=with_string_missing)

cat_imputer_missing.fit(X_train)

# replace NA by missing
# IMPORTANT: note that we could store this class with joblib
X_train = cat_imputer_missing.transform(X_train)
X_test = cat_imputer_missing.transform(X_test)

In [9]:
# impute the remaining categorical variables with their most frequent category
cat_imputer_frequent = CategoricalImputer(
    imputation_method='frequent',
    variables=with_frequent_category,
)

# fit on the train set only, then apply to both partitions
X_train = cat_imputer_frequent.fit_transform(X_train)
X_test = cat_imputer_frequent.transform(X_test)

In [10]:
# confirm that no missing values remain in the imputed categorical variables
X_train[cat_vars_with_na].isna().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

## Numerical Variables

To engineer missing values in numerical variables, we will:

* add a binary missing indicator variable
* and then replace the missing values in the original variable with the mean

In [12]:
# numerical variables: every column that is not in the categorical list
num_vars = [col for col in X_train.columns if col not in cat_vars]

len(num_vars)

35

In [13]:
# numerical variables that have at least one missing value in the train set
vars_with_na = [col for col in num_vars if X_train[col].isnull().any()]

# fraction of missing values per variable
X_train[vars_with_na].isnull().mean()

LotFrontage    0.180365
MasVnrArea     0.006088
GarageYrBlt    0.055556
dtype: float64

In [14]:
## add a binary missing indicator for each numerical variable with NA

missing_ind = AddMissingIndicator(variables=vars_with_na)

# fit on the train set only, then apply to both partitions
X_train = missing_ind.fit_transform(X_train)
X_test = missing_ind.transform(X_test)

# inspect the new indicator columns
indicator_cols = ['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']
X_train[indicator_cols].head()

Unnamed: 0,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
907,0,0,0
782,0,0,0
952,0,0,0
620,0,0,1
669,0,0,0


In [15]:
# fill the missing values of the numerical variables with the mean
mean_imputer = MeanMedianImputer(
    imputation_method='mean',
    variables=vars_with_na,
)

# the imputation values are learnt from the train set and reused for the test set
X_train = mean_imputer.fit_transform(X_train)
X_test = mean_imputer.transform(X_test)

# confirm that no missing values remain in the imputed variables
X_train[vars_with_na].isna().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

## Temporal variables

Capture elapsed time

In [21]:
def elapsed_years(df, var):
    """Return a copy of ``df`` where ``var`` holds the years elapsed until the sale.

    The column ``var`` (a year) is replaced by ``df['YrSold'] - df[var]``.
    The input dataframe is left untouched, so re-running a cell that calls
    this function on the same input gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'YrSold'`` column and the column named ``var``.
    var : str
        Name of the year column to convert into an elapsed-years column.

    Returns
    -------
    pandas.DataFrame
        New dataframe with the same columns, in the same order, as ``df``.
    """
    # assign() works on a copy; the dict form allows column names that are not
    # valid Python identifiers
    return df.assign(**{var: df['YrSold'] - df[var]})

In [23]:
# year columns to convert into "years elapsed until sale"
temporal_vars = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

for var in temporal_vars:
    X_train, X_test = elapsed_years(X_train, var), elapsed_years(X_test, var)

In [25]:
# YrSold is dropped now that the elapsed-time variables have been derived from it
drop_features = DropFeatures(features_to_drop=['YrSold'])
drop_features.fit(X_train)

X_train, X_test = drop_features.transform(X_train), drop_features.transform(X_test)

## Numerical variable transformation

### Logarithmic Transformation

Let's transform the positive numerical variables with the logarithm in order to get a more Gaussian-like distribution. The target columns are `LotFrontage`, `1stFlrSF` and `GrLivArea`.

In [26]:
# log-transform the selected positive variables
log_transformer = LogTransformer(
    variables=["LotFrontage", "1stFlrSF", "GrLivArea"],
)
log_transformer.fit(X_train)

X_train, X_test = log_transformer.transform(X_train), log_transformer.transform(X_test)

In [27]:
# list any log-transformed variable that contains nulls in the test set
# (an empty list means there are none)
[col for col in ("LotFrontage", "1stFlrSF", "GrLivArea") if X_test[col].isnull().any()]

[]

## Yeo-Johnson Transformation

We will apply Yeo-Johnson transformation to LotArea.

In [28]:
# Yeo-Johnson transformation of LotArea: fit on the train set, apply to both partitions
yeo_transformer = YeoJohnsonTransformer(variables=['LotArea'])
yeo_transformer.fit(X_train)

X_train, X_test = yeo_transformer.transform(X_train), yeo_transformer.transform(X_test)

  loglike = -n_samples / 2 * np.log(trans.var(axis=0))
  w = xb - ((xb - xc) * tmp2 - (xb - xa) * tmp1) / denom
  tmp1 = (x - w) * (fx - fv)
  tmp2 = (x - v) * (fx - fw)


## Binarize skewed variables

A few variables are very skewed; we will transform them into binary variables.

In [29]:
# very skewed variables that are turned into binary flags
skewed = [
    'BsmtFinSF2',
    'LowQualFinSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'MiscVal',
]

# the wrapper applies the scikit-learn Binarizer to the listed columns only
binarizer = SklearnTransformerWrapper(
    transformer=Binarizer(threshold=0),
    variables=skewed,
)
binarizer.fit(X_train)

X_train, X_test = binarizer.transform(X_train), binarizer.transform(X_test)

X_train[skewed].head()

Unnamed: 0,BsmtFinSF2,LowQualFinSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal
907,0,0,0,0,1,0
782,0,0,0,0,0,0
952,0,0,0,0,0,0
620,0,0,1,0,0,0
669,0,0,1,0,0,0


## Categorical variables

Apply mappings to the categorical variables that have an assigned order.

In [30]:
# quality labels mapped to ordered numbers (0 = missing / not applicable)
qual_mappings = {
    'Missing': 0,
    'NA': 0,
    'Po': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5,
}

qual_vars = [
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond',
]

# apply the same mapping to every quality variable in both partitions
for var in qual_vars:
    for frame in (X_train, X_test):
        frame[var] = frame[var].map(qual_mappings)

In [31]:
# basement exposure labels mapped to ordered numbers.
# 'Missing' and 'NA' are mapped to 0, as in the other ordinal mappings of this
# notebook, so that these labels do not silently turn into NaN in .map()
exposure_mappings = {'Missing': 0, 'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

var = 'BsmtExposure'

X_train[var] = X_train[var].map(exposure_mappings)
X_test[var] = X_test[var].map(exposure_mappings)

In [32]:
# basement finish labels mapped to ordered numbers (0 = missing / not applicable)
finish_mappings = {
    'Missing': 0,
    'NA': 0,
    'Unf': 1,
    'LwQ': 2,
    'Rec': 3,
    'BLQ': 4,
    'ALQ': 5,
    'GLQ': 6,
}

finish_vars = ['BsmtFinType1', 'BsmtFinType2']

# apply the mapping to both basement finish variables in both partitions
for var in finish_vars:
    for frame in (X_train, X_test):
        frame[var] = frame[var].map(finish_mappings)

In [33]:
# garage finish labels mapped to ordered numbers (0 = missing / not applicable)
garage_mappings = {
    'Missing': 0,
    'NA': 0,
    'Unf': 1,
    'RFn': 2,
    'Fin': 3,
}

var = 'GarageFinish'

for frame in (X_train, X_test):
    frame[var] = frame[var].map(garage_mappings)

In [34]:
# fence labels mapped to ordered numbers (0 = missing / not applicable)
fence_mappings = {
    'Missing': 0,
    'NA': 0,
    'MnWw': 1,
    'GdWo': 2,
    'MnPrv': 3,
    'GdPrv': 4,
}

var = 'Fence'

for frame in (X_train, X_test):
    frame[var] = frame[var].map(fence_mappings)

## Removing Rare Labels

For the remaining categorical variables, we will group those categories that are present in less than 1% of the observations. That is, all values of categorical variables that are shared by less than 1% of houses will be replaced by the string "Rare".

In [35]:
# all the variables that were already mapped to ordered numbers
qual_vars = [*qual_vars, *finish_vars, 'BsmtExposure', 'GarageFinish', 'Fence']

# the categorical variables that still hold string labels
cat_others = [col for col in cat_vars if col not in qual_vars]

len(cat_others)

30

In [36]:
# encoder for infrequent labels in the remaining categorical variables
# (tol=0.01 is the 1% frequency cut-off)
rare_encoder = RareLabelEncoder(
    variables=cat_others,
    tol=0.01,
    n_categories=1,
)

# learn the frequent labels from the train set
rare_encoder.fit(X_train)

RareLabelEncoder(n_categories=1, tol=0.01,
                 variables=['MSZoning', 'Street', 'Alley', 'LotShape',
                            'LandContour', 'Utilities', 'LotConfig',
                            'LandSlope', 'Neighborhood', 'Condition1',
                            'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
                            'RoofMatl', 'Exterior1st', 'Exterior2nd',
                            'MasVnrType', 'Foundation', 'Heating', 'CentralAir',
                            'Electrical', 'Functional', 'GarageType',
                            'PavedDrive', 'PoolQC', 'MiscFeature', 'SaleType',
                            'SaleCondition', 'MSSubClass'])

In [37]:
# apply the rare-label grouping to both partitions
X_train, X_test = rare_encoder.transform(X_train), rare_encoder.transform(X_test)

## Encoding of categorical variables

Next, we need to transform the strings of the categorical variables into numbers.

We will do it so that we capture the monotonic relationship between the label and the target.

In [38]:
# ordinal encoder for the remaining categorical variables; the 'ordered'
# method needs the target, which is why y_train is passed to fit
cat_encoder = OrdinalEncoder(
    variables=cat_others,
    encoding_method='ordered',
)

# learn the label-to-number mappings from the train set
cat_encoder.fit(X_train, y_train)

OrdinalEncoder(variables=['MSZoning', 'Street', 'Alley', 'LotShape',
                          'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
                          'Neighborhood', 'Condition1', 'Condition2',
                          'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
                          'Exterior1st', 'Exterior2nd', 'MasVnrType',
                          'Foundation', 'Heating', 'CentralAir', 'Electrical',
                          'Functional', 'GarageType', 'PavedDrive', 'PoolQC',
                          'MiscFeature', 'SaleType', 'SaleCondition',
                          'MSSubClass'])

In [39]:
# replace the string labels by their learnt numbers in both partitions
X_train, X_test = cat_encoder.transform(X_train), cat_encoder.transform(X_test)

## Feature Scaling

We scale features to the minimum and maximum values. 

In [40]:
# create scaler
scaler = MinMaxScaler()

# fit the scaler to the train set only
scaler.fit(X_train)

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the array with a pandas dataframe.
# The original row index is passed along explicitly: without it the frames
# get a fresh 0..n-1 index and no longer align by label with y_train / y_test,
# which still carry the index of the original data.

X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_train.columns,
    index=X_test.index,
)