In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder

from feature_engine import categorical_encoders as ce
import feature_engine.missing_data_imputers as mdi

from sklearn.pipeline import Pipeline

The aim of this notebook is to clarify some questions regarding specific aspects of some **Feature Engineering** steps, towards improving model's performance.

The questions raised and clarified are:
- Question 1: Comparison of the performance of **One Hot Encoding** vs **Top One Hot Encoding** vs **Ordinal Encoding + Rare Label Encoding** with a high number of features

- Question 2: Importance of **Monotonic relationship** between the features and target in **Linear Models (Lasso)** and **Tree Based models (Random Forest)**

In [29]:
data = pd.read_csv("house-prices-advanced-regression-techniques/train.csv")
data.shape

(1460, 81)

In [30]:
data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


**Analysis of the variable types**

In [31]:


#Special Case
year_vars = [v for v in data.columns if 'Yr' in v or 'Year' in v]

categorical_vars = [v for v in data.columns if data[v].dtype == 'O']

discrete_vars = [
    v for v in data.columns if v not in categorical_vars and v not in year_vars and len(data[v].unique()) < 15
]

continuous_vars = [
    v for v in data.columns if v not in categorical_vars and v not in year_vars and v not in discrete_vars and v not in ['Id', 'SalePrice']
]

print('Nº Continuous variables: {}'.format(len(continuous_vars)))
print('Nº of Discrete variables: {}'.format(len(discrete_vars)))
print('Nº of Categorical variables: {}'.format(len(categorical_vars)))

print('Nº of variables with Year information: {}'.format(len(year_vars)))

Nº Continuous variables: 19
Nº of Discrete variables: 13
Nº of Categorical variables: 43
Nº of variables with Year information: 4


Altough they are discrete variables, the variables with **Year** information would be considered continuous due to the high cardinality. Therefore, a special category *year_vars* was created

In [32]:
discrete_vars

['OverallQual',
 'OverallCond',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageCars',
 'PoolArea',
 'MoSold']

By analysing the Discrete Variables, we can see that there is a meaning and relation between the numbers in these columns. That is, the rows where the variable is 4 are closer to the rows where the variable is 3, than the variables where the variable is 1. Therefore, none of these Discrete Variables is going to be considered Categorical

In [33]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.2,
    random_state=0)

In [34]:
#analysis of categorical variables with no Missing Values
#only the features with no Missing Values will be used

null_percentages = X_train[categorical_vars].isnull().sum() / len(X_train)
null_percentages.sort_values(na_position='first', ascending=True, inplace=True)


categories_info = np.array([[index, len(X_train[index].unique())] for index, value in null_percentages.items() if value == 0])
print("Features Cardinality:\n\n", categories_info)
categorical_indexes = categories_info[:, 0]

X_train_q1 = X_train[categorical_indexes]
X_test_q1 = X_test[categorical_indexes]

X_train_q2 = X_train[categorical_indexes]
X_test_q2 = X_test[categorical_indexes]


Features Cardinality:

 [['MSZoning' '5']
 ['PavedDrive' '3']
 ['Functional' '6']
 ['KitchenQual' '4']
 ['CentralAir' '2']
 ['HeatingQC' '5']
 ['Heating' '6']
 ['SaleType' '9']
 ['Foundation' '6']
 ['ExterCond' '5']
 ['ExterQual' '4']
 ['Exterior2nd' '16']
 ['Exterior1st' '15']
 ['RoofMatl' '7']
 ['SaleCondition' '6']
 ['HouseStyle' '8']
 ['Street' '2']
 ['LotShape' '4']
 ['RoofStyle' '6']
 ['Utilities' '2']
 ['LotConfig' '5']
 ['LandContour' '4']
 ['BldgType' '5']
 ['LandSlope' '3']
 ['Neighborhood' '25']
 ['Condition1' '9']
 ['Condition2' '6']]


## Question 1:

Comparison of the performance of **One Hot Encoding** vs **Top One Hot Encoding** vs **Ordinal Encoding + Rare Label Encoding** with a high number of features

<br><br>


In [35]:
#1 One Hot Encoding

#One Hot Encoding
encoder = OneHotEncoder(categories='auto', handle_unknown='ignore', sparse=False)
encoder.fit(X_train_q1)

X_train_ohe = pd.DataFrame(encoder.transform(X_train_q1))
X_test_ohe = pd.DataFrame(encoder.transform(X_test_q1))


lasso_model = Lasso(random_state=0)
lasso_model.fit(X_train_ohe, y_train)


ohe_results = lasso_model.predict(X_test_ohe)

  positive)


In [36]:
#2 Top One Hot Encoding

n_top = 7

#Top One Hot Encoding
encoder = ce.OneHotCategoricalEncoder(top_categories = n_top)
encoder.fit(X_train_q1)

X_train_top_ohe = pd.DataFrame(encoder.transform(X_train_q1))
X_test_top_ohe = pd.DataFrame(encoder.transform(X_test_q1))


#Lasso model
lasso_model = Lasso(random_state=0)
lasso_model.fit(X_train_top_ohe, y_train)


top_ohe_results = lasso_model.predict(X_test_top_ohe)

  positive)


In [37]:
#3 Ordinal Encoding + Rare Label Encoding

#Rare Label Encoding
rare_encoder = ce.RareLabelCategoricalEncoder(n_categories = 5)
rare_encoder.fit(X_train_q1)

X_train_ordinal = rare_encoder.transform(X_train_q1)
X_test_ordinal = rare_encoder.transform(X_test_q1)

#Ordinal Encoding
encoder = ce.OrdinalCategoricalEncoder()
encoder.fit(X_train_ordinal, y_train)

X_train_ordinal = pd.DataFrame(encoder.transform(X_train_ordinal))
X_test_ordinal = pd.DataFrame(encoder.transform(X_test_ordinal))


#Lasso model
lasso_model = Lasso(random_state=0)
lasso_model.fit(X_train_ordinal, y_train)


ordinal_results = lasso_model.predict(X_test_ordinal)

In [38]:
#Compare the Results of the three Approaches

print("Shape Approach 1: ", X_train_ohe.shape)
print("Shape Approach 2: ", X_train_top_ohe.shape)
print("Shape Approach 3: ", X_train_ordinal.shape)

mse_approach_1 = mean_squared_error(y_true = y_test, y_pred = ohe_results)
mse_approach_2 = mean_squared_error(y_true = y_test, y_pred = top_ohe_results)
mse_approach_3 = mean_squared_error(y_true = y_test, y_pred = ordinal_results)


print("\nMSE Approach 1: ", mse_approach_1, "\nMSE Approach 2: ", mse_approach_2, "\nMSE Approach 3: ", mse_approach_3)

Shape Approach 1:  (1168, 178)
Shape Approach 2:  (1168, 138)
Shape Approach 3:  (1168, 27)

MSE Approach 1:  2581889240.312276 
MSE Approach 2:  2566542069.838165 
MSE Approach 3:  3028271362.8342156


<br><br>


As we may have suspected, when the data dimensionality is high (significant number of features), the **One Hot Encoding** method increases significantly the feature space, which will increase the sparsity in the training data and therefore decrease the model performance.

In these cases, it may be better to one hot encode only the *top n* categories of each feature and group the remaining in one single category, which is what **Top One Hot Encoding** does. This way, the feature space will not be increase so significantly, which will, in this case, lead to a slight increase in the model's performance.

The last approach **Ordinal Encoding + Rare Label Encoding** was the one with the lower performance, this was expected as by performing ordinal encoding, some relationships between the encoded categories (1, 2, 3, ...) may be created, when, in the reality, they do not exist.


## Question 2:

Importance of **Monotonic relationship** between the features and target in **Linear Models (Lasso)** and **Tree Based models (Random Forest)**


**<br><br>Lasso - Linear Model**

In [39]:


#1. Ordinal encoding is arbitrarly
q2_pipe_arbitrary = Pipeline([
    
    ('rare_label_enc',
     ce.RareLabelCategoricalEncoder(tol=0.05,
                                    n_categories=5)),
    ('categorical_enc',
     ce.OrdinalCategoricalEncoder(encoding_method = 'arbitrary')),

    ('lasso', Lasso(random_state=0))
])


#2. Ordinal encoding is performed aiming to create a monotonic relationship between features and variables
q2_pipe_ordered = Pipeline([
    
    ('rare_label_enc',
     ce.RareLabelCategoricalEncoder(tol=0.05,
                                    n_categories=5)),
    ('categorical_enc',
     ce.OrdinalCategoricalEncoder(encoding_method = 'ordered')),

    ('lasso', Lasso(random_state=0))
])


q2_pipe_arbitrary.fit(X_train_q2, y_train)
q2_pipe_ordered.fit(X_train_q2, y_train)

X_test_preds_arbitrary = q2_pipe_arbitrary.predict(X_test_q2)
X_test_preds_ordered = q2_pipe_ordered.predict(X_test_q2)

mse_approach_1 = mean_squared_error(y_true = y_test, y_pred = X_test_preds_arbitrary)
mse_approach_2 = mean_squared_error(y_true = y_test, y_pred = X_test_preds_ordered)

print("\nMSE Approach 1: ", mse_approach_1, "\nMSE Approach 2: ", mse_approach_2)


MSE Approach 1:  4388424052.668934 
MSE Approach 2:  3028271362.8342156


As we can conclude, by guaranteeing a monotonic relationship between features and variables **(Approach 2)** the results improve signficantly comparing to arbitrary encoding **(Approach 1)**. This means that, when using **Linear Models (Lasso)**, we should aim to establish a monotonic relatioship between variables.

**<br><br>Random Forest - Tree-based Model**

In [40]:
#For simplicity a Pipeline will be created for training the RF model

#1. Ordinal encoding is arbitrarly
q2_pipe_arbitrary = Pipeline([
    
    ('rare_label_enc',
     ce.RareLabelCategoricalEncoder(tol=0.05,
                                    n_categories=5)),
    ('categorical_enc',
     ce.OrdinalCategoricalEncoder(encoding_method = 'arbitrary')),

    ('RF', RandomForestRegressor(random_state=0))
])


#2. Ordinal encoding is performed aiming to create a monotonic relationship between features and variables
q2_pipe_ordered = Pipeline([
    
    ('rare_label_enc',
     ce.RareLabelCategoricalEncoder(tol=0.05,
                                    n_categories=5)),
    ('categorical_enc',
     ce.OrdinalCategoricalEncoder(encoding_method = 'ordered')),

    ('lasso', Lasso(random_state=0))
])


q2_pipe_arbitrary.fit(X_train_q2, y_train)
q2_pipe_ordered.fit(X_train_q2, y_train)

X_test_preds_arbitrary = q2_pipe_arbitrary.predict(X_test_q2)
X_test_preds_ordered = q2_pipe_ordered.predict(X_test_q2)

mse_approach_1 = mean_squared_error(y_true = y_test, y_pred = X_test_preds_arbitrary)
mse_approach_2 = mean_squared_error(y_true = y_test, y_pred = X_test_preds_ordered)

print("\nMSE Approach 1: ", mse_approach_1, "\nMSE Approach 2: ", mse_approach_2)




MSE Approach 1:  2726198411.3998837 
MSE Approach 2:  3028271362.8342156


Concerning non linear models, as **Tree-based models (Random Forest)**, we can conclude that the monotonic relationship is not an requirement for creating the model. In fact, the result of the **Approach 2** (non monotonic relationships between variables) is better comparing to **Approach 1** (monotonic relationships between variables), although the difference is not significant.

<br><br>
In conclusion, we confirmed that, when dealing with **Linear Models**, it is important to garantee a monotonic relationship between each independent variable and the target. When dealing with **Non Linear Model**s, as Tree-based models, the monotonic relationship in the input data is not a mandatory.