In [24]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, cross_validate

from statsmodels.stats.outliers_influence import variance_inflation_factor

In [25]:
data = pd.read_csv('model_data.csv', index_col='PID')
data.shape

(2579, 20)

#### Independent Variable (y)

In [26]:
y = data.SalePrice.copy()
data.drop('SalePrice', axis=1, inplace=True)

## ðŸŒ² Prepare for RandomForest

In [27]:
from sklearn import ensemble
from sklearn.preprocessing import LabelEncoder

In [28]:
data_le = data.copy()
data_le.columns

Index(['MSSubClass', 'Foundation', 'PavedDrive', 'BsmtUnfSF', 'AllBathBsmt',
       'AllBathAbv', 'HeatingQC', 'Neighborhood', 'YearBuilt', 'GarageCars',
       'PorchArea', 'GoodLivArea', 'CentralAir', 'KitchenQual', 'ExterQual',
       'BsmtCond', 'FireplaceQu', 'GarageQual', 'HasPool'],
      dtype='object')

In [29]:
# Label CentralAir Y or N with 1 or 0
data_le.CentralAir = data_le.CentralAir.apply(lambda x: 1 if x=='Y' else 0, )

In [30]:
# Use LabelEncoder on Categorical features
le = LabelEncoder()
data_le.MSSubClass = le.fit_transform(data.MSSubClass)
#data_le.GarageType = le.fit_transform(data.GarageType)
data_le.Foundation = le.fit_transform(data.Foundation)
data_le.PavedDrive = le.fit_transform(data.PavedDrive)
data_le.Neighborhood = le.fit_transform(data.Neighborhood)
# Manually Encode Ordinal features
#data_le.BsmtQual = data_le.BsmtQual.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5,np.NaN:0})
data_le.HeatingQC = data_le.HeatingQC.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
data_le.KitchenQual = data_le.KitchenQual.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
data_le.ExterQual = data_le.ExterQual.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
data_le.BsmtCond = data_le.BsmtCond.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5,'None':0})
data_le.FireplaceQu = data_le.FireplaceQu.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5,'None':0})
data_le.GarageQual = data_le.GarageQual.replace({'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5,'None':0})

In [31]:
X_train, X_test, y_train, y_test = train_test_split(data_le, y, test_size=0.25)

In [32]:
forest = ensemble.RandomForestRegressor()
forest.set_params(n_estimators=50, random_state=42, max_features=10)
forest.fit(X_train, y_train)
print("The training r2: %.5f" %(forest.score(X_train, y_train)))
print("The test     r2: %.5f" %(forest.score(X_test, y_test)))

The training r2: 0.98396
The test     r2: 0.83247


### Some Overfit?

In [33]:
forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 10,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
grid_para_forest = [{
    "max_features":np.arange(8,15,1),
    "max_depth": np.linspace(start=6, stop=30, num=13, dtype=int)}]
gs_forest = GridSearchCV(forest, grid_para_forest, scoring='r2', cv=4, n_jobs=-1)
%time gs_forest.fit(X_train, y_train)

Wall time: 24.3 s


GridSearchCV(cv=4,
             estimator=RandomForestRegressor(max_features=10, n_estimators=50,
                                             random_state=42),
             n_jobs=-1,
             param_grid=[{'max_depth': array([ 6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]),
                          'max_features': array([ 8,  9, 10, 11, 12, 13, 14])}],
             scoring='r2')

In [36]:
gs_forest.best_params_

{'max_depth': 20, 'max_features': 8}

In [37]:
print("The training r2: %.5f" % (gs_forest.score(X_train, y_train)))
print("The test     r2: %.5f" % (gs_forest.score(X_test, y_test)))

The training r2: 0.98381
The test     r2: 0.83786


In [38]:
pd.DataFrame({'Feat':X_train.columns,'Importance':gs_forest.best_estimator_.feature_importances_}).\
sort_values(by='Importance', ascending=False)

Unnamed: 0,Feat,Importance
11,GoodLivArea,0.367879
9,GarageCars,0.145803
14,ExterQual,0.127889
8,YearBuilt,0.092827
13,KitchenQual,0.080107
5,AllBathAbv,0.035407
3,BsmtUnfSF,0.03136
16,FireplaceQu,0.029795
10,PorchArea,0.027351
7,Neighborhood,0.018925


## ðŸš€ Boosting

In [115]:
from sklearn.ensemble import GradientBoostingRegressor

In [116]:
gbm = GradientBoostingRegressor()
gbm.fit(X_train, y_train)
gbm.score(X_test, y_test)

0.8854790428596276

In [117]:
gbm.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [118]:
grid_para_boost = [{
    "learning_rate":[0.1,0.25,0.5,1],
    "max_depth": np.linspace(start=6, stop=30, num=13, dtype=int)}]
gs_boost = GridSearchCV(gbm, grid_para_boost, scoring='r2', cv=5, n_jobs=-1)
%time gs_boost.fit(X_train, y_train)

Wall time: 1min 5s


GridSearchCV(cv=5, estimator=GradientBoostingRegressor(), n_jobs=-1,
             param_grid=[{'learning_rate': [0.1, 0.25, 0.5, 1],
                          'max_depth': array([ 6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])}],
             scoring='r2')

In [119]:
gs_boost.best_params_

{'learning_rate': 0.1, 'max_depth': 6}

In [120]:
print("The training r2: %.5f" % (gs_boost.score(X_train, y_train)))
print("The test     r2: %.5f" % (gs_boost.score(X_test, y_test)))

The training r2: 0.98737
The test     r2: 0.88791
