In [48]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt 
#import seaborn as sns
#import plotly.express as px
from pprint import pprint

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from functions import df_engineered, r2rmse_scores

import pickle
import time

In [2]:
feature_selection = ['ExterQual', 'BsmtQual', 'KitchenQual', 'OverallQual', 
                    'GrLivArea', 'TotalBsmtSF', 'GarageArea', 'FullBath', 
                    'HouseAge', 'TotRmsAbvGrd', 'OverallCond',  'FlrSF1st', 'FlrSF2nd',
                    'Fireplaces', 'HasFireplace','LotFrontage', 'LotArea', 'MSSubClass', 'GoodGarageType', 
                    'BsmtUnfSF', 'Zone', 'Location', 'CulDSac', 'ExQual', 'RemodAge', 'LargerHouse', 
                    'Remod', 'ExBsmtQual', 'TwoStory', 'TotalSF', 'RoadRail', 'ExKitchen', 'CentralAir',
                    'TotalBath', 'Exterior1st_top']
                        

In [3]:
df = pd.read_csv('data/Ames_Housing_Price_Data.csv', index_col=0).iloc[:,1:]
print('df shape:', df.shape)

data = df_engineered(df, Reg=False)

df shape: (2580, 80)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(data[feature_selection], data['SalePrice'], test_size=0.30, random_state=42)

## Random Forest

In [5]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]

max_features = ['sqrt', 'log2']

max_depth = [int(x) for x in np.linspace(30, 210, num = 11)]

max_depth.append(None)

min_samples_split = [2, 5, 10, 15]

min_samples_leaf = [4, 8, 12, 15]

bootstrap = [True, False]


random_grid = [{'n_estimators': n_estimators,
               'max_features': max_features,
               'bootstrap': bootstrap,
               'max_features':max_features,
               'min_samples_split': min_samples_split},
               {
               'n_estimators': n_estimators,
               'max_features': max_features,
               'bootstrap': bootstrap,   
               'max_features':max_features,
               'min_samples_leaf': min_samples_leaf}]

pprint(random_grid)

[{'bootstrap': [True, False],
  'max_features': ['sqrt', 'log2'],
  'min_samples_split': [2, 5, 10, 15],
  'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]},
 {'bootstrap': [True, False],
  'max_features': ['sqrt', 'log2'],
  'min_samples_leaf': [4, 8, 12, 15],
  'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}]


In [6]:
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator=rf, 
                               param_distributions=random_grid, 
                               n_iter=100, 
                               cv=3, 
                               verbose=1, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

r2rmse_scores(rf_random, X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
--------------------------------------------------
5-fold Cross Validation Scoring
Mean R^2 score: 0.9184476007207808
Mean RMSE score: 20066.266228465167
--------------------------------------------------


In [16]:
pprint(rf_random.best_params_)

{'bootstrap': False,
 'max_features': 'log2',
 'min_samples_split': 5,
 'n_estimators': 100}


In [42]:
yhat_train = rf_random.predict(X_train)
yhat_test = rf_random.predict(X_test)

train_mse_non_log = mean_squared_error(y_train, yhat_train, squared=False)
test_mse_non_log = mean_squared_error(y_test, yhat_test, squared=False)

print(train_mse_non_log)
print(test_mse_non_log)

4444.534050784377
19929.86671786943


In [78]:
#file_name = "rf_reg_{}.pkl".format(time.strftime("%Y%m%d-%H.%M.%S"))

# save
#pickle.dump(rf_random, open(file_name, "wb"))

# load
rf_model_loaded = pickle.load(open(file_name, "rb"))

In [79]:
yhat_train = rf_model_loaded.predict(X_train)
yhat_test = rf_model_loaded.predict(X_test)

train_mse_non_log = mean_squared_error(y_train, yhat_train, squared=False)
test_mse_non_log = mean_squared_error(y_test, yhat_test, squared=False)

print(train_mse_non_log)
print(test_mse_non_log)

4444.534050784377
19929.86671786943


## XGBoost

In [60]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 100, num = 10)]

max_features = ['sqrt', 'log2']

max_depth = [int(x) for x in np.linspace(30, 210, num = 11)]

max_depth.append(None)

min_samples_split = [2, 5, 10, 15]

min_samples_leaf = [4, 8, 12, 15]

learning_rate = [0.1, 0.01, 0.5]

alpha = [0.1, 0.3, 0.5, 0.9]


random_grid = [{'n_estimators': n_estimators,
               'max_features': max_features,
               'max_features':max_features,
               'learning_rate':learning_rate,
               'alpha':alpha,
               'min_samples_split': min_samples_split},
               {
               'n_estimators': n_estimators,
               'max_features': max_features, 
               'max_features':max_features,
               'learning_rate':learning_rate,
               'alpha':alpha,
               'min_samples_leaf': min_samples_leaf}]

pprint(random_grid)

[{'alpha': [0.1, 0.3, 0.5, 0.9],
  'learning_rate': [0.1, 0.01, 0.5],
  'max_features': ['sqrt', 'log2'],
  'min_samples_split': [2, 5, 10, 15],
  'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]},
 {'alpha': [0.1, 0.3, 0.5, 0.9],
  'learning_rate': [0.1, 0.01, 0.5],
  'max_features': ['sqrt', 'log2'],
  'min_samples_leaf': [4, 8, 12, 15],
  'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]}]


In [61]:
xgb = GradientBoostingRegressor()

xgb_random = RandomizedSearchCV(estimator=xgb, 
                               param_distributions=random_grid, 
                               n_iter=100, 
                               cv=3, 
                               verbose=1, 
                               random_state=42, 
                               n_jobs = -1)

xgb_random.fit(X_train, y_train)

r2rmse_scores(xgb_random, X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
--------------------------------------------------
5-fold Cross Validation Scoring
Mean R^2 score: 0.9258030987715001
Mean RMSE score: 18528.54698705896
--------------------------------------------------


In [62]:
pprint(xgb_random.best_params_)

{'alpha': 0.1,
 'learning_rate': 0.1,
 'max_features': 'sqrt',
 'min_samples_split': 10,
 'n_estimators': 100}


In [68]:
yhat_train = xgb_random.predict(X_train)
yhat_test = xgb_random.predict(X_test)

train_mse_non_log = mean_squared_error(y_train, yhat_train, squared=False)
test_mse_non_log = mean_squared_error(y_test, yhat_test, squared=False)

print(train_mse_non_log)
print(test_mse_non_log)

13598.591504674852
17822.03155802156


In [71]:
#file_name = "xgb_reg_{}.pkl".format(time.strftime("%Y%m%d-%H.%M.%S"))

# save
#pickle.dump(xgb_random, open(file_name, "wb"))

# load
xgb_model_loaded = pickle.load(open(file_name, "rb"))

In [77]:
yhat_train = xgb_model_loaded.predict(X_train)
yhat_test = xgb_model_loaded.predict(X_test)

train_mse_non_log = mean_squared_error(y_train, yhat_train, squared=False)
test_mse_non_log = mean_squared_error(y_test, yhat_test, squared=False)

print(train_mse_non_log)
print(test_mse_non_log)

13598.591504674852
17822.03155802156
