In [2]:
## We run a bunch of grid searches. At the end, the best model is not taken from them. Model params are given from the grid searches if relevant
## It would have to be ran again to get the CV scores for each of the models (perhaps try a grid search with the best params from each of the grid searches if this
## is to be done)

import sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


## These have to be changed once out of kaggle
# df = pd.read_csv('/kaggle/input/whynot/features_dummified_214col.csv')
# df2 = pd.read_csv('/kaggle/input/whynot/cleaned_housing.csv')

df = pd.read_csv('./data/features_dummified_214col.csv')
df2 = pd.read_csv('./data/cleaned_housing.csv')

In [3]:
RF = RandomForestRegressor()

target = df2['SalePrice']

X_train, X_test, y_train, y_test= train_test_split(df, target, test_size=0.3, random_state = 0)


In [8]:
from sklearn.model_selection import GridSearchCV
param_grid = {'min_samples_split':[2, 5, 7, 10], 
              'min_samples_leaf':[1, 2, 5, 8], 
              'n_estimators':[50, 100, 200], 
              'max_depth':[2, 4, 6],
              'max_features': ['auto', 'log2']}

CV_RF = GridSearchCV(estimator=RF, param_grid=param_grid, cv= 5)
CV_RF.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [2, 4, 6],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 5, 8],
                         'min_samples_split': [2, 5, 7, 10],
                         'n_estimators': [50, 100, 200]})

In [9]:
CV_RF.best_estimator_

RandomForestRegressor(max_depth=6, min_samples_leaf=2, min_samples_split=7)

In [10]:
CV_RF.best_score_

0.8627897437813207

In [17]:
best_est = CV_RF.best_estimator_

best_est.score(X_train,y_train)


0.9311257061944435

In [16]:
CV_RF.best_params_

{'max_depth': 6,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 7,
 'n_estimators': 100}

In [18]:
param_grid = {'min_samples_split':[6, 7, 8, 9, 10], 
              'min_samples_leaf':[5, 8, 10], 
              'n_estimators':[200, 400], 
              'max_depth':[5, 7, 10, 12],
              'max_features': ['auto', 'log2']}

CV_RF2 = GridSearchCV(estimator=RF, param_grid=param_grid, cv= 10)
CV_RF2.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [5, 7, 10, 12],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 8, 10],
                         'min_samples_split': [6, 7, 8, 9, 10],
                         'n_estimators': [200, 400]})

In [110]:
CV_RF2.best_estimator_.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(max_depth=12, min_samples_leaf=5, min_samples_split=8,
                      n_estimators=400)>

In [38]:
CV_RF2.best_estimator_.score(X_test,y_test)

0.8965356786983262

In [114]:
RF3 = RandomForestRegressor( n_jobs=-1, random_state=0)

RandomForestRegressor(min_samples_leaf=3, min_samples_split=10, n_jobs=-1,
                      random_state=0)

In [116]:
param_grid = {'min_samples_split':[3, 5, 8, 9, 10], 
              'min_samples_leaf':[5, 8, 10], 
              'n_estimators':[100, 200, 400], 
              'max_depth':[5, 7, 10, 12, 15, None],
              'max_features': ['auto', 'sqrt', 'log2']}

CV_RF3 = GridSearchCV(estimator=RF3, param_grid=param_grid, cv= 10)
CV_RF3.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=RandomForestRegressor(min_samples_leaf=3,
                                             min_samples_split=10, n_jobs=-1,
                                             random_state=0),
             param_grid={'max_depth': [5, 7, 10, 12, 15, None],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 8, 10],
                         'min_samples_split': [3, 5, 8, 9, 10],
                         'n_estimators': [100, 200, 400]})

In [121]:
print(f'The R-squared on test set is {CV_RF3.best_estimator_.score(X_test,y_test)}')
print(f'The R-squared on train set is {CV_RF3.best_estimator_.score(X_train,y_train)}')

0.8960178757338528
0.948404891940828


In [122]:
CV_RF3.best_estimator_.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(max_depth=15, min_samples_leaf=5, min_samples_split=5,
                      n_estimators=200, n_jobs=-1, random_state=0)>

In [55]:
## We can get the R^2 up to .9, but there is worse overfitting.

RF4 = RandomForestRegressor(n_estimators=250, max_depth=10, min_samples_leaf=2, min_samples_split=6, n_jobs=-1, criterion='squared_error', random_state=0)
RF4.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, min_samples_leaf=2, min_samples_split=6,
                      n_estimators=250, n_jobs=-1, random_state=0)

In [62]:
print(f'The R-squared on test set is {RF4.score(X_test,y_test)}')
      
print(f'The R-squared on train set is {RF4.score(X_train,y_train)}')

The R-squared on test set is 0.9004291463966746
The R-squared on train set is 0.9678611241005803
