In [30]:
## We run a bunch of grid searches. At the end, the best model is not taken from them. Model params are given from the grid searches if relevant
## It would have to be ran again to get the CV scores for each of the models (perhaps try a grid search with the best params from each of the grid searches if this
## is to be done)

import sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


## These have to be changed once out of kaggle
# df = pd.read_csv('/kaggle/input/whynot/features_dummified_214col.csv')
# df2 = pd.read_csv('/kaggle/input/whynot/cleaned_housing.csv')

df = pd.read_csv('./data/features_dummified_214col.csv')
df2 = pd.read_csv('./data/cleaned_housing.csv')

In [31]:
RF = RandomForestRegressor()

target = df2['SalePrice']

X_train, X_test, y_train, y_test= train_test_split(df, target, test_size=0.3, random_state = 0)


In [34]:
from sklearn.model_selection import GridSearchCV
param_grid = {'min_samples_split':[2, 5, 7, 10], 
              'min_samples_leaf':[1, 2, 5, 8], 
              'n_estimators':[50, 100, 200], 
              'max_depth':[2, 4, 6],
              'max_features': ['auto', 'log2']}

CV_RF = GridSearchCV(estimator=RF, param_grid=param_grid, cv= 5)
CV_RF.fit(X_train, y_train)

In [9]:
CV_RF.best_estimator_

RandomForestRegressor(max_depth=6, min_samples_leaf=2, min_samples_split=7)

In [10]:
CV_RF.best_score_

0.8627897437813207

In [17]:
best_est = CV_RF.best_estimator_

best_est.score(X_train,y_train)


0.9311257061944435

In [36]:
def grid_fit(model, params, cv):
    
    grid = GridSearchCV(model, params, cv=cv, return_train_score = True)
    ans = grid.fit(X_train, y_train)
    model = ans.best_estimator_
    return model.fit(X_train, y_train)
    
def find_coefs(model):
    coefs = {}
    
    for idx, coef in enumerate(model.feature_importances_):
        if coef == 0:
            continue
        else:
            coefs[df.columns[idx]] = coef

    sig_coefs = [feature[:feature.find('_')] if '_' in feature 
                 else feature for feature in coefs.keys()]
      
#     print(f'{model}, {len(set(sig_coefs))}, {round(model.score(X_train, Y_train), 3)}, {round(model.score(X_test, Y_test), 3)}')
    print(f'{model}, \n\
    features: {len(set(sig_coefs))}, \n\
    train score: {model.score(X_train, y_train)}, \n\
    test score: {model.score(X_test, y_test)} \n\
    coefs: {sig_coefs}')

In [37]:
param_grid = {'min_samples_split':[2, 5, 7, 10], 
              'min_samples_leaf':[1, 2, 5, 8], 
              'n_estimators':[50, 100, 200], 
              'max_depth':[2, 4, 6],
              'max_features': ['auto', 'log2']}

forest = grid_fit(RF, param_grid, 5)

In [28]:
find_coefs(forest)

IndexError: index 214 is out of bounds for axis 0 with size 214

In [29]:
forest.feature_importances_

array([8.44789809e-04, 1.15094763e-03, 7.48765258e-05, 5.95606440e-01,
       1.32326885e-03, 5.87507478e-03, 2.01721328e-04, 3.92431510e-03,
       2.54367741e-05, 6.59317396e-04, 6.44228130e-04, 1.66598878e-03,
       0.00000000e+00, 7.15887730e-04, 1.14626069e-02, 4.69192948e-04,
       3.34436859e-04, 3.92372482e-05, 8.76661314e-03, 5.44512449e-05,
       2.57663989e-03, 3.14515287e-03, 7.90769086e-05, 2.94475524e-04,
       2.98501220e-05, 1.46595930e-01, 1.34315128e-03, 9.35127411e-03,
       7.99321694e-03, 1.98207777e-03, 2.98575129e-03, 3.83689606e-02,
       3.47235961e-04, 1.51087137e-03, 8.78832776e-02, 2.58574100e-03,
       2.62081125e-02, 2.23754645e-03, 2.99990316e-03, 9.86893619e-05,
       0.00000000e+00, 3.10473255e-04, 0.00000000e+00, 4.79581469e-03,
       4.02453338e-04, 5.32462943e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 3.35255922e-04, 3.63329041e-06, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [40]:
rf_feats = [8.44789809e-04, 1.15094763e-03, 7.48765258e-05, 5.95606440e-01,
       1.32326885e-03, 5.87507478e-03, 2.01721328e-04, 3.92431510e-03,
       2.54367741e-05, 6.59317396e-04, 6.44228130e-04, 1.66598878e-03,
       0.00000000e+00, 7.15887730e-04, 1.14626069e-02, 4.69192948e-04,
       3.34436859e-04, 3.92372482e-05, 8.76661314e-03, 5.44512449e-05,
       2.57663989e-03, 3.14515287e-03, 7.90769086e-05, 2.94475524e-04,
       2.98501220e-05, 1.46595930e-01, 1.34315128e-03, 9.35127411e-03,
       7.99321694e-03, 1.98207777e-03, 2.98575129e-03, 3.83689606e-02,
       3.47235961e-04, 1.51087137e-03, 8.78832776e-02, 2.58574100e-03,
       2.62081125e-02, 2.23754645e-03, 2.99990316e-03, 9.86893619e-05,
       0.00000000e+00, 3.10473255e-04, 0.00000000e+00, 4.79581469e-03,
       4.02453338e-04, 5.32462943e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 3.35255922e-04, 3.63329041e-06, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 3.98919427e-06, 0.00000000e+00,
       0.00000000e+00, 1.28707029e-03, 0.00000000e+00, 1.33380871e-05,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 5.88221752e-06, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.01726812e-05,
       0.00000000e+00, 2.81371404e-05, 0.00000000e+00, 5.36941222e-04,
       0.00000000e+00, 1.56600855e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 2.93039856e-05, 4.23796545e-05, 6.66619685e-05,
       4.99425514e-05, 0.00000000e+00, 0.00000000e+00, 1.26219248e-05,
       4.28775202e-05, 4.86009099e-04, 0.00000000e+00, 0.00000000e+00,
       1.06718767e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       5.67551873e-05, 0.00000000e+00, 1.05564694e-05, 1.50774423e-05,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 5.20178172e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       3.41864194e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 6.05811983e-06, 3.32055522e-05, 0.00000000e+00,
       0.00000000e+00, 7.38827261e-05, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.87080135e-05,
       1.53374188e-05, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.57215406e-04,
       3.17306707e-04, 5.39410459e-05, 8.03688312e-05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.11224114e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 5.33669826e-06, 0.00000000e+00, 6.04046681e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.07947081e-04,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.38722475e-04,
       2.89634289e-03, 8.04423880e-03, 2.39236610e-03]

In [43]:
features = []

for i in rf_feats:
    if i < 1e-4:
        continue
    else:
        features.append(i)
        
len(features)

50

In [16]:
CV_RF.best_params_

{'max_depth': 6,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 7,
 'n_estimators': 100}

In [18]:
param_grid = {'min_samples_split':[6, 7, 8, 9, 10], 
              'min_samples_leaf':[5, 8, 10], 
              'n_estimators':[200, 400], 
              'max_depth':[5, 7, 10, 12],
              'max_features': ['auto', 'log2']}

CV_RF2 = GridSearchCV(estimator=RF, param_grid=param_grid, cv= 10)
CV_RF2.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [5, 7, 10, 12],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 8, 10],
                         'min_samples_split': [6, 7, 8, 9, 10],
                         'n_estimators': [200, 400]})

In [110]:
CV_RF2.best_estimator_.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(max_depth=12, min_samples_leaf=5, min_samples_split=8,
                      n_estimators=400)>

In [38]:
CV_RF2.best_estimator_.score(X_test,y_test)

0.8965356786983262

In [114]:
RF3 = RandomForestRegressor( n_jobs=-1, random_state=0)

RandomForestRegressor(min_samples_leaf=3, min_samples_split=10, n_jobs=-1,
                      random_state=0)

In [116]:
param_grid = {'min_samples_split':[3, 5, 8, 9, 10], 
              'min_samples_leaf':[5, 8, 10], 
              'n_estimators':[100, 200, 400], 
              'max_depth':[5, 7, 10, 12, 15, None],
              'max_features': ['auto', 'sqrt', 'log2']}

CV_RF3 = GridSearchCV(estimator=RF3, param_grid=param_grid, cv= 10)
CV_RF3.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=RandomForestRegressor(min_samples_leaf=3,
                                             min_samples_split=10, n_jobs=-1,
                                             random_state=0),
             param_grid={'max_depth': [5, 7, 10, 12, 15, None],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [5, 8, 10],
                         'min_samples_split': [3, 5, 8, 9, 10],
                         'n_estimators': [100, 200, 400]})

In [121]:
print(f'The R-squared on test set is {CV_RF3.best_estimator_.score(X_test,y_test)}')
print(f'The R-squared on train set is {CV_RF3.best_estimator_.score(X_train,y_train)}')

0.8960178757338528
0.948404891940828


In [122]:
CV_RF3.best_estimator_.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(max_depth=15, min_samples_leaf=5, min_samples_split=5,
                      n_estimators=200, n_jobs=-1, random_state=0)>

In [55]:
## We can get the R^2 up to .9, but there is worse overfitting.

RF4 = RandomForestRegressor(n_estimators=250, max_depth=10, min_samples_leaf=2, min_samples_split=6, n_jobs=-1, criterion='squared_error', random_state=0)
RF4.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, min_samples_leaf=2, min_samples_split=6,
                      n_estimators=250, n_jobs=-1, random_state=0)

In [62]:
print(f'The R-squared on test set is {RF4.score(X_test,y_test)}')
      
print(f'The R-squared on train set is {RF4.score(X_train,y_train)}')

The R-squared on test set is 0.9004291463966746
The R-squared on train set is 0.9678611241005803
