# Model Benchmarking

This notebook will perform exploration via model and model hyperparameter searching of possible shallow models using SKLearn built in algorithms.

I use data constructed by Feature_Engineering.ipynb and model searching functionality from SKL_search.py 

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_context('talk')
import matplotlib.pyplot as plt
import warnings
from SKL_search import *
warnings.filterwarnings("ignore")

In [4]:
#untie local dependency
import os
cwd = os.getcwd().split('\\')
local_repo_path = '\\'.join(cwd[:-1])
raw_data_path = local_repo_path + r'\data\raw'
processed_data_path = local_repo_path + '\data\processed'
feature_path = local_repo_path + r'\\data\\features'

In [7]:
sj_feature_train = pd.read_csv(feature_path + r'\\sj_train.csv', index_col=0)
sj_feature_test = pd.read_csv(feature_path + r'\\sj_test.csv', index_col=0)
iq_feature_train = pd.read_csv(feature_path + r'\\iq_train.csv', index_col=0)
iq_feature_test = pd.read_csv(feature_path + r'\\iq_test.csv', index_col=0)

In [53]:
from sklearn.model_selection import train_test_split
X_tr_sj, X_val_sj, Y_tr_sj, Y_val_sj = train_test_split(sj_feature_train.drop('total_cases', axis = 1), sj_feature_train['total_cases'], test_size = 0.05, random_state = 4)
X_tr_iq, X_val_iq, Y_tr_iq, Y_val_iq = train_test_split(iq_feature_train.drop('total_cases', axis = 1), iq_feature_train['total_cases'], test_size = 0.05, random_state = 4)

In [19]:
from sklearn.ensemble import (ExtraTreesRegressor, RandomForestRegressor, 
                              AdaBoostRegressor, GradientBoostingRegressor)
from sklearn.svm import SVR
from sklearn import linear_model

In [34]:
models1 = { 
    'RandomForestRegressor': RandomForestRegressor(),
    #'AdaBoostRegressor': AdaBoostRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'SVR': SVR(),
    'LinearRegression': linear_model.LinearRegression()
}

params1 = {  
    'RandomForestRegressor': { 'n_estimators': [50, 100], 'max_depth':[5,10]},
    #'AdaBoostRegressor':  { 'base_estimator':[RandomForestClassifier(min_samples_leaf = 10)], 'n_estimators': [10, 20]},
    'GradientBoostingRegressor': { 'n_estimators': [100,200] },
    'SVR': [
        {'kernel': ['linear'], 'C': [2.5,3], 'degree':[2,3]}],
    'LinearRegression': {'n_jobs':[-1]}
    
}

In [35]:
helper = EstimatorSelectionHelper(models1, params1)
helper.fit(X_tr_sj, Y_tr_sj, scoring='neg_mean_absolute_error', cv = 2)

Running GridSearchCV for RandomForestRegressor.
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.6s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.9s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.1s finished


Running GridSearchCV for GradientBoostingRegressor.
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.8s finished


Running GridSearchCV for SVR.
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.5s remaining:    2.5s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.9s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.9s finished


Running GridSearchCV for LinearRegression.
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.0s finished


In [36]:
helper.score_summary(sort_by='min_score')

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,degree,kernel,max_depth,n_estimators,n_jobs
6,SVR,-21.9086,-21.8675,-21.8264,0.0410927,2.5,2.0,linear,,,
7,SVR,-21.9086,-21.8675,-21.8264,0.0410927,2.5,3.0,linear,,,
8,SVR,-21.9086,-21.868,-21.8273,0.0406622,3.0,2.0,linear,,,
9,SVR,-21.9086,-21.868,-21.8273,0.0406622,3.0,3.0,linear,,,
0,RandomForestRegressor,-25.2366,-25.1818,-25.127,0.0547743,,,,5.0,50.0,
10,LinearRegression,-25.847,-25.0087,-24.1704,0.838326,,,,,,-1.0
1,RandomForestRegressor,-25.8545,-25.669,-25.4834,0.185548,,,,5.0,100.0,
2,RandomForestRegressor,-26.3656,-26.0322,-25.6988,0.333366,,,,10.0,50.0,
3,RandomForestRegressor,-26.4745,-26.3202,-26.166,0.154276,,,,10.0,100.0,
4,GradientBoostingRegressor,-27.1781,-27.1236,-27.0691,0.0544902,,,,,100.0,


In [54]:
helper = EstimatorSelectionHelper(models1, params1)
helper.fit(X_tr_iq, Y_tr_iq, scoring='neg_mean_absolute_error', cv = 2)

Running GridSearchCV for RandomForestRegressor.
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.5s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.9s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    2.1s finished


Running GridSearchCV for GradientBoostingRegressor.
Fitting 2 folds for each of 2 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Done   2 out of   4 | elapsed:    1.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:    1.8s finished


Running GridSearchCV for SVR.
Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    1.4s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:    1.8s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    1.9s finished


Running GridSearchCV for LinearRegression.
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.0s finished


In [55]:
helper.score_summary(sort_by='min_score')

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,degree,kernel,max_depth,n_estimators,n_jobs
8,SVR,-6.5057,-5.99921,-5.49273,0.506485,3.0,2.0,linear,,,
9,SVR,-6.5057,-5.99921,-5.49273,0.506485,3.0,3.0,linear,,,
6,SVR,-6.50856,-6.00128,-5.49401,0.507278,2.5,2.0,linear,,,
7,SVR,-6.50856,-6.00128,-5.49401,0.507278,2.5,3.0,linear,,,
0,RandomForestRegressor,-7.15491,-7.01876,-6.88262,0.136147,,,,5.0,50.0,
1,RandomForestRegressor,-7.16146,-6.99621,-6.83095,0.165257,,,,5.0,100.0,
3,RandomForestRegressor,-7.20846,-7.15061,-7.09276,0.0578492,,,,10.0,100.0,
10,LinearRegression,-7.32241,-6.5558,-5.78918,0.766616,,,,,,-1.0
2,RandomForestRegressor,-7.37733,-7.17407,-6.97082,0.203259,,,,10.0,50.0,
4,GradientBoostingRegressor,-7.76412,-7.54025,-7.31637,0.223875,,,,,100.0,


In [56]:
iq_feature_test.shape

(156, 10)

In [57]:
sj_feature_test.shape

(260, 10)

In [58]:
260/416

0.625

In [69]:
clf = SVR()
clf.fit(X_tr_sj, Y_tr_sj)

sj_preds = clf.predict(sj_feature_test.drop(['city', 'year'], axis = 1))

sj_sub = pd.DataFrame(sj_feature_test[['city', 'year', 'weekofyear']])
sj_sub['total_cases'] = sj_preds

sj_sub