In [1]:
import pandas as pd
df = pd.read_json('data/training_dataset.json')

In [3]:
from src.dictionaries import *

import io
import numpy as np

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression as LRC
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import SVC
import xgboost as XGB

import seaborn as sns

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import r2_score

In [4]:
X = df.loc[:,features].values
y_recovery = df.loc[:,['recovery']].values * 1
y_delta = df.loc[:,['delta']].values
y_recovery = y_recovery.reshape(len(y_recovery),)
y_delta = y_delta.reshape(len(y_recovery),)
X_train, X_test, y_train, y_test = train_test_split(X, y_recovery, test_size=.2, stratify = y_recovery)
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
def gridsearch_with_output(estimator, parameter_grid, X_train, y_train):
    '''
        Parameters: estimator: the type of model (e.g. RandomForestRegressor())
                    paramter_grid: dictionary defining the gridsearch parameters
                    X_train: 2d numpy array
                    y_train: 1d numpy array
        Returns:  best parameters and model fit with those parameters
    '''
    model_gridsearch = GridSearchCV(estimator,
                                    parameter_grid,
                                    n_jobs=-1,
                                    verbose=True,
                                    scoring='f1')
    model_gridsearch.fit(X_train, y_train)
    best_params = model_gridsearch.best_params_ 
    model_best = model_gridsearch.best_estimator_
    print("\nModel:")
    print(estimator)
    print("\nparameters:")
    print(hyperparameters)    
    print("\nResult of gridsearch:")
    print("{0:<20s} | {1:<8s} | {2}".format("Parameter", "Optimal", "Gridsearch values"))
    print("-" * 55)
    for param, vals in parameter_grid.items():
        print("{0:<20s} | {1:<8s} | {2}".format(str(param), 
                                                str(best_params[param]),
                                                str(vals)))
    return best_params, model_best

In [9]:
estimator = RFC(random_state=7)
hyperparameters = { 'criterion': ['gini', 'entropy'],
                   'max_depth': [None, 5, 10],
                   'max_features': ['auto', 'sqrt', 'log2'],
                   'bootstrap': [True, False],
                   'oob_score': [True, False]}
gridsearch_with_output(estimator, hyperparameters, X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  9.0min finished



Model:
RandomForestClassifier(random_state=7)

parameters:
{'criterion': ['gini', 'entropy'], 'max_depth': [None, 5, 10], 'max_features': ['auto', 'sqrt', 'log2'], 'bootstrap': [True, False], 'oob_score': [True, False]}

Result of gridsearch:
Parameter            | Optimal  | Gridsearch values
-------------------------------------------------------
criterion            | entropy  | ['gini', 'entropy']
max_depth            | None     | [None, 5, 10]
max_features         | auto     | ['auto', 'sqrt', 'log2']
bootstrap            | False    | [True, False]
oob_score            | False    | [True, False]


({'bootstrap': False,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 'auto',
  'oob_score': False},
 RandomForestClassifier(bootstrap=False, criterion='entropy', random_state=7))

In [10]:
estimator = RFC(random_state=7)
hyperparameters = { 'n_estimators': [100,200,500],
                      'criterion': ['entropy'],
                   'max_depth': [None],
                   'min_samples_split':[2, 5],
                   'min_weight_fraction_leaf': [0],
                   'max_features': ['auto'],
                   'bootstrap': [False],
                   'oob_score': [False]}
gridsearch_with_output(estimator, hyperparameters, X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.8min finished



Model:
RandomForestClassifier(random_state=7)

parameters:
{'n_estimators': [100, 200, 500], 'criterion': ['entropy'], 'max_depth': [None], 'min_samples_split': [2, 5], 'min_weight_fraction_leaf': [0], 'max_features': ['auto'], 'bootstrap': [False], 'oob_score': [False]}

Result of gridsearch:
Parameter            | Optimal  | Gridsearch values
-------------------------------------------------------
n_estimators         | 500      | [100, 200, 500]
criterion            | entropy  | ['entropy']
max_depth            | None     | [None]
min_samples_split    | 5        | [2, 5]
min_weight_fraction_leaf | 0        | [0]
max_features         | auto     | ['auto']
bootstrap            | False    | [False]
oob_score            | False    | [False]


({'bootstrap': False,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 'auto',
  'min_samples_split': 5,
  'min_weight_fraction_leaf': 0,
  'n_estimators': 500,
  'oob_score': False},
 RandomForestClassifier(bootstrap=False, criterion='entropy',
                        min_samples_split=5, min_weight_fraction_leaf=0,
                        n_estimators=500, random_state=7))

In [11]:
estimator = RFC(random_state=7)
hyperparameters = { 'n_estimators': [500, 1000, 2000],
                      'criterion': ['entropy'],
                   'max_depth': [None],
                   'min_samples_split':[5, 10],
                   'min_weight_fraction_leaf': [0, 1, 5],
                   'max_features': ['auto'],
                   'bootstrap': [False],
                   'oob_score': [False]}
gridsearch_with_output(estimator, hyperparameters, X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 16.6min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed: 54.8min finished



Model:
RandomForestClassifier(random_state=7)

parameters:
{'n_estimators': [500, 1000, 2000], 'criterion': ['entropy'], 'max_depth': [None], 'min_samples_split': [5, 10], 'min_weight_fraction_leaf': [0, 1, 5], 'max_features': ['auto'], 'bootstrap': [False], 'oob_score': [False]}

Result of gridsearch:
Parameter            | Optimal  | Gridsearch values
-------------------------------------------------------
n_estimators         | 500      | [500, 1000, 2000]
criterion            | entropy  | ['entropy']
max_depth            | None     | [None]
min_samples_split    | 5        | [5, 10]
min_weight_fraction_leaf | 0        | [0, 1, 5]
max_features         | auto     | ['auto']
bootstrap            | False    | [False]
oob_score            | False    | [False]


({'bootstrap': False,
  'criterion': 'entropy',
  'max_depth': None,
  'max_features': 'auto',
  'min_samples_split': 5,
  'min_weight_fraction_leaf': 0,
  'n_estimators': 500,
  'oob_score': False},
 RandomForestClassifier(bootstrap=False, criterion='entropy',
                        min_samples_split=5, min_weight_fraction_leaf=0,
                        n_estimators=500, random_state=7))