# Contents
- [Imports](#imports)
- [Modelling](#model)
- [Holdout Testing](#holdout)

---
# Imports<a id=imports></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from inspect import signature
from itertools import cycle
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, make_scorer, roc_curve, classification_report, precision_recall_curve, roc_auc_score

import xgboost as xgb
from xgboost import XGBClassifier

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


We load our data from our feature engineered csv.

In [2]:
data=pd.read_csv(r'.\data\feateng.csv')

---
# Modelling<a id=model></a>

We split our data into our X and y data.

In [3]:
X=data[[x for x in data.columns if x != 'move']]
y=data['move']

And we use SMOTE to upsample our minority classes to ensure an even distribution of our classes.

In [4]:
len(data)

9764

In [5]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

In [6]:
print(str(len(X_res))+' '+str(len(y_res)))

21072 21072


We perform our train-test-split on our X and y data, stratifying on our classes.

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X_res,y_res,random_state=42,stratify=y_res)

And we proceed with our modelling in the same manner as before.

In [8]:
#code found from http://www.davidsbatista.net/blog/2018/02/23/model_optimization/
class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=-1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = []
        for k in self.grid_searches:
            print(k)
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]        
                scores.append(r.reshape(len(params),1))

            all_scores = np.hstack(scores)
            for p, s in zip(params,all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [9]:
classifier_models = {
    'LogisticRegression' : LogisticRegression(random_state = 42),
    'KNN': KNeighborsClassifier(), 
#     'NaiveBayes' : MultinomialNB(),
    'DecisionTree' : DecisionTreeClassifier(random_state = 42), 
    'BaggedDecisionTree' : BaggingClassifier(random_state = 42),
    'RandomForest' : RandomForestClassifier(random_state = 42), 
    'ExtraTrees' : ExtraTreesClassifier(random_state = 42), 
    'AdaBoost' : AdaBoostClassifier(random_state=42), 
    'GradientBoosting' : GradientBoostingClassifier(random_state = 42),
    'SVM' : SVC(random_state=42),
    'XGBoost' : XGBClassifier(random_state=42)
}

In [10]:
classifier_model_params = {
    'LogisticRegression' : {
        'penalty' : ['l1', 'l2'],
        'C' : np.arange(.05, 1, .05) },
    'KNN' : {
        'n_neighbors' : np.arange(3, 22, 2) },
    'NaiveBayes' : {
        'alpha' : np.arange(.05, 2, .05)},
    'DecisionTree': {
        'max_depth' : [None, 6, 10, 14], 
        'min_samples_leaf' : [1, 2],
        'min_samples_split': [2, 3] },
    'BaggedDecisionTree' : {
        'n_estimators' : [20, 60, 100] },
    'RandomForest' : {
        'n_estimators' : [20, 60, 100],
        'max_depth' : [None, 2, 6, 10],
        'min_samples_split' : [2, 3, 4] },
    'ExtraTrees' : {
        'n_estimators' : [20, 60, 100],
        'max_depth' : [None, 6, 10, 14],
        'min_samples_leaf' : [1, 2], 
        'min_samples_split' : [2, 3], },
    'AdaBoost' : {
        'n_estimators' : np.arange(100, 151, 25),
        'learning_rate' : np.linspace(0.05, 1, 10) },
    'GradientBoosting' : {
        'n_estimators' : np.arange(5, 150, 15),
        'learning_rate' : np.linspace(0.05, 1, 10),
        'max_depth' : [1, 2, 3] },
    'SVM' : {
        'C' : np.arange(0.05, 1, .05),
        'kernel' : ['rbf', 'linear'] },
    'XGBoost' : {
        'n_estimators'  : np.arange(100, 151, 25), 
        'learning_rate' : np.arange(0.1, 1, .3),
        'max_depth' : [3],
        'alpha' : np.arange(0, 1, .3),
        'lambda' : np.arange(0, 1, .3),
        'gamma' : np.arange(0, 1, .3),
        'subsample' : [.5],
        'n_jobs' : [4],}
        }

In [11]:
search = EstimatorSelectionHelper(classifier_models, classifier_model_params)
search.fit(X_train, y_train, scoring='accuracy', n_jobs=-1)

Running GridSearchCV for LogisticRegression.
Fitting 3 folds for each of 38 candidates, totalling 114 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   15.4s
[Parallel(n_jobs=-1)]: Done 114 out of 114 | elapsed:  1.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for KNN.
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for DecisionTree.
Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for BaggedDecisionTree.
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    6.9s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   12.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for RandomForest.
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:   17.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for ExtraTrees.
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:   13.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for AdaBoost.
Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   57.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for GradientBoosting.
Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 17.1min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed: 19.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for SVM.
Fitting 3 folds for each of 38 candidates, totalling 114 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 114 out of 114 | elapsed:  4.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Running GridSearchCV for XGBoost.
Fitting 3 folds for each of 576 candidates, totalling 1728 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed: 40.8min
[Parallel(n_jobs=-1)]: Done 1728 out of 1728 | elapsed: 56.9min finished


---
# Holdout Testing<a id=holdout></a>
We create the table of our GridSearch scores and parameters.

In [12]:
score1=search.score_summary(sort_by='mean_score')

LogisticRegression
KNN
DecisionTree
BaggedDecisionTree
RandomForest
ExtraTrees
AdaBoost
GradientBoosting
SVM
XGBoost


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




And create the pivot table which gives us the best scores and the parameters for each of our estimators.

In [13]:
table=pd.pivot_table(score1,index='estimator',aggfunc='max').sort_values('mean_score',ascending=False)
table

Unnamed: 0_level_0,C,alpha,gamma,kernel,lambda,learning_rate,max_depth,max_score,mean_score,min_samples_leaf,min_samples_split,min_score,n_estimators,n_jobs,n_neighbors,penalty,std_score,subsample
estimator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ExtraTrees,,,,,,,14.0,0.998102,0.997216,2.0,3.0,0.996963,100.0,,,,0.005785,
XGBoost,,0.9,0.9,,0.9,0.7,3.0,0.997153,0.996014,,,0.994685,150.0,4.0,,,0.001591,0.5
GradientBoosting,,,,,,1.0,3.0,0.996393,0.995634,,,0.994875,140.0,,,,0.027797,
KNN,,,,,,,,0.995634,0.995191,,,0.994685,,,21.0,,0.001886,
RandomForest,,,,,,,10.0,0.996203,0.995001,,4.0,0.993356,100.0,,,,0.008713,
BaggedDecisionTree,,,,,,,,0.990319,0.988547,,,0.985763,100.0,,,,0.002757,
DecisionTree,,,,,,,14.0,0.981587,0.979879,2.0,3.0,0.977031,,,,,0.017613,
LogisticRegression,0.95,,,,,,,0.906036,0.900658,,,0.894267,,,,l2,0.006514,
SVM,0.95,,,rbf,,,,0.897874,0.892875,,,0.888952,,,,,0.004744,
AdaBoost,,,,,,1.0,,0.823083,0.808719,,,0.799734,150.0,,,,0.044687,


We export these parameters to another external csv file for future reference if needed.

In [14]:
table.to_csv(r'.\data\model2params.csv')

In [15]:
table=pd.read_csv(r'.\data\model2params.csv',index_col='estimator')
table

Unnamed: 0_level_0,C,alpha,gamma,kernel,lambda,learning_rate,max_depth,max_score,mean_score,min_samples_leaf,min_samples_split,min_score,n_estimators,n_jobs,n_neighbors,penalty,std_score,subsample
estimator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
ExtraTrees,,,,,,,14.0,0.998102,0.997216,2.0,3.0,0.996963,100.0,,,,0.005785,
XGBoost,,0.9,0.9,,0.9,0.7,3.0,0.997153,0.996014,,,0.994685,150.0,4.0,,,0.001591,0.5
GradientBoosting,,,,,,1.0,3.0,0.996393,0.995634,,,0.994875,140.0,,,,0.027797,
KNN,,,,,,,,0.995634,0.995191,,,0.994685,,,21.0,,0.001886,
RandomForest,,,,,,,10.0,0.996203,0.995001,,4.0,0.993356,100.0,,,,0.008713,
BaggedDecisionTree,,,,,,,,0.990319,0.988547,,,0.985763,100.0,,,,0.002757,
DecisionTree,,,,,,,14.0,0.981587,0.979879,2.0,3.0,0.977031,,,,,0.017613,
LogisticRegression,0.95,,,,,,,0.906036,0.900658,,,0.894267,,,,l2,0.006514,
SVM,0.95,,,rbf,,,,0.897874,0.892875,,,0.888952,,,,,0.004744,
AdaBoost,,,,,,1.0,,0.823083,0.808719,,,0.799734,150.0,,,,0.044687,


We define the estimators we will use for our holdout testing based on the parameters that gave us the best accuracy stores in our GridSearch.

In [16]:
estimators = {
    'XGB' : XGBClassifier(reg_alpha=table.loc['XGBoost']['alpha'],reg_lambda=table.loc['XGBoost']['lambda'],random_state = 42, gamma=table.loc['XGBoost']['gamma'],learning_rate=table.loc['XGBoost']['learning_rate'],max_depth=int(table.loc['XGBoost']['max_depth']),n_estimators=int(table.loc['XGBoost']['n_estimators']),subsample=table.loc['XGBoost']['subsample']),
    'ADA' : AdaBoostClassifier(random_state = 42,learning_rate=table.loc['AdaBoost']['learning_rate'],n_estimators=int(table.loc['AdaBoost']['n_estimators'])),
    'GBoost' : GradientBoostingClassifier(random_state = 42,learning_rate=table.loc['GradientBoosting']['learning_rate'],max_depth=int(table.loc['GradientBoosting']['max_depth']),n_estimators=int(table.loc['GradientBoosting']['n_estimators'])),
    'SVC' : SVC(C=table.loc['SVM']['C'],random_state = 42,kernel=table.loc['SVM']['kernel'],probability=True,gamma='auto'),
    'DecisionTree' : DecisionTreeClassifier(random_state = 42,max_depth=int(table.loc['DecisionTree']['max_depth']),min_samples_leaf=int(table.loc['DecisionTree']['min_samples_leaf']),min_samples_split=int(table.loc['DecisionTree']['min_samples_split'])),
    'knn' : KNeighborsClassifier(n_neighbors=int(table.loc['KNN']['n_neighbors'])),
    'random forest' : RandomForestClassifier(random_state = 42,max_depth=int(table.loc['RandomForest']['max_depth']),min_samples_split=int(table.loc['RandomForest']['min_samples_split']),n_estimators=int(table.loc['RandomForest']['n_estimators'])),
    'ef' : ExtraTreesClassifier(random_state = 42,max_depth=int(table.loc['ExtraTrees']['max_depth']),min_samples_leaf=int(table.loc['ExtraTrees']['min_samples_leaf']),min_samples_split=int(table.loc['ExtraTrees']['min_samples_split']),n_estimators=int(table.loc['ExtraTrees']['n_estimators'])),
    'lr with reg' : LogisticRegression(random_state = 42,penalty=table.loc['LogisticRegression']['penalty'],C=table.loc['LogisticRegression']['C']),
    'bagging classifier' : BaggingClassifier(random_state = 42,n_estimators=int(table.loc['BaggedDecisionTree']['n_estimators'])),
#     'NaiveBayes' : MultinomialNB(alpha=table.loc['NaiveBayes']['alpha'])
}

And we perform our holdout testing for the previously defined estimators.

In [17]:
scores = []

for e_name,estimator in estimators.items():
    estimator.fit(X_train,y_train)
    pred=estimator.predict(X_test)
    fpr, tpr, _ = roc_curve(y_test, estimator.predict_proba(X_test)[:,1],pos_label=1)
    precision, recall, _ = precision_recall_curve(y_test, estimator.predict_proba(X_test)[:,1],pos_label=1)
    score = {'est':e_name,
             'roc_curve':[fpr, tpr],
             'prc':[precision,recall]}
    scores.append(score)
    print('Results for ',e_name,':')
    print(classification_report(y_test,pred))


Results for  XGB :
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       878
           1       1.00      1.00      1.00       878
           2       0.99      1.00      1.00       878
           3       0.99      0.99      0.99       878
           4       0.99      0.99      0.99       878
           5       1.00      0.99      0.99       878

    accuracy                           0.99      5268
   macro avg       0.99      0.99      0.99      5268
weighted avg       0.99      0.99      0.99      5268

Results for  ADA :
              precision    recall  f1-score   support

           0       0.75      0.50      0.60       878
           1       0.94      0.97      0.95       878
           2       0.82      0.57      0.67       878
           3       0.74      0.24      0.37       878
           4       0.81      0.41      0.55       878
           5       0.35      0.97      0.52       878

    accuracy                           



Results for  lr with reg :
              precision    recall  f1-score   support

           0       0.78      0.63      0.70       878
           1       0.78      1.00      0.88       878
           2       0.79      0.88      0.83       878
           3       0.82      0.83      0.82       878
           4       0.93      0.81      0.87       878
           5       0.90      0.83      0.86       878

    accuracy                           0.83      5268
   macro avg       0.83      0.83      0.83      5268
weighted avg       0.83      0.83      0.83      5268

Results for  bagging classifier :
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       878
           1       0.99      0.99      0.99       878
           2       0.99      1.00      0.99       878
           3       0.99      0.99      0.99       878
           4       0.99      0.99      0.99       878
           5       0.99      0.99      0.99       878

    accuracy    

Similar to our first-round of modelling, our models seem to perform admirably.<br/>
XGBoost,Gradient Boosting, Random Forest, KNN, Extra Trees and Bagging Classifier models give us close to perfect accuracy and the other models also perform well with the exception of the AdaBoost model again.<br/>
However, as our first round of modelling and testing have shown, holdout test results can be vastly different from the results of actual testing with a separate dataset.<br/><br/>
Now that we have fitted all our best models, we shall pickle the models for testing purposes later on.

In [18]:
for e_name, estimator in estimators.items():
    pickle.dump(estimator,open('./data/model2/'+str(e_name)+'.sav', 'wb'))

And we test if our pickled files are working correctly.

In [19]:
model='XGB'

In [20]:
mod=pickle.load(open(r'.//data/model2/'+model+'.sav', 'rb'))

In [21]:
result=mod.predict(X_test)

In [22]:
compare=estimators['XGB'].predict(X_test)

In [23]:
for i in range(len(result)):
    if result[i]==compare[i]:
        continue
    else:
        print('mismatch')
print('done')

done


It seems that our pickling was successful. We shall move on to testing our models.