## 1. Library Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, cross_val_predict

## 2. Load Data

In [2]:
df_train = pd.read_csv('train_iter_7.csv',
                      index_col = 0)
df_test = pd.read_csv('test_iter_7.csv',
                      index_col = 0)

X_train = df_train.drop('Survived',
                       axis = 1,)
Y_train = df_train.loc[:,'Survived']
X_test = df_test

df_best_result = pd.read_csv('../Iteration_6_Family/best_result_6.csv',
                             index_col = 0)

## 3. Model

### 3.1. Logistic Regression

In [3]:
#define the model
model = LogisticRegression()

#define the parameters to search
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
class_weight = [None, 'balanced']

#define grid search
grid = dict(solver = solvers,
           penalty = penalty,
           C = c_values,
           class_weight = class_weight)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Best: 0.812930 using {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}


In [4]:
df_logistic_results = pd.DataFrame(data = results_dict)
df_logistic_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_logistic_results.reset_index(drop = True,
                               inplace = True)
df_logistic_results.head(10)

Unnamed: 0,means,stds,params
0,0.81293,0.043776,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
1,0.811057,0.044428,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
2,0.811057,0.044428,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
3,0.811057,0.044428,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
4,0.811057,0.044428,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
5,0.810674,0.0372,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
6,0.810674,0.0372,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
7,0.810674,0.0372,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
8,0.810674,0.0372,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
9,0.810674,0.0372,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."


In [5]:
df_best_result_LR = pd.DataFrame(df_logistic_results.iloc[0,:]).T
df_best_result_LR['Model'] = "Logistic_Regression"
df_best_result_LR

Unnamed: 0,means,stds,params,Model
0,0.81293,0.043776,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression


In [6]:
model = LogisticRegression(solver = 'liblinear',
                           penalty = 'l2',
                           C = 0.1,
                           class_weight = None)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,487,62
1,105,237


In [7]:
df_best_result_LR['Recall'] = model_result['test_recall'].mean()
df_best_result_LR['F1_Score'] = model_result['test_f1'].mean()
df_best_result_LR['Iteration'] = 7
df_best_result_LR['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_LR],
         axis = 0)

In [8]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.2. Support Vector Machines

In [12]:
#define the model
model = SVC()

#define the parameters to search
kernel = ['poly','rbf']#['linear','poly','rbf','sigmoid']
c_values = [10,1.0]#[100, 10, 1.0, 0.1, 0.01]
gamma = ['scale', 0.1]

#define grid search
grid = dict(kernel = kernel,
           C = c_values,
           gamma = gamma)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 8 candidates, totalling 240 fits
Best: 0.821910 using {'C': 1.0, 'gamma': 0.1, 'kernel': 'poly'}


In [13]:
df_SVC_results = pd.DataFrame(data = results_dict)
df_SVC_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_SVC_results.reset_index(drop = True,
                               inplace = True)
df_SVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.82191,0.038503,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'poly'}"
1,0.819297,0.036174,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}"
2,0.818922,0.03529,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}"
3,0.817424,0.036833,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}"
4,0.815156,0.039098,"{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}"
5,0.814411,0.039525,"{'C': 10, 'gamma': 0.1, 'kernel': 'poly'}"
6,0.814395,0.039552,"{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}"
7,0.812905,0.039295,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"


In [14]:
df_best_result_SVM = pd.DataFrame(df_SVC_results.iloc[0,:]).T
df_best_result_SVM['Model'] = "SVC"
df_best_result_SVM

Unnamed: 0,means,stds,params,Model
0,0.82191,0.038503,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'poly'}",SVC


In [15]:
model = SVC(kernel = 'poly',
           C = 1.0,
           gamma = 0.1).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,502,47
1,113,229


In [16]:
df_best_result_SVM['Recall'] = model_result['test_recall'].mean()
df_best_result_SVM['F1_Score'] = model_result['test_f1'].mean()
df_best_result_SVM['Iteration'] = 7
df_best_result_SVM['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_SVM],
         axis = 0)

In [17]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.3. Naive Bayes Classifiers

In [18]:
#define the model
model = GaussianNB()

#define the parameters to search
var_smoothing = np.logspace(0, -9, num = 50) 

#define grid search
grid = dict(var_smoothing = var_smoothing)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.713038 using {'var_smoothing': 0.03393221771895328}


In [19]:
df_GaussianNB_results = pd.DataFrame(data = results_dict)
df_GaussianNB_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_GaussianNB_results.reset_index(drop = True,
                               inplace = True)
df_GaussianNB_results.head(10)

Unnamed: 0,means,stds,params
0,0.713038,0.041607,{'var_smoothing': 0.03393221771895328}
1,0.712692,0.03811,{'var_smoothing': 0.281176869797423}
2,0.712285,0.041316,{'var_smoothing': 0.07906043210907697}
3,0.711581,0.035425,{'var_smoothing': 0.4291934260128778}
4,0.71154,0.041361,{'var_smoothing': 0.0517947467923121}
5,0.711165,0.042645,{'var_smoothing': 0.12067926406393285}
6,0.710046,0.042942,{'var_smoothing': 0.0062505519252739694}
7,0.709297,0.042973,{'var_smoothing': 0.009540954763499934}
8,0.709297,0.042678,{'var_smoothing': 0.004094915062380423}
9,0.709297,0.042777,{'var_smoothing': 0.0004941713361323833}


In [20]:
df_best_result_GNB = pd.DataFrame(df_GaussianNB_results.iloc[0,:]).T
df_best_result_GNB['Model'] = "GaussianNB"
df_best_result_GNB

Unnamed: 0,means,stds,params,Model
0,0.713038,0.041607,{'var_smoothing': 0.03393221771895328},GaussianNB


In [21]:
model = GaussianNB(var_smoothing = 0.03393221771895328).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,480,69
1,187,155


In [22]:
df_best_result_GNB['Recall'] = model_result['test_recall'].mean()
df_best_result_GNB['F1_Score'] = model_result['test_f1'].mean()
df_best_result_GNB['Iteration'] = 7
df_best_result_GNB['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_GNB],
         axis = 0)

In [23]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.5. Perceptron

In [24]:
#define the model
model = Perceptron()

#define the parameters to search
eta0 = [0.0001,0.001,0.01,0.1,1.0]
max_iter = [10,100,1000,10000]
early_stopping = [True,False]
penalty = ['l2','l1',None]

#define grid search
grid = dict(eta0 = eta0,
           max_iter = max_iter,
           early_stopping = early_stopping,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 120 candidates, totalling 3600 fits
Best: 0.736625 using {'early_stopping': False, 'eta0': 0.0001, 'max_iter': 10, 'penalty': 'l2'}


In [25]:
df_Perceptron_results = pd.DataFrame(data = results_dict)
df_Perceptron_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_Perceptron_results.reset_index(drop = True,
                               inplace = True)
df_Perceptron_results.head(10)

Unnamed: 0,means,stds,params
0,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
1,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
2,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
3,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
4,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
5,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
6,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
7,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
8,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.001, 'max_..."
9,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.001, 'max_..."


In [28]:
df_best_result_Per = pd.DataFrame(df_Perceptron_results.iloc[0,:]).T
df_best_result_Per['Model'] = "Perceptron"
df_best_result_Per

Unnamed: 0,means,stds,params,Model
0,0.736625,0.040254,"{'early_stopping': False, 'eta0': 0.0001, 'max...",Perceptron


In [29]:
model = Perceptron(eta0 = 0.0001,
           max_iter = 10,
           early_stopping = False,
           penalty = 'l2').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,367,182
1,89,253


In [30]:
df_best_result_Per['Recall'] = model_result['test_recall'].mean()
df_best_result_Per['F1_Score'] = model_result['test_f1'].mean()
df_best_result_Per['Iteration'] = 7
df_best_result_Per['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_Per],
         axis = 0)

In [31]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.6. Linear SVC

In [32]:
#define the model
model = LinearSVC()

#define the parameters to search
c_values = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l2','l1']
max_iter = [1,10,100,1000,10000]

#define grid search
grid = dict(max_iter = max_iter,
           C = c_values,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.808435 using {'C': 0.01, 'max_iter': 10, 'penalty': 'l2'}




In [33]:
df_LinearSVC_results = pd.DataFrame(data = results_dict)
df_LinearSVC_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_LinearSVC_results.reset_index(drop = True,
                               inplace = True)
df_LinearSVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.808435,0.043468,"{'C': 0.01, 'max_iter': 10, 'penalty': 'l2'}"
1,0.808435,0.043468,"{'C': 0.01, 'max_iter': 100, 'penalty': 'l2'}"
2,0.808435,0.043468,"{'C': 0.01, 'max_iter': 1000, 'penalty': 'l2'}"
3,0.808435,0.043468,"{'C': 0.01, 'max_iter': 10000, 'penalty': 'l2'}"
4,0.801694,0.039242,"{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}"
5,0.801694,0.039242,"{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2'}"
6,0.801694,0.039242,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}"
7,0.801298,0.046186,"{'C': 0.01, 'max_iter': 1, 'penalty': 'l2'}"
8,0.799072,0.042061,"{'C': 0.1, 'max_iter': 10, 'penalty': 'l2'}"
9,0.798693,0.03998,"{'C': 100, 'max_iter': 10000, 'penalty': 'l2'}"


In [34]:
df_best_result_LSVC = pd.DataFrame(df_LinearSVC_results.iloc[0,:]).T
df_best_result_LSVC['Model'] = "LinearSVC"
df_best_result_LSVC

Unnamed: 0,means,stds,params,Model
0,0.808435,0.043468,"{'C': 0.01, 'max_iter': 10, 'penalty': 'l2'}",LinearSVC


In [35]:
model = LinearSVC(max_iter = 10,
           C = 0.01,
           penalty = 'l2').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))



Unnamed: 0,0,1
0,485,64
1,107,235


In [36]:
df_best_result_LSVC['Recall'] = model_result['test_recall'].mean()
df_best_result_LSVC['F1_Score'] = model_result['test_f1'].mean()
df_best_result_LSVC['Iteration'] = 7
df_best_result_LSVC['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_LSVC],
         axis = 0)

In [40]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.7. Stochastic Gradient Descent

In [41]:
#define the model
model = SGDClassifier()

#define the parameters to search
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
penalty = ['l2','l1']
alpha = [0.0001,0.001,0.01,0.1,1]
learning_rate = ['optimal']


#define grid search
grid = dict(loss = loss,
           alpha = alpha,
           penalty = penalty,
           learning_rate = learning_rate)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 90 candidates, totalling 2700 fits
Best: 0.812551 using {'alpha': 0.01, 'learning_rate': 'optimal', 'loss': 'log', 'penalty': 'l2'}


In [42]:
df_SGDClassifier_results = pd.DataFrame(data = results_dict)
df_SGDClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_SGDClassifier_results.reset_index(drop = True,
                               inplace = True)
df_SGDClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.812551,0.044531,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
1,0.808419,0.039672,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
2,0.806588,0.042662,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."
3,0.80432,0.036948,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
4,0.801319,0.044017,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
5,0.799467,0.040588,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
6,0.798714,0.042563,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
7,0.79273,0.038371,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
8,0.791248,0.038733,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
9,0.790849,0.047051,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."


In [43]:
df_best_result_SGDC = pd.DataFrame(df_SGDClassifier_results.iloc[0,:]).T
df_best_result_SGDC['Model'] = "SGDClassifier"
df_best_result_SGDC

Unnamed: 0,means,stds,params,Model
0,0.812551,0.044531,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier


In [44]:
model = SGDClassifier(loss = 'log',
                      alpha = 0.01,
                      penalty = 'l2',
                      learning_rate = 'optimal').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,485,64
1,106,236


In [45]:
df_best_result_SGDC['Recall'] = model_result['test_recall'].mean()
df_best_result_SGDC['F1_Score'] = model_result['test_f1'].mean()
df_best_result_SGDC['Iteration'] = 7
df_best_result_SGDC['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_SGDC],
         axis = 0)

In [46]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.8. Decision Tree

In [47]:
#define the model
model = DecisionTreeClassifier()

#define the parameters to search
criterion = ['gini', 'entropy']
splitter = ['best','random']
#max_depth = list(range(1,100,10))
min_samples_split = list(range(2,40,2))
min_samples_leaf = list(range(1,20,2))
max_features = ['auto','sqrt','log2', None]

#define grid search
grid = dict(criterion = criterion,
           splitter = splitter,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 3040 candidates, totalling 91200 fits
Best: 0.826787 using {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 10, 'splitter': 'best'}


In [48]:
df_DecisionTreeClassifier_results = pd.DataFrame(data = results_dict)
df_DecisionTreeClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_DecisionTreeClassifier_results.reset_index(drop = True,
                               inplace = True)
df_DecisionTreeClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.826787,0.036212,"{'criterion': 'gini', 'max_features': None, 'm..."
1,0.82454,0.034564,"{'criterion': 'gini', 'max_features': None, 'm..."
2,0.823762,0.044469,"{'criterion': 'gini', 'max_features': None, 'm..."
3,0.823042,0.036463,"{'criterion': 'gini', 'max_features': None, 'm..."
4,0.822672,0.038276,"{'criterion': 'entropy', 'max_features': None,..."
5,0.822667,0.039214,"{'criterion': 'gini', 'max_features': None, 'm..."
6,0.822659,0.041187,"{'criterion': 'entropy', 'max_features': None,..."
7,0.822297,0.038744,"{'criterion': 'gini', 'max_features': None, 'm..."
8,0.821174,0.03779,"{'criterion': 'entropy', 'max_features': None,..."
9,0.820795,0.040958,"{'criterion': 'entropy', 'max_features': None,..."


In [49]:
df_best_result_DTC = pd.DataFrame(df_DecisionTreeClassifier_results.iloc[0,:]).T
df_best_result_DTC['Model'] = "DecisionTreeClassifier"
df_best_result_DTC

Unnamed: 0,means,stds,params,Model
0,0.826787,0.036212,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier


In [50]:
model = DecisionTreeClassifier(criterion = 'gini',
                       splitter = 'best',
                       min_samples_split = 10,
                       min_samples_leaf = 5,
                       max_features = None).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,478,71
1,92,250


In [51]:
df_best_result_DTC['Recall'] = model_result['test_recall'].mean()
df_best_result_DTC['F1_Score'] = model_result['test_f1'].mean()
df_best_result_DTC['Iteration'] = 7
df_best_result_DTC['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_DTC],
         axis = 0)

In [52]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


### 3.9. Random Forest

In [53]:
#define the model
model = RandomForestClassifier()

#define the parameters to search
n_estimators = [100,1000]
criterion = ['gini','entropy']
max_depth = [10,100]
#max_depth.append(None)
min_samples_split = [4,6,8]
min_samples_leaf = [1,3,5]
max_features = [None]
bootstrap = [True, False]

#define grid search
grid = dict(criterion = criterion,
            n_estimators = n_estimators,
           max_depth = max_depth,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features,
           bootstrap = bootstrap)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 144 candidates, totalling 4320 fits
Best: 0.836862 using {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 100}


In [54]:
df_RandomForestClassifier_results = pd.DataFrame(data = results_dict)
df_RandomForestClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_RandomForestClassifier_results.reset_index(drop = True,
                               inplace = True)
with pd.option_context('display.max_colwidth', -1):     
    display(df_RandomForestClassifier_results.head(10))

Unnamed: 0,means,stds,params
0,0.836862,0.03233,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 100}"
1,0.836117,0.035646,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 100}"
2,0.836109,0.033345,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 1000}"
3,0.834619,0.030323,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 1000}"
4,0.834619,0.031945,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 1000}"
5,0.834245,0.031914,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 1000}"
6,0.834232,0.034748,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 1000}"
7,0.833878,0.034753,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 100}"
8,0.833874,0.033634,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 1000}"
9,0.833862,0.032174,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 100}"


In [55]:
df_best_result_RF = pd.DataFrame(df_RandomForestClassifier_results.iloc[0,:]).T
df_best_result_RF['Model'] = "RandomForestClassifier"
df_best_result_RF

Unnamed: 0,means,stds,params,Model
0,0.836862,0.03233,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier


In [56]:
model = RandomForestClassifier(criterion = 'entropy',
                               n_estimators = 100,
                               max_depth = 100,
                               min_samples_split = 4,
                               min_samples_leaf = 3,
                               max_features = None,
                               bootstrap = True).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,492,57
1,91,251


In [57]:
df_best_result_RF['Recall'] = model_result['test_recall'].mean()
df_best_result_RF['F1_Score'] = model_result['test_f1'].mean()
df_best_result_RF['Iteration'] = 7
df_best_result_RF['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_RF],
         axis = 0)

In [58]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836862,0.03233,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.742773,0.775271,7,0.870719
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502


### 3.10. XGB Classifier

In [57]:
#define the model
model = XGBClassifier()

#define the parameters to search
n_estimators = [2000]
learning_rate = [0.1,0.01]
max_depth = [6,9]
min_child_weight = [3,6,9]
gamma = [1]
subsample = [1.0]
colsample_bytree = [1.0]

#define grid search
grid = dict(n_estimators = n_estimators,
           learning_rate = learning_rate,
           max_depth = max_depth,
           min_child_weight = min_child_weight,
           gamma = gamma,
           subsample = subsample,
           colsample_bytree = colsample_bytree,
           use_label_encoder=[False])
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 12 candidates, totalling 360 fits
Best: 0.841748 using {'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}


In [58]:
df_XGBClassifier_results = pd.DataFrame(data = results_dict)
df_XGBClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_XGBClassifier_results.reset_index(drop = True,
                               inplace = True)
with pd.option_context('display.max_colwidth', -1):     
    display(df_XGBClassifier_results.head(10))

Unnamed: 0,means,stds,params
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
1,0.840633,0.035968,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
2,0.839871,0.033974,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
3,0.837628,0.036562,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
4,0.83464,0.03721,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
5,0.833887,0.03276,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 9, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
6,0.833516,0.033218,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 9, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
7,0.833512,0.036128,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
8,0.833146,0.036851,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
9,0.832763,0.034474,"{'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"


In [59]:
df_best_result_XGB = pd.DataFrame(df_XGBClassifier_results.iloc[0,:]).T
df_best_result_XGB['Model'] = "XGBClassifier"
df_best_result_XGB

Unnamed: 0,means,stds,params,Model
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier


In [60]:
model = XGBClassifier(n_estimators = 2000,
                       learning_rate = 0.01,
                       max_depth = 9,
                       min_child_weight = 6,
                       gamma = 1.0,
                       subsample = 1.0,
                       colsample_bytree = 1.0,
                       use_label_encoder=[False]).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))





















































































Unnamed: 0,0,1
0,500,49
1,94,248


In [61]:
df_best_result_XGB['Recall'] = model_result['test_recall'].mean()
df_best_result_XGB['F1_Score'] = model_result['test_f1'].mean()
df_best_result_XGB['Iteration'] = 6
df_best_result_XGB['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_XGB],
         axis = 0)

In [62]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836488,0.033072,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.733782,0.77481,3,0.864874
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.834236,0.036355,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.739748,0.774462,1,0.87007


In [63]:
df_best_result[df_best_result['Iteration'].isin([2,5,6])].sort_values('means',
                                                                   axis = 0,
                                                                   ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891
0,0.836126,0.038454,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.71084,0.763086,5,0.876457
0,0.835377,0.035897,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.728151,0.770911,2,0.870139
0,0.835372,0.031847,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.725294,0.770897,5,0.867502
0,0.826783,0.033091,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.725462,0.748269,6,0.838959
0,0.824553,0.03489,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.731345,0.748041,5,0.834424
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.689832,0.74366,2,0.863829
0,0.822285,0.037868,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.71916,0.752776,6,0.85788


Iteration 6 is consistently better in most of the models.

In [64]:
df_best_result.to_csv('best_result_6.csv')