## 1. Library Import

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, cross_val_predict

## 2. Load Data

In [6]:
df_train = pd.read_csv('train_iter_11.csv',
                      index_col = 0)
df_test = pd.read_csv('test_iter_11.csv',
                      index_col = 0)

X_train = df_train.drop('Survived',
                       axis = 1,)
Y_train = df_train.loc[:,'Survived']
X_test = df_test.drop('Survived',
                       axis = 1,)

df_best_result = pd.read_csv('../Iteration_9_title/best_result_9.csv',
                             index_col = 0)

## 3. Model

### 3.1. Logistic Regression

In [7]:
#define the model
model = LogisticRegression()

#define the parameters to search
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
class_weight = [None, 'balanced']

#define grid search
grid = dict(solver = solvers,
           penalty = penalty,
           C = c_values,
           class_weight = class_weight)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Best: 0.864191 using {'C': 100, 'class_weight': None, 'penalty': 'l2', 'solver': 'newton-cg'}


In [8]:
df_logistic_results = pd.DataFrame(data = results_dict)
df_logistic_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_logistic_results.reset_index(drop = True,
                               inplace = True)
df_logistic_results.head(10)

Unnamed: 0,means,stds,params
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
1,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
2,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
3,0.862701,0.040352,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
4,0.862326,0.040275,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
5,0.862326,0.040275,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
6,0.861952,0.040921,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
7,0.861952,0.040507,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
8,0.861952,0.040507,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
9,0.861952,0.040507,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."


In [9]:
df_best_result_LR = pd.DataFrame(df_logistic_results.iloc[0,:]).T
df_best_result_LR['Model'] = "Logistic_Regression"
df_best_result_LR

Unnamed: 0,means,stds,params,Model
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression


In [10]:
model = LogisticRegression(solver = 'newton-cg',
                           penalty = 'l2',
                           C = 1.0,
                           class_weight = None)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,498,51
1,72,270


In [11]:
df_best_result_LR['Recall'] = model_result['test_recall'].mean()
df_best_result_LR['F1_Score'] = model_result['test_f1'].mean()
df_best_result_LR['Iteration'] = 11
df_best_result_LR['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_LR],
         axis = 0)

In [12]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837624,0.034895,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.728319,0.767401,9,0.872104
0,0.837611,0.033098,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.731092,0.777132,6,0.864891


### 3.2. Support Vector Machines

In [15]:
#define the model
model = SVC()

#define the parameters to search
kernel = ['poly','rbf']#['linear','poly','rbf','sigmoid']
c_values = [10,1.0,0.1]#[100, 10, 1.0, 0.1, 0.01]
gamma = ['scale',1.0,0.1]

#define grid search
grid = dict(kernel = kernel,
           C = c_values,
           gamma = gamma)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 18 candidates, totalling 540 fits
Best: 0.865306 using {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}


In [16]:
df_SVC_results = pd.DataFrame(data = results_dict)
df_SVC_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_SVC_results.reset_index(drop = True,
                               inplace = True)
df_SVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}"
1,0.864931,0.032645,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}"
2,0.861939,0.030836,"{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}"
3,0.861194,0.035452,"{'C': 10, 'gamma': 0.1, 'kernel': 'poly'}"
4,0.860441,0.030669,"{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}"
5,0.860075,0.037455,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}"
6,0.858573,0.031983,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
7,0.857462,0.037047,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'poly'}"
8,0.849958,0.036903,"{'C': 0.1, 'gamma': 1.0, 'kernel': 'poly'}"
9,0.84176,0.032557,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'poly'}"


In [17]:
df_best_result_SVM = pd.DataFrame(df_SVC_results.iloc[0,:]).T
df_best_result_SVM['Model'] = "SVC"
df_best_result_SVM

Unnamed: 0,means,stds,params,Model
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC


In [18]:
model = SVC(kernel = 'rbf',
           C = 1.0,
           gamma = 'scale').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,513,36
1,87,255


In [19]:
df_best_result_SVM['Recall'] = model_result['test_recall'].mean()
df_best_result_SVM['F1_Score'] = model_result['test_f1'].mean()
df_best_result_SVM['Iteration'] = 11
df_best_result_SVM['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_SVM],
         axis = 0)

In [20]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837624,0.034895,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.728319,0.767401,9,0.872104


### 3.3. Naive Bayes Classifiers

In [21]:
#define the model
model = GaussianNB()

#define the parameters to search
var_smoothing = np.logspace(0, -9, num = 50) 

#define grid search
grid = dict(var_smoothing = var_smoothing)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.812601 using {'var_smoothing': 0.12067926406393285}


In [22]:
df_GaussianNB_results = pd.DataFrame(data = results_dict)
df_GaussianNB_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_GaussianNB_results.reset_index(drop = True,
                               inplace = True)
df_GaussianNB_results.head(10)

Unnamed: 0,means,stds,params
0,0.812601,0.039492,{'var_smoothing': 0.12067926406393285}
1,0.809979,0.036243,{'var_smoothing': 0.07906043210907697}
2,0.804744,0.039102,{'var_smoothing': 0.0517947467923121}
3,0.803987,0.042361,{'var_smoothing': 0.022229964825261943}
4,0.801369,0.041884,{'var_smoothing': 0.03393221771895328}
5,0.801365,0.04121,{'var_smoothing': 0.014563484775012436}
6,0.801003,0.037626,{'var_smoothing': 0.1842069969326716}
7,0.800254,0.03265,{'var_smoothing': 0.281176869797423}
8,0.793508,0.04374,{'var_smoothing': 0.009540954763499934}
9,0.793138,0.035303,{'var_smoothing': 0.4291934260128778}


In [23]:
df_best_result_GNB = pd.DataFrame(df_GaussianNB_results.iloc[0,:]).T
df_best_result_GNB['Model'] = "GaussianNB"
df_best_result_GNB

Unnamed: 0,means,stds,params,Model
0,0.812601,0.039492,{'var_smoothing': 0.12067926406393285},GaussianNB


In [24]:
model = GaussianNB(var_smoothing = 0.12067926406393285).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,505,44
1,124,218


In [25]:
df_best_result_GNB['Recall'] = model_result['test_recall'].mean()
df_best_result_GNB['F1_Score'] = model_result['test_f1'].mean()
df_best_result_GNB['Iteration'] = 11
df_best_result_GNB['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_GNB],
         axis = 0)

In [26]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837624,0.034895,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.728319,0.767401,9,0.872104


### 3.5. Perceptron

In [27]:
#define the model
model = Perceptron()

#define the parameters to search
eta0 = [0.0001,0.001,0.01,0.1,1.0]
max_iter = [10,100,1000,10000]
early_stopping = [True,False]
penalty = ['l2','l1',None]

#define grid search
grid = dict(eta0 = eta0,
           max_iter = max_iter,
           early_stopping = early_stopping,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 120 candidates, totalling 3600 fits
Best: 0.811411 using {'early_stopping': False, 'eta0': 0.0001, 'max_iter': 10, 'penalty': 'l1'}


In [28]:
df_Perceptron_results = pd.DataFrame(data = results_dict)
df_Perceptron_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_Perceptron_results.reset_index(drop = True,
                               inplace = True)
df_Perceptron_results.head(10)

Unnamed: 0,means,stds,params
0,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
1,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
2,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
3,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
4,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.001, 'max_..."
5,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.001, 'max_..."
6,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.001, 'max_..."
7,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.001, 'max_..."
8,0.808856,0.038162,"{'early_stopping': False, 'eta0': 1.0, 'max_it..."
9,0.807732,0.042375,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."


In [29]:
df_best_result_Per = pd.DataFrame(df_Perceptron_results.iloc[0,:]).T
df_best_result_Per['Model'] = "Perceptron"
df_best_result_Per

Unnamed: 0,means,stds,params,Model
0,0.811411,0.042603,"{'early_stopping': False, 'eta0': 0.0001, 'max...",Perceptron


In [30]:
model = Perceptron(eta0 = 0.0001,
           max_iter = 10,
           early_stopping = False,
           penalty = 'l1').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,444,105
1,79,263


In [31]:
df_best_result_Per['Recall'] = model_result['test_recall'].mean()
df_best_result_Per['F1_Score'] = model_result['test_f1'].mean()
df_best_result_Per['Iteration'] = 11
df_best_result_Per['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_Per],
         axis = 0)

In [32]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721
0,0.837624,0.034895,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.728319,0.767401,9,0.872104


### 3.6. Linear SVC

In [33]:
#define the model
model = LinearSVC()

#define the parameters to search
c_values = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l2','l1']
max_iter = [1,10,100,1000,10000]

#define grid search
grid = dict(max_iter = max_iter,
           C = c_values,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.867553 using {'C': 10, 'max_iter': 10000, 'penalty': 'l2'}




In [34]:
df_LinearSVC_results = pd.DataFrame(data = results_dict)
df_LinearSVC_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_LinearSVC_results.reset_index(drop = True,
                               inplace = True)
df_LinearSVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}"
1,0.866434,0.040136,"{'C': 1.0, 'max_iter': 10000, 'penalty': 'l2'}"
2,0.866434,0.040747,"{'C': 100, 'max_iter': 10000, 'penalty': 'l2'}"
3,0.866063,0.039772,"{'C': 1.0, 'max_iter': 1000, 'penalty': 'l2'}"
4,0.866059,0.040289,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}"
5,0.86494,0.040567,"{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}"
6,0.86494,0.040567,"{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2'}"
7,0.86494,0.040567,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}"
8,0.864569,0.039654,"{'C': 1.0, 'max_iter': 100, 'penalty': 'l2'}"
9,0.860836,0.041336,"{'C': 0.1, 'max_iter': 10, 'penalty': 'l2'}"


In [35]:
df_best_result_LSVC = pd.DataFrame(df_LinearSVC_results.iloc[0,:]).T
df_best_result_LSVC['Model'] = "LinearSVC"
df_best_result_LSVC

Unnamed: 0,means,stds,params,Model
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC


In [36]:
model = LinearSVC(max_iter = 10000,
           C = 10,
           penalty = 'l2').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))



Unnamed: 0,0,1
0,500,49
1,71,271


In [37]:
df_best_result_LSVC['Recall'] = model_result['test_recall'].mean()
df_best_result_LSVC['F1_Score'] = model_result['test_f1'].mean()
df_best_result_LSVC['Iteration'] = 11
df_best_result_LSVC['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_LSVC],
         axis = 0)

In [38]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.792353,0.81782,11,0.905937
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839118,0.035118,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,3,0.8721


### 3.7. Stochastic Gradient Descent

In [39]:
#define the model
model = SGDClassifier()

#define the parameters to search
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
penalty = ['l2','l1']
alpha = [0.0001,0.001,0.01,0.1,1]
learning_rate = ['optimal']


#define grid search
grid = dict(loss = loss,
           alpha = alpha,
           penalty = penalty,
           learning_rate = learning_rate)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 90 candidates, totalling 2700 fits
Best: 0.862701 using {'alpha': 0.001, 'learning_rate': 'optimal', 'loss': 'log', 'penalty': 'l1'}


In [40]:
df_SGDClassifier_results = pd.DataFrame(data = results_dict)
df_SGDClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_SGDClassifier_results.reset_index(drop = True,
                               inplace = True)
df_SGDClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
1,0.861573,0.039708,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
2,0.861186,0.041518,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
3,0.860828,0.040466,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
4,0.860449,0.032927,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
5,0.860083,0.037759,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
6,0.858947,0.037937,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."
7,0.858947,0.03976,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
8,0.858943,0.039039,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
9,0.858198,0.042106,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."


In [41]:
df_best_result_SGDC = pd.DataFrame(df_SGDClassifier_results.iloc[0,:]).T
df_best_result_SGDC['Model'] = "SGDClassifier"
df_best_result_SGDC

Unnamed: 0,means,stds,params,Model
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",SGDClassifier


In [42]:
model = SGDClassifier(loss = 'log',
                      alpha = 0.001,
                      penalty = 'l1',
                      learning_rate = 'optimal').fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,493,56
1,68,274


In [43]:
df_best_result_SGDC['Recall'] = model_result['test_recall'].mean()
df_best_result_SGDC['F1_Score'] = model_result['test_f1'].mean()
df_best_result_SGDC['Iteration'] = 11
df_best_result_SGDC['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_SGDC],
         axis = 0)

In [44]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.792353,0.81782,11,0.905937
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",SGDClassifier,0.789496,0.810995,11,0.909375
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.731176,0.767082,2,0.872206
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456


### 3.8. Decision Tree

In [45]:
#define the model
model = DecisionTreeClassifier()

#define the parameters to search
criterion = ['gini', 'entropy']
splitter = ['best','random']
#max_depth = list(range(1,100,10))
min_samples_split = list(range(2,40,2))
min_samples_leaf = list(range(1,20,2))
max_features = ['auto','sqrt','log2', None]

#define grid search
grid = dict(criterion = criterion,
           splitter = splitter,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 3040 candidates, totalling 91200 fits
Best: 0.871677 using {'criterion': 'entropy', 'max_features': None, 'min_samples_leaf': 11, 'min_samples_split': 6, 'splitter': 'random'}


In [46]:
df_DecisionTreeClassifier_results = pd.DataFrame(data = results_dict)
df_DecisionTreeClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_DecisionTreeClassifier_results.reset_index(drop = True,
                               inplace = True)
df_DecisionTreeClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.871677,0.031015,"{'criterion': 'entropy', 'max_features': None,..."
1,0.871303,0.028625,"{'criterion': 'entropy', 'max_features': None,..."
2,0.871303,0.030059,"{'criterion': 'entropy', 'max_features': None,..."
3,0.871303,0.030338,"{'criterion': 'entropy', 'max_features': None,..."
4,0.871303,0.030476,"{'criterion': 'entropy', 'max_features': None,..."
5,0.871303,0.030614,"{'criterion': 'entropy', 'max_features': None,..."
6,0.870928,0.029923,"{'criterion': 'entropy', 'max_features': None,..."
7,0.870928,0.030618,"{'criterion': 'gini', 'max_features': None, 'm..."
8,0.870179,0.031292,"{'criterion': 'gini', 'max_features': None, 'm..."
9,0.870179,0.030336,"{'criterion': 'entropy', 'max_features': None,..."


In [47]:
df_best_result_DTC = pd.DataFrame(df_DecisionTreeClassifier_results.iloc[0,:]).T
df_best_result_DTC['Model'] = "DecisionTreeClassifier"
df_best_result_DTC

Unnamed: 0,means,stds,params,Model
0,0.871677,0.031015,"{'criterion': 'entropy', 'max_features': None,...",DecisionTreeClassifier


In [48]:
model = DecisionTreeClassifier(criterion = 'entropy',
                       splitter = 'random',
                       min_samples_split = 6,
                       min_samples_leaf = 11,
                       max_features = None).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,525,24
1,88,254


In [49]:
df_best_result_DTC['Recall'] = model_result['test_recall'].mean()
df_best_result_DTC['F1_Score'] = model_result['test_f1'].mean()
df_best_result_DTC['Iteration'] = 11
df_best_result_DTC['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_DTC],
         axis = 0)

In [50]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.871677,0.031015,"{'criterion': 'entropy', 'max_features': None,...",DecisionTreeClassifier,0.742689,0.819146,11,0.904759
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.792353,0.81782,11,0.905937
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",SGDClassifier,0.789496,0.810995,11,0.909375
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804
0,0.839867,0.034705,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.728151,0.773902,1,0.874456


### 3.9. Random Forest

In [58]:
#define the model
model = RandomForestClassifier()

#define the parameters to search
n_estimators = [100,1000]
criterion = ['gini','entropy']
max_depth = [100,1000]
#max_depth.append(None)
min_samples_split = [12]
min_samples_leaf = [3,6,9,12,15]
max_features = [None]
bootstrap = [True]

#define grid search
grid = dict(criterion = criterion,
            n_estimators = n_estimators,
           max_depth = max_depth,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features,
           bootstrap = bootstrap)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 40 candidates, totalling 1200 fits
Best: 0.868681 using {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 12, 'n_estimators': 1000}


In [59]:
df_RandomForestClassifier_results = pd.DataFrame(data = results_dict)
df_RandomForestClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_RandomForestClassifier_results.reset_index(drop = True,
                               inplace = True)
with pd.option_context('display.max_colwidth', -1):     
    display(df_RandomForestClassifier_results.head(10))

Unnamed: 0,means,stds,params
0,0.868681,0.029004,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 12, 'n_estimators': 1000}"
1,0.867936,0.029064,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 1000}"
2,0.867932,0.028507,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 1000}"
3,0.867928,0.031187,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}"
4,0.867179,0.033147,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}"
5,0.866434,0.029599,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 1000}"
6,0.866434,0.030946,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}"
7,0.866067,0.031174,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 1000}"
8,0.865689,0.030451,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 1000, 'max_features': None, 'min_samples_leaf': 6, 'min_samples_split': 12, 'n_estimators': 100}"
9,0.865323,0.028797,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 100, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}"


In [60]:
df_best_result_RF = pd.DataFrame(df_RandomForestClassifier_results.iloc[0,:]).T
df_best_result_RF['Model'] = "RandomForestClassifier"
df_best_result_RF

Unnamed: 0,means,stds,params,Model
0,0.868681,0.029004,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier


In [61]:
model = RandomForestClassifier(criterion = 'entropy',
                               n_estimators = 1000,
                               max_depth = 1000,
                               min_samples_split = 12,
                               min_samples_leaf = 6,
                               max_features = None,
                               bootstrap = True).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,507,42
1,80,262


In [62]:
df_best_result_RF['Recall'] = model_result['test_recall'].mean()
df_best_result_RF['F1_Score'] = model_result['test_f1'].mean()
df_best_result_RF['Iteration'] = 11
df_best_result_RF['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_RF],
         axis = 0)

In [63]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.871677,0.031015,"{'criterion': 'entropy', 'max_features': None,...",DecisionTreeClassifier,0.742689,0.819146,11,0.904759
0,0.868681,0.029004,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.769076,0.813234,11,0.919214
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.792353,0.81782,11,0.905937
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",SGDClassifier,0.789496,0.810995,11,0.909375
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.841748,0.035564,"{'colsample_bytree': 1.0, 'gamma': 1, 'learnin...",XGBClassifier,0.725546,0.771451,6,0.879804


### 3.10. XGB Classifier

In [77]:
#define the model
model = XGBClassifier()

#define the parameters to search
n_estimators = [500]
learning_rate = [0.1,0.01]
max_depth = [6,9]
min_child_weight = [6]
gamma = [0.5,0.7]
subsample = [0.7]
colsample_bytree = [0.5,0.7,0.9]

#define grid search
grid = dict(n_estimators = n_estimators,
           learning_rate = learning_rate,
           max_depth = max_depth,
           min_child_weight = min_child_weight,
           gamma = gamma,
           subsample = subsample,
           colsample_bytree = colsample_bytree,
           use_label_encoder=[False])
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 24 candidates, totalling 720 fits
Best: 0.869051 using {'colsample_bytree': 0.9, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}


In [78]:
df_XGBClassifier_results = pd.DataFrame(data = results_dict)
df_XGBClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_XGBClassifier_results.reset_index(drop = True,
                               inplace = True)

with pd.option_context('display.max_colwidth', -1):     
    display(df_XGBClassifier_results.head(10))

Unnamed: 0,means,stds,params
0,0.869051,0.031821,"{'colsample_bytree': 0.9, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
1,0.868681,0.031911,"{'colsample_bytree': 0.9, 'gamma': 0.7, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
2,0.868681,0.031911,"{'colsample_bytree': 0.9, 'gamma': 0.7, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
3,0.868677,0.031799,"{'colsample_bytree': 0.9, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
4,0.868677,0.033349,"{'colsample_bytree': 0.7, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
5,0.868302,0.033449,"{'colsample_bytree': 0.7, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
6,0.867928,0.03367,"{'colsample_bytree': 0.7, 'gamma': 0.7, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
7,0.867553,0.033385,"{'colsample_bytree': 0.7, 'gamma': 0.7, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
8,0.866808,0.035245,"{'colsample_bytree': 0.5, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"
9,0.866808,0.035245,"{'colsample_bytree': 0.5, 'gamma': 0.7, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500, 'subsample': 0.7, 'use_label_encoder': False}"


In [79]:
df_best_result_XGB = pd.DataFrame(df_XGBClassifier_results.iloc[0,:]).T
df_best_result_XGB['Model'] = "XGBClassifier"
df_best_result_XGB

Unnamed: 0,means,stds,params,Model
0,0.869051,0.031821,"{'colsample_bytree': 0.9, 'gamma': 0.5, 'learn...",XGBClassifier


In [80]:
model = XGBClassifier(n_estimators = 500,
                       learning_rate = 0.01,
                       max_depth = 9,
                       min_child_weight = 6,
                       gamma = 0.5,
                       subsample = 0.7,
                       colsample_bytree = 0.9,
                       use_label_encoder=[False]).fit(X_train, Y_train)
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))





















































































Unnamed: 0,0,1
0,513,36
1,85,257


In [81]:
df_best_result_XGB['Recall'] = model_result['test_recall'].mean()
df_best_result_XGB['F1_Score'] = model_result['test_f1'].mean()
df_best_result_XGB['Iteration'] = 11
df_best_result_XGB['AUC'] = model_result['test_roc_auc'].mean()
df_best_result = pd.concat([df_best_result,df_best_result_XGB],
         axis = 0)

In [82]:
pd.set_option('display.max_rows', 999)
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.871677,0.031015,"{'criterion': 'entropy', 'max_features': None,...",DecisionTreeClassifier,0.742689,0.819146,11,0.904759
0,0.869051,0.031821,"{'colsample_bytree': 0.9, 'gamma': 0.5, 'learn...",XGBClassifier,0.751513,0.807436,11,0.924661
0,0.868681,0.029004,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.769076,0.813234,11,0.919214
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.792353,0.81782,11,0.905937
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",SGDClassifier,0.789496,0.810995,11,0.909375
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,8,0.882208
0,0.842501,0.038236,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",XGBClassifier,0.760588,0.787955,7,0.882208
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201


In [83]:
df_best_result[df_best_result['Iteration'].isin([9,11])].sort_values('means',
                                                                   axis = 0,
                                                                   ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.871677,0.031015,"{'criterion': 'entropy', 'max_features': None,...",DecisionTreeClassifier,0.742689,0.819146,11,0.904759
0,0.869051,0.031821,"{'colsample_bytree': 0.9, 'gamma': 0.5, 'learn...",XGBClassifier,0.751513,0.807436,11,0.924661
0,0.868681,0.029004,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.769076,0.813234,11,0.919214
0,0.867553,0.039931,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.792353,0.81782,11,0.905937
0,0.865306,0.032451,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}",SVC,0.745546,0.802375,11,0.905767
0,0.864191,0.041872,"{'C': 100, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.78958,0.814052,11,0.908724
0,0.862701,0.039615,"{'alpha': 0.001, 'learning_rate': 'optimal', '...",SGDClassifier,0.789496,0.810995,11,0.909375
0,0.84248,0.038027,"{'colsample_bytree': 0.5, 'gamma': 0.3, 'learn...",XGBClassifier,0.760336,0.781871,9,0.886201
0,0.837624,0.034895,"{'bootstrap': True, 'criterion': 'entropy', 'm...",RandomForestClassifier,0.728319,0.767401,9,0.872104
0,0.832031,0.034329,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.73958,0.770063,9,0.863637


The new feature showed a generalised improvement.

In [84]:
df_best_result.to_csv('best_result_11.csv')