## 1. Library Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## 2. Load Data

In [31]:
df_train = pd.read_csv('train_iter_2.csv',
                      index_col = 0)
df_test = pd.read_csv('test_iter_2.csv',
                      index_col = 0)

X_train = df_train.drop('Survived',
                       axis = 1,)
Y_train = df_train.loc[:,'Survived']
X_test = df_test

df_best_result = pd.read_csv('../Iteration_1_Baseline/best_result_1.csv',
                             index_col = 0)

## 3. Model

### 3.1. Logistic Regression

In [3]:
#define the model
model = LogisticRegression()

#define the parameters to search
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
class_weight = [None, 'balanced']

#define grid search
grid = dict(solver = solvers,
           penalty = penalty,
           C = c_values,
           class_weight = class_weight)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Best: 0.808057 using {'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'sag'}


In [12]:
df_logistic_results = pd.DataFrame(data = results_dict)
df_logistic_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_logistic_results.reset_index(drop = True,
                               inplace = True)
df_logistic_results.head(10)

Unnamed: 0,means,stds,params
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '..."
1,0.807686,0.041278,"{'C': 0.01, 'class_weight': None, 'penalty': '..."
2,0.807686,0.041278,"{'C': 0.01, 'class_weight': None, 'penalty': '..."
3,0.807686,0.041278,"{'C': 0.01, 'class_weight': None, 'penalty': '..."
4,0.792734,0.037603,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
5,0.792734,0.037603,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
6,0.792734,0.037266,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
7,0.792734,0.037266,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
8,0.792734,0.037603,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
9,0.792734,0.037603,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."


In [13]:
df_best_result_LR = pd.DataFrame(df_logistic_results.iloc[0,:]).T
df_best_result_LR['Model'] = "Logistic_Regression"
df_best_result_LR

Unnamed: 0,means,stds,params,Model
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression


In [14]:
model = LogisticRegression(solver = 'newton-cg',
                           penalty = 'l2',
                           C = 0.01,
                           class_weight = None).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8114478114478114


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,491,58
1.0,110,232


In [15]:
df_best_result_LR['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_LR['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_LR['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_LR['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_LR],
         axis = 0)

In [33]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


### 3.3. Support Vector Machines

In [24]:
#define the model
model = SVC()

#define the parameters to search
kernel = ['poly','rbf']#['linear','poly','rbf','sigmoid']
c_values = [10,1.0,0.1]#[100, 10, 1.0, 0.1, 0.01]
gamma = ['scale', 1, 0.1, 0.01, 0.001]

#define grid search
grid = dict(kernel = kernel,
           C = c_values,
           gamma = gamma)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 30 candidates, totalling 900 fits
Best: 0.823774 using {'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}


In [25]:
df_SVC_results = pd.DataFrame(data = results_dict)
df_SVC_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_SVC_results.reset_index(drop = True,
                               inplace = True)
df_SVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}"
1,0.821906,0.042054,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}"
2,0.815926,0.03914,"{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}"
3,0.815543,0.037532,"{'C': 1.0, 'gamma': 'scale', 'kernel': 'poly'}"
4,0.815181,0.036459,"{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}"
5,0.814419,0.036158,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}"
6,0.81144,0.038553,"{'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}"
7,0.811436,0.039514,"{'C': 0.1, 'gamma': 1, 'kernel': 'poly'}"
8,0.810674,0.037567,"{'C': 10, 'gamma': 0.1, 'kernel': 'poly'}"
9,0.808073,0.04379,"{'C': 1.0, 'gamma': 1, 'kernel': 'rbf'}"


In [27]:
df_best_result_SVM = pd.DataFrame(df_SVC_results.iloc[0,:]).T
df_best_result_SVM['Model'] = "SVC"
df_best_result_SVM

Unnamed: 0,means,stds,params,Model
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC


In [28]:
model = SVC(kernel = 'rbf',
           C = 1.0,
           gamma = 0.1).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8383838383838383


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,505,44
1.0,100,242


In [34]:
df_best_result_SVM['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_SVM['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_SVM['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_SVM['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_SVM],
         axis = 0)

In [35]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.707602,0.770701,2,0.813728
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027


### 3.3. Naive Bayes Classifiers

In [36]:
#define the model
model = GaussianNB()

#define the parameters to search
var_smoothing = np.logspace(0, -9, num = 50) 

#define grid search
grid = dict(var_smoothing = var_smoothing)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.778136 using {'var_smoothing': 0.12067926406393285}


In [37]:
df_GaussianNB_results = pd.DataFrame(data = results_dict)
df_GaussianNB_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_GaussianNB_results.reset_index(drop = True,
                               inplace = True)
df_GaussianNB_results.head(10)

Unnamed: 0,means,stds,params
0,0.778136,0.03604,{'var_smoothing': 0.12067926406393285}
1,0.777757,0.035448,{'var_smoothing': 0.07906043210907697}
2,0.777,0.037032,{'var_smoothing': 0.0517947467923121}
3,0.776642,0.039557,{'var_smoothing': 0.1842069969326716}
4,0.776259,0.035621,{'var_smoothing': 0.03393221771895328}
5,0.775135,0.035811,{'var_smoothing': 0.022229964825261943}
6,0.773637,0.036703,{'var_smoothing': 0.014563484775012436}
7,0.773263,0.040656,{'var_smoothing': 0.0004941713361323833}
8,0.773263,0.040656,{'var_smoothing': 0.00032374575428176434}
9,0.773263,0.039287,{'var_smoothing': 0.004094915062380423}


In [38]:
df_best_result_GNB = pd.DataFrame(df_GaussianNB_results.iloc[0,:]).T
df_best_result_GNB['Model'] = "GaussianNB"
df_best_result_GNB

Unnamed: 0,means,stds,params,Model
0,0.778136,0.03604,{'var_smoothing': 0.12067926406393285},GaussianNB


In [39]:
model = GaussianNB(var_smoothing = 0.12067926406393285).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.7890011223344556


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,470,79
1.0,109,233


In [40]:
df_best_result_GNB['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_GNB['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_GNB['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_GNB['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_GNB],
         axis = 0)

In [41]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.707602,0.770701,2,0.813728
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.778136,0.03604,{'var_smoothing': 0.12067926406393285},GaussianNB,0.681287,0.712538,2,0.768694


### 3.5. Perceptron

In [43]:
#define the model
model = Perceptron()

#define the parameters to search
eta0 = [0.0001,0.001,0.01,0.1,1.0]
max_iter = [10,100,1000,10000]
early_stopping = [True,False]
penalty = ['l2','l1',None]

#define grid search
grid = dict(eta0 = eta0,
           max_iter = max_iter,
           early_stopping = early_stopping,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 120 candidates, totalling 3600 fits
Best: 0.738077 using {'early_stopping': False, 'eta0': 0.1, 'max_iter': 100, 'penalty': 'l1'}


In [44]:
df_Perceptron_results = pd.DataFrame(data = results_dict)
df_Perceptron_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_Perceptron_results.reset_index(drop = True,
                               inplace = True)
df_Perceptron_results.head(10)

Unnamed: 0,means,stds,params
0,0.738077,0.084509,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
1,0.738077,0.084509,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
2,0.738077,0.084509,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
3,0.735556,0.058014,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
4,0.735556,0.058014,"{'early_stopping': False, 'eta0': 1.0, 'max_it..."
5,0.728801,0.062437,"{'early_stopping': False, 'eta0': 1.0, 'max_it..."
6,0.728406,0.072984,"{'early_stopping': False, 'eta0': 0.01, 'max_i..."
7,0.728406,0.072984,"{'early_stopping': False, 'eta0': 0.01, 'max_i..."
8,0.728406,0.072984,"{'early_stopping': False, 'eta0': 0.01, 'max_i..."
9,0.728406,0.072984,"{'early_stopping': False, 'eta0': 0.01, 'max_i..."


In [45]:
df_best_result_Per = pd.DataFrame(df_Perceptron_results.iloc[0,:]).T
df_best_result_Per['Model'] = "Perceptron"
df_best_result_Per

Unnamed: 0,means,stds,params,Model
0,0.738077,0.084509,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron


In [46]:
model = Perceptron(eta0 = 0.1,
           max_iter = 100,
           early_stopping = False,
           penalty = 'l1').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.7081930415263749


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,451,98
1.0,162,180


In [47]:
df_best_result_Per['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_Per['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_Per['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_Per['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_Per],
         axis = 0)

In [48]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.707602,0.770701,2,0.813728
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.778136,0.03604,{'var_smoothing': 0.12067926406393285},GaussianNB,0.681287,0.712538,2,0.768694


### 3.6. Linear SVC

In [49]:
#define the model
model = LinearSVC()

#define the parameters to search
c_values = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l2','l1']
max_iter = [1,10,100,1000,10000]

#define grid search
grid = dict(max_iter = max_iter,
           C = c_values,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.797595 using {'C': 10, 'max_iter': 1000, 'penalty': 'l2'}




In [50]:
df_LinearSVC_results = pd.DataFrame(data = results_dict)
df_LinearSVC_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_LinearSVC_results.reset_index(drop = True,
                               inplace = True)
df_LinearSVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.797595,0.039109,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}"
1,0.796846,0.039207,"{'C': 100, 'max_iter': 10000, 'penalty': 'l2'}"
2,0.796088,0.038361,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}"
3,0.794594,0.038134,"{'C': 1.0, 'max_iter': 10000, 'penalty': 'l2'}"
4,0.794594,0.038573,"{'C': 1.0, 'max_iter': 1000, 'penalty': 'l2'}"
5,0.79422,0.038266,"{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}"
6,0.79422,0.038266,"{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2'}"
7,0.79422,0.038266,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}"
8,0.793849,0.038914,"{'C': 1.0, 'max_iter': 100, 'penalty': 'l2'}"
9,0.793113,0.039735,"{'C': 0.01, 'max_iter': 10, 'penalty': 'l2'}"


In [51]:
df_best_result_LSVC = pd.DataFrame(df_LinearSVC_results.iloc[0,:]).T
df_best_result_LSVC['Model'] = "LinearSVC"
df_best_result_LSVC

Unnamed: 0,means,stds,params,Model
0,0.797595,0.039109,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}",LinearSVC


In [52]:
model = LinearSVC(max_iter = 1000,
           C = 10,
           penalty = 'l2').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.7991021324354658




Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,473,76
1.0,103,239


In [53]:
df_best_result_LSVC['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_LSVC['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_LSVC['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_LSVC['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_LSVC],
         axis = 0)

In [54]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.707602,0.770701,2,0.813728
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.797595,0.039109,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}",LinearSVC,0.69883,0.727549,2,0.780198
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065


### 3.7. Stochastic Gradient Descent

In [55]:
#define the model
model = SGDClassifier()

#define the parameters to search
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
penalty = ['l2','l1']
alpha = [0.0001,0.001,0.01,0.1,1]
learning_rate = ['optimal']


#define grid search
grid = dict(loss = loss,
           alpha = alpha,
           penalty = penalty,
           learning_rate = learning_rate)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 90 candidates, totalling 2700 fits
Best: 0.808061 using {'alpha': 1, 'learning_rate': 'optimal', 'loss': 'squared_loss', 'penalty': 'l2'}


In [56]:
df_SGDClassifier_results = pd.DataFrame(data = results_dict)
df_SGDClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_SGDClassifier_results.reset_index(drop = True,
                               inplace = True)
df_SGDClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.808061,0.035054,"{'alpha': 1, 'learning_rate': 'optimal', 'loss..."
1,0.805443,0.03848,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."
2,0.80469,0.040244,"{'alpha': 1, 'learning_rate': 'optimal', 'loss..."
3,0.803575,0.038841,"{'alpha': 1, 'learning_rate': 'optimal', 'loss..."
4,0.803204,0.04275,"{'alpha': 1, 'learning_rate': 'optimal', 'loss..."
5,0.800191,0.04201,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
6,0.798348,0.04014,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."
7,0.796841,0.037019,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
8,0.796479,0.042497,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."
9,0.796105,0.042954,"{'alpha': 0.1, 'learning_rate': 'optimal', 'lo..."


In [57]:
df_best_result_SGDC = pd.DataFrame(df_SGDClassifier_results.iloc[0,:]).T
df_best_result_SGDC['Model'] = "SGDClassifier"
df_best_result_SGDC

Unnamed: 0,means,stds,params,Model
0,0.808061,0.035054,"{'alpha': 1, 'learning_rate': 'optimal', 'loss...",SGDClassifier


In [58]:
model = SGDClassifier(loss = 'squared_loss',
                      alpha = 1,
                      penalty = 'l2',
                      learning_rate = 'optimal').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8159371492704826


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,515,34
1.0,130,212


In [59]:
df_best_result_SGDC['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_SGDC['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_SGDC['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_SGDC['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_SGDC],
         axis = 0)

In [60]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.707602,0.770701,2,0.813728
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808061,0.035054,"{'alpha': 1, 'learning_rate': 'optimal', 'loss...",SGDClassifier,0.619883,0.721088,2,0.778976
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.797595,0.039109,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}",LinearSVC,0.69883,0.727549,2,0.780198
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354


### 3.8. Decision Tree

In [61]:
#define the model
model = DecisionTreeClassifier()

#define the parameters to search
criterion = ['gini', 'entropy']
splitter = ['best','random']
#max_depth = list(range(1,100,10))
min_samples_split = list(range(2,40,2))
min_samples_leaf = list(range(1,20,2))
max_features = ['auto','sqrt','log2', None]

#define grid search
grid = dict(criterion = criterion,
           splitter = splitter,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 3040 candidates, totalling 91200 fits
Best: 0.821914 using {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 10, 'splitter': 'best'}


In [62]:
df_DecisionTreeClassifier_results = pd.DataFrame(data = results_dict)
df_DecisionTreeClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_DecisionTreeClassifier_results.reset_index(drop = True,
                               inplace = True)
df_DecisionTreeClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.821914,0.03747,"{'criterion': 'gini', 'max_features': None, 'm..."
1,0.821169,0.03611,"{'criterion': 'gini', 'max_features': None, 'm..."
2,0.819305,0.039235,"{'criterion': 'gini', 'max_features': None, 'm..."
3,0.819301,0.037268,"{'criterion': 'gini', 'max_features': None, 'm..."
4,0.818931,0.037637,"{'criterion': 'gini', 'max_features': None, 'm..."
5,0.817041,0.028965,"{'criterion': 'entropy', 'max_features': None,..."
6,0.816292,0.033475,"{'criterion': 'entropy', 'max_features': None,..."
7,0.815938,0.03286,"{'criterion': 'entropy', 'max_features': None,..."
8,0.815564,0.037209,"{'criterion': 'entropy', 'max_features': None,..."
9,0.815547,0.042925,"{'criterion': 'gini', 'max_features': None, 'm..."


In [63]:
df_best_result_DTC = pd.DataFrame(df_DecisionTreeClassifier_results.iloc[0,:]).T
df_best_result_DTC['Model'] = "DecisionTreeClassifier"
df_best_result_DTC

Unnamed: 0,means,stds,params,Model
0,0.821914,0.03747,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier


In [64]:
model = DecisionTreeClassifier(criterion = 'gini',
                       splitter = 'best',
                       min_samples_split = 10,
                       min_samples_leaf = 5,
                       max_features = None).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8956228956228957


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,516,33
1.0,60,282


In [65]:
df_best_result_DTC['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_DTC['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_DTC['Iteration'] = 2
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_DTC['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_DTC],
         axis = 0)

In [66]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.823774,0.040589,"{'C': 1.0, 'gamma': 0.1, 'kernel': 'rbf'}",SVC,0.707602,0.770701,2,0.813728
0,0.821914,0.03747,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,2,0.882226
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.808061,0.035054,"{'alpha': 1, 'learning_rate': 'optimal', 'loss...",SGDClassifier,0.619883,0.721088,2,0.778976
0,0.808057,0.041257,"{'C': 0.01, 'class_weight': None, 'penalty': '...",Logistic_Regression,0.678363,0.734177,2,0.786358
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.797595,0.039109,"{'C': 10, 'max_iter': 1000, 'penalty': 'l2'}",LinearSVC,0.69883,0.727549,2,0.780198
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
