## 1. Library Import

In [294]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

## 2. Load Data

In [22]:
df_train = pd.read_csv('train_iter_1.csv',
                      index_col = 0)
df_test = pd.read_csv('test_iter_1.csv',
                      index_col = 0)

X_train = df_train.drop('Survived',
                       axis = 1,)
Y_train = df_train.loc[:,'Survived']
X_test = df_test

## 3. Model

### 3.1. Logistic Regression

Let's start with logistic regression.

RepeatedStratifiedFold info --> <a href="https://machinelearningmastery.com/repeated-k-fold-cross-validation-with-python/" target="_blank">Link</a>

In [39]:
#define the model
model = LogisticRegression()

#define the parameters to search
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
class_weight = [None, 'balanced']

#define grid search
grid = dict(solver = solvers,
           penalty = penalty,
           C = c_values,
           class_weight = class_weight)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Best: 0.805814 using {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}


In [101]:
df_logistic_results = pd.DataFrame(data = results_dict)
df_logistic_results.sort_values(by = 'means',
                            axis = 0,
                            inplace = True,
                            ascending = False)
df_logistic_results.reset_index(drop = True,
                               inplace = True)
df_logistic_results.head(10)

Unnamed: 0,means,stds,params
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
1,0.796467,0.040747,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
2,0.794607,0.036156,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
3,0.793088,0.040382,"{'C': 0.1, 'class_weight': None, 'penalty': 'l..."
4,0.792734,0.037378,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
5,0.79236,0.03783,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
6,0.79236,0.03783,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
7,0.79236,0.037382,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
8,0.792355,0.038623,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
9,0.792351,0.038094,"{'C': 100, 'class_weight': None, 'penalty': 'l..."


The best solver is clearly liblinear in this case, having the top3 results and 4 of the top10.

Also none class_weight shows to be better than balanced, although it might be overfitting and therefore givin a worse result on test dataset.

The best C in this particular case is 0.1.

In [126]:
df_best_result = pd.DataFrame(df_logistic_results.iloc[0,:]).T
df_best_result['Model'] = "Logistic_Regression"
df_best_result

Unnamed: 0,means,stds,params,Model
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression


In [104]:
model = LogisticRegression(C = 0.1,
                          random_state = 1,
                          solver = 'liblinear').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8069584736251403


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,486,63
1.0,109,233


In [132]:
df_best_result['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result['AUC'] = roc_auc

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266


In [None]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

### 3.2. KNeighbors Classifier

In [175]:
#define the model
model = KNeighborsClassifier()

#define the parameters to search
leaf_size = list(range(20,40))
n_neighbors = list(range(5,20))
p = [1,2]

#define grid search
grid = dict(leaf_size = leaf_size,
           n_neighbors = n_neighbors,
           p = p)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 600 candidates, totalling 18000 fits
Best: 0.749351 using {'leaf_size': 20, 'n_neighbors': 11, 'p': 1}


In [176]:
df_KNeighbors_results = pd.DataFrame(data = results_dict)
df_KNeighbors_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_KNeighbors_results.reset_index(drop = True,
                               inplace = True)
df_KNeighbors_results.head(10)

Unnamed: 0,means,stds,params
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}"
1,0.749351,0.034313,"{'leaf_size': 27, 'n_neighbors': 11, 'p': 1}"
2,0.749351,0.034313,"{'leaf_size': 28, 'n_neighbors': 11, 'p': 1}"
3,0.749351,0.034313,"{'leaf_size': 29, 'n_neighbors': 11, 'p': 1}"
4,0.749351,0.034313,"{'leaf_size': 30, 'n_neighbors': 11, 'p': 1}"
5,0.749351,0.034313,"{'leaf_size': 31, 'n_neighbors': 11, 'p': 1}"
6,0.749351,0.034313,"{'leaf_size': 32, 'n_neighbors': 11, 'p': 1}"
7,0.749351,0.034313,"{'leaf_size': 33, 'n_neighbors': 11, 'p': 1}"
8,0.749351,0.034313,"{'leaf_size': 34, 'n_neighbors': 11, 'p': 1}"
9,0.749351,0.034313,"{'leaf_size': 35, 'n_neighbors': 11, 'p': 1}"


In [177]:
df_best_result_KN = pd.DataFrame(df_KNeighbors_results.iloc[0,:]).T
df_best_result_KN['Model'] = "KNeighbors"
df_best_result_KN

Unnamed: 0,means,stds,params,Model
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors


In [178]:
model = KNeighborsClassifier(leaf_size = 20,
                            n_neighbors = 11,
                            p = 1).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.7878787878787878


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,474,75
1.0,114,228


In [181]:
df_best_result_KN['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_KN['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_KN['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_KN['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_KN],
         axis = 0)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027


In [None]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

### 3.3. Support Vector Machines

In [158]:
#define the model
model = SVC()

#define the parameters to search
kernel = ['linear']#,'poly','rbf','sigmoid']
c_values = [100]#, 10, 1.0, 0.1, 0.01]
gamma = ['scale']#, 1, 0.1, 0.01, 0.001]

#define grid search
grid = dict(kernel = kernel,
           C = c_values,
           gamma = gamma)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 1 candidates, totalling 30 fits


KeyboardInterrupt: 

It seems that SVM algorithms are very sensitive to non scaled data. As it will be my second iteration, I leave this algorithm for then.

### 3.4. Naive Bayes Classifiers

In [182]:
#define the model
model = GaussianNB()

#define the parameters to search
var_smoothing = np.logspace(0, -9, num = 50) 

#define grid search
grid = dict(var_smoothing = var_smoothing)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.778136 using {'var_smoothing': 4.71486636345739e-06}


In [183]:
df_GaussianNB_results = pd.DataFrame(data = results_dict)
df_GaussianNB_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_GaussianNB_results.reset_index(drop = True,
                               inplace = True)
df_GaussianNB_results.head(10)

Unnamed: 0,means,stds,params
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06}
1,0.777378,0.036012,{'var_smoothing': 3.0888435964774785e-06}
2,0.776638,0.034464,{'var_smoothing': 7.196856730011514e-06}
3,0.775884,0.036157,{'var_smoothing': 2.0235896477251557e-06}
4,0.774761,0.036217,{'var_smoothing': 1.325711365590108e-06}
5,0.774024,0.038518,{'var_smoothing': 1.0985411419875572e-05}
6,0.773263,0.036627,{'var_smoothing': 8.685113737513521e-07}
7,0.773263,0.040656,{'var_smoothing': 2.9470517025518096e-08}
8,0.773263,0.039287,{'var_smoothing': 2.4420530945486497e-07}
9,0.773263,0.039287,{'var_smoothing': 1.5998587196060574e-07}


In [184]:
df_best_result_GNB = pd.DataFrame(df_GaussianNB_results.iloc[0,:]).T
df_best_result_GNB['Model'] = "GaussianNB"
df_best_result_GNB

Unnamed: 0,means,stds,params,Model
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB


In [185]:
model = GaussianNB(var_smoothing = 4.71486636345739e-06).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.7856341189674523


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,465,84
1.0,107,235


In [186]:
df_best_result_GNB['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_GNB['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_GNB['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_GNB['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_GNB],
         axis = 0)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065


In [189]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027


### 3.5. Perceptron

In [195]:
#define the model
model = Perceptron()

#define the parameters to search
eta0 = [0.0001,0.001,0.01,0.1,1.0]
max_iter = [1,10,100,1000,10000]
early_stopping = [True,False]
penalty = ['l2','l1',None]

#define grid search
grid = dict(eta0 = eta0,
           max_iter = max_iter,
           early_stopping = early_stopping,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 150 candidates, totalling 4500 fits
Best: 0.670121 using {'early_stopping': False, 'eta0': 0.1, 'max_iter': 100, 'penalty': 'l2'}


In [196]:
df_Perceptron_results = pd.DataFrame(data = results_dict)
df_Perceptron_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_Perceptron_results.reset_index(drop = True,
                               inplace = True)
df_Perceptron_results.head(10)

Unnamed: 0,means,stds,params
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
1,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
2,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it..."
3,0.666983,0.08906,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
4,0.666983,0.08906,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
5,0.666983,0.08906,"{'early_stopping': False, 'eta0': 0.0001, 'max..."
6,0.660712,0.091404,"{'early_stopping': True, 'eta0': 0.0001, 'max_..."
7,0.660712,0.091404,"{'early_stopping': True, 'eta0': 0.0001, 'max_..."
8,0.660712,0.091404,"{'early_stopping': True, 'eta0': 0.0001, 'max_..."
9,0.660712,0.091404,"{'early_stopping': True, 'eta0': 0.0001, 'max_..."


In [197]:
df_best_result_Per = pd.DataFrame(df_Perceptron_results.iloc[0,:]).T
df_best_result_Per['Model'] = "Perceptron"
df_best_result_Per

Unnamed: 0,means,stds,params,Model
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron


In [198]:
model = Perceptron(eta0 = 0.1,
           max_iter = 100,
           early_stopping = False,
           penalty = 'l2').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.6958473625140292


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,322,227
1.0,44,298


In [199]:
df_best_result_Per['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_Per['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_Per['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_Per['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_Per],
         axis = 0)

In [200]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


### 3.6. Linear SVC

In [204]:
#define the model
model = LinearSVC()

#define the parameters to search
c_values = [100, 10, 1.0, 0.1, 0.01]
penalty = ['l2','l1']
max_iter = [1,10,100,1000,10000]

#define grid search
grid = dict(max_iter = max_iter,
           C = c_values,
           penalty = penalty)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 50 candidates, totalling 1500 fits
Best: 0.796101 using {'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}




In [205]:
df_LinearSVC_results = pd.DataFrame(data = results_dict)
df_LinearSVC_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_LinearSVC_results.reset_index(drop = True,
                               inplace = True)
df_LinearSVC_results.head(10)

Unnamed: 0,means,stds,params
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}"
1,0.794594,0.041724,"{'C': 0.01, 'max_iter': 100, 'penalty': 'l2'}"
2,0.794224,0.045296,"{'C': 0.01, 'max_iter': 10000, 'penalty': 'l2'}"
3,0.793845,0.046439,"{'C': 0.01, 'max_iter': 1000, 'penalty': 'l2'}"
4,0.788614,0.03854,"{'C': 1.0, 'max_iter': 10000, 'penalty': 'l2'}"
5,0.785614,0.040423,"{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2'}"
6,0.733571,0.091458,"{'C': 10, 'max_iter': 10000, 'penalty': 'l2'}"
7,0.719388,0.092881,"{'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}"
8,0.717541,0.107712,"{'C': 1.0, 'max_iter': 1000, 'penalty': 'l2'}"
9,0.69737,0.08181,"{'C': 100, 'max_iter': 10000, 'penalty': 'l2'}"


In [206]:
df_best_result_LSVC = pd.DataFrame(df_LinearSVC_results.iloc[0,:]).T
df_best_result_LSVC['Model'] = "LinearSVC"
df_best_result_LSVC

Unnamed: 0,means,stds,params,Model
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC


In [211]:
model = LinearSVC(max_iter = 10000,
           C = 0.1,
           penalty = 'l2').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8002244668911336




Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,478,71
1.0,107,235


In [212]:
df_best_result_LSVC['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_LSVC['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_LSVC['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_LSVC['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_LSVC],
         axis = 0)

In [213]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


### 3.7. Stochastic Gradient Descent

In [221]:
#define the model
model = SGDClassifier()

#define the parameters to search
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
penalty = ['l2','l1']
alpha = [0.0001,0.001,0.01,0.1,1]
learning_rate = ['optimal']


#define grid search
grid = dict(loss = loss,
           alpha = alpha,
           penalty = penalty,
           learning_rate = learning_rate)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 90 candidates, totalling 2700 fits
Best: 0.778506 using {'alpha': 0.01, 'learning_rate': 'optimal', 'loss': 'log', 'penalty': 'l1'}


In [222]:
df_SGDClassifier_results = pd.DataFrame(data = results_dict)
df_SGDClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_SGDClassifier_results.reset_index(drop = True,
                               inplace = True)
df_SGDClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
1,0.775152,0.040889,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
2,0.770254,0.062395,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
3,0.76767,0.057179,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
4,0.765406,0.056448,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
5,0.76238,0.073533,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."
6,0.761652,0.062654,"{'alpha': 0.0001, 'learning_rate': 'optimal', ..."
7,0.761294,0.07504,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l..."
8,0.757936,0.077658,"{'alpha': 0.0001, 'learning_rate': 'optimal', ..."
9,0.75717,0.070347,"{'alpha': 0.001, 'learning_rate': 'optimal', '..."


In [224]:
df_best_result_SGDC = pd.DataFrame(df_SGDClassifier_results.iloc[0,:]).T
df_best_result_SGDC['Model'] = "SGDClassifier"
df_best_result_SGDC

Unnamed: 0,means,stds,params,Model
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier


In [226]:
model = SGDClassifier(loss = 'log',
                      alpha = 0.01,
                      penalty = 'l1',
                      learning_rate = 'optimal').fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.7171717171717171


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,368,181
1.0,71,271


In [227]:
df_best_result_SGDC['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_SGDC['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_SGDC['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_SGDC['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_SGDC],
         axis = 0)

In [228]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


### 3.8. Decision Tree

In [237]:
#define the model
model = DecisionTreeClassifier()

#define the parameters to search
criterion = ['gini', 'entropy']
splitter = ['best','random']
#max_depth = list(range(1,100,10))
min_samples_split = list(range(2,40,2))
min_samples_leaf = list(range(1,20,2))
max_features = ['auto','sqrt','log2', None]

#define grid search
grid = dict(criterion = criterion,
           splitter = splitter,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 3040 candidates, totalling 91200 fits
Best: 0.819309 using {'criterion': 'gini', 'max_features': None, 'min_samples_leaf': 5, 'min_samples_split': 10, 'splitter': 'best'}


In [238]:
df_DecisionTreeClassifier_results = pd.DataFrame(data = results_dict)
df_DecisionTreeClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_DecisionTreeClassifier_results.reset_index(drop = True,
                               inplace = True)
df_DecisionTreeClassifier_results.head(10)

Unnamed: 0,means,stds,params
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm..."
1,0.817815,0.040499,"{'criterion': 'gini', 'max_features': None, 'm..."
2,0.817437,0.042269,"{'criterion': 'gini', 'max_features': None, 'm..."
3,0.8163,0.040689,"{'criterion': 'gini', 'max_features': None, 'm..."
4,0.815943,0.040187,"{'criterion': 'gini', 'max_features': None, 'm..."
5,0.815918,0.034114,"{'criterion': 'entropy', 'max_features': None,..."
6,0.815556,0.040671,"{'criterion': 'gini', 'max_features': None, 'm..."
7,0.815556,0.038887,"{'criterion': 'gini', 'max_features': None, 'm..."
8,0.815543,0.02802,"{'criterion': 'entropy', 'max_features': None,..."
9,0.815531,0.030792,"{'criterion': 'entropy', 'max_features': None,..."


In [240]:
df_best_result_DTC = pd.DataFrame(df_DecisionTreeClassifier_results.iloc[0,:]).T
df_best_result_DTC['Model'] = "DecisionTreeClassifier"
df_best_result_DTC

Unnamed: 0,means,stds,params,Model
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier


In [242]:
model = DecisionTreeClassifier(criterion = 'gini',
                       splitter = 'best',
                       min_samples_split = 10,
                       min_samples_leaf = 5,
                       max_features = None).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.8956228956228957


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,516,33
1.0,60,282


In [243]:
df_best_result_DTC['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_DTC['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_DTC['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_DTC['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_DTC],
         axis = 0)

In [244]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


### 3.9. Random Forest

In [261]:
#define the model
model = RandomForestClassifier()

#define the parameters to search
n_estimators = [1000]
criterion = ['gini','entropy']
max_depth = [10]
#max_depth.append(None)
min_samples_split = [2]
min_samples_leaf = [1]
max_features = [None]
bootstrap = [True]

#define grid search
grid = dict(criterion = criterion,
            n_estimators = n_estimators,
           max_depth = max_depth,
           min_samples_split = min_samples_split,
           min_samples_leaf = min_samples_leaf,
           max_features = max_features,
           bootstrap = bootstrap)
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 4 candidates, totalling 120 fits
Best: 0.832002 using {'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}


In [263]:
df_RandomForestClassifier_results = pd.DataFrame(data = results_dict)
df_RandomForestClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_RandomForestClassifier_results.reset_index(drop = True,
                               inplace = True)
with pd.option_context('display.max_colwidth', -1):     
    display(df_RandomForestClassifier_results.head(10))

Unnamed: 0,means,stds,params
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}"
1,0.831998,0.035729,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}"
2,0.81484,0.030974,"{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}"
3,0.804715,0.034327,"{'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}"


In [264]:
df_best_result_RF = pd.DataFrame(df_RandomForestClassifier_results.iloc[0,:]).T
df_best_result_RF['Model'] = "RandomForestClassifier"
df_best_result_RF

Unnamed: 0,means,stds,params,Model
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier


In [267]:
model = RandomForestClassifier(criterion = 'gini',
                               n_estimators = 1000,
                               max_depth = 10,
                               min_samples_split = 2,
                               min_samples_leaf = 1,
                               max_features = None,
                               bootstrap = True).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])

0.9562289562289562


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,546,3
1.0,36,306


In [268]:
df_best_result_RF['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_RF['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_RF['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_RF['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_RF],
         axis = 0)

In [269]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


### 3.10. XGB Classifier

In [287]:
#define the model
model = XGBClassifier()

#define the parameters to search
n_estimators = [2000]
learning_rate = [0.01]
max_depth = [9]
min_child_weight = [3,5]
gamma = [1]
subsample = [1.0]
colsample_bytree = [1.0]

#define grid search
grid = dict(n_estimators = n_estimators,
           learning_rate = learning_rate,
           max_depth = max_depth,
           min_child_weight = min_child_weight,
           gamma = gamma,
           subsample = subsample,
           colsample_bytree = colsample_bytree,
           use_label_encoder=[False])
cv = RepeatedStratifiedKFold(n_splits = 10, 
                             n_repeats = 3, 
                             random_state = 1)
grid_search = GridSearchCV(estimator = model,
                          param_grid = grid,
                          n_jobs = -1,
                          cv = cv,
                          scoring = 'accuracy',
                          error_score = 0,
                          verbose = 2)
grid_result = grid_search.fit(X_train, Y_train)

#summarize results
print('Best: %f using %s' % (grid_result.best_score_,
                            grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
results_dict = dict(means = means,
                  stds = stds,
                  params = params)

Fitting 30 folds for each of 6 candidates, totalling 180 fits
Best: 0.840612 using {'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}


In [286]:
df_XGBClassifier_results = pd.DataFrame(data = results_dict)
df_XGBClassifier_results.sort_values(by = ['means','stds'],
                            axis = 0,
                            inplace = True,
                            ascending = [False,True])
df_XGBClassifier_results.reset_index(drop = True,
                               inplace = True)
with pd.option_context('display.max_colwidth', -1):     
    display(df_XGBClassifier_results.head(10))

Unnamed: 0,means,stds,params
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"
1,0.840241,0.035618,"{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 0.8, 'use_label_encoder': False}"
2,0.839501,0.03359,"{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 2000, 'subsample': 0.8, 'use_label_encoder': False}"
3,0.839484,0.033521,"{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 0.6, 'use_label_encoder': False}"
4,0.837998,0.031227,"{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 5, 'n_estimators': 2000, 'subsample': 0.6, 'use_label_encoder': False}"
5,0.835755,0.036506,"{'colsample_bytree': 0.6, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 2000, 'subsample': 1.0, 'use_label_encoder': False}"


In [288]:
df_best_result_XGB = pd.DataFrame(df_XGBClassifier_results.iloc[0,:]).T
df_best_result_XGB['Model'] = "XGBClassifier"
df_best_result_XGB

Unnamed: 0,means,stds,params,Model
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier


In [289]:
model = XGBClassifier(n_estimators = 2000,
                       learning_rate = 0.01,
                       max_depth = 9,
                       min_child_weight = 5,
                       gamma = 1.0,
                       subsample = 1.0,
                       colsample_bytree = 0.6,
                       use_label_encoder=[False]).fit(X_train, Y_train)
print(model.score(X_train, Y_train))
pd.crosstab(Y_train, 
            model.predict(X_train), 
            rownames = ['Real data'], 
            colnames = ['Predicted'])



0.8933782267115601


Predicted,0.0,1.0
Real data,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,518,31
1.0,64,278


In [290]:
df_best_result_XGB['Recall'] = recall_score(Y_train, model.predict(X_train))
df_best_result_XGB['F1_Score'] = f1_score(Y_train, model.predict(X_train))
df_best_result_XGB['Iteration'] = 1
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_train, model.predict(X_train))
roc_auc = auc(false_positive_rate, true_positive_rate)
df_best_result_XGB['AUC'] = roc_auc
df_best_result = pd.concat([df_best_result,df_best_result_XGB],
         axis = 0)

In [291]:
df_best_result.sort_values('means',
                          axis = 0,
                          ascending = False)

Unnamed: 0,means,stds,params,Model,Recall,F1_Score,Iteration,AUC
0,0.840612,0.034154,"{'colsample_bytree': 0.6, 'gamma': 1, 'learnin...",XGBClassifier,0.812865,0.854071,1,0.8782
0,0.832002,0.038314,"{'bootstrap': True, 'criterion': 'gini', 'max_...",RandomForestClassifier,0.894737,0.940092,1,0.944636
0,0.819309,0.040695,"{'criterion': 'gini', 'max_features': None, 'm...",DecisionTreeClassifier,0.824561,0.858447,1,0.882226
0,0.805814,0.043604,"{'C': 0.1, 'class_weight': None, 'penalty': 'l...",Logistic_Regression,0.681287,0.730408,1,0.783266
0,0.796101,0.040125,"{'C': 0.1, 'max_iter': 10000, 'penalty': 'l2'}",LinearSVC,0.687135,0.725309,1,0.778904
0,0.778506,0.051957,"{'alpha': 0.01, 'learning_rate': 'optimal', 'l...",SGDClassifier,0.792398,0.68262,1,0.731354
0,0.778136,0.033252,{'var_smoothing': 4.71486636345739e-06},GaussianNB,0.687135,0.711044,1,0.767065
0,0.749351,0.034313,"{'leaf_size': 26, 'n_neighbors': 11, 'p': 1}",KNeighbors,0.666667,0.706977,1,0.765027
0,0.670121,0.10997,"{'early_stopping': False, 'eta0': 0.1, 'max_it...",Perceptron,0.871345,0.687428,1,0.728933


In [292]:
df_best_result.to_csv('best_result_1.csv')