## 1. Library Import

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, cross_val_predict

## 2. Load Data

In [3]:
# Iteration-3 train/test splits; the first CSV column is used as the index.
df_train = pd.read_csv('train_iter_3.csv', index_col=0)
df_test = pd.read_csv('test_iter_3.csv', index_col=0)

# Features are every column except the 'Survived' target.
Y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')
# The test file is used as the feature matrix as-is (same object, no copy).
X_test = df_test

# Best-model summary saved by the previous iteration's notebook.
df_best_result = pd.read_csv(
    '../Iteration_2_Standardization/best_result_2.csv',
    index_col=0,
)

## 3. Model

### 3.1. Logistic Regression

In [4]:
# Estimator whose hyper-parameters are tuned below.
model = LogisticRegression()

# Candidate values for each hyper-parameter.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
class_weight = [None, 'balanced']
grid = {
    'solver': solvers,
    'penalty': penalty,
    'C': c_values,
    'class_weight': class_weight,
}

# Exhaustive search scored by accuracy on 10 stratified folds repeated
# 3 times with a fixed seed; error_score = 0 is passed so a failing fit is
# scored 0 rather than aborting the search.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train, Y_train)

# Report the winner, then collect the per-candidate scores for tabulation.
print('Best: %f using %s' % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
results_dict = {'means': means, 'stds': stds, 'params': params}

Best: 0.797216 using {'C': 1.0, 'class_weight': None, 'penalty': 'l2', 'solver': 'liblinear'}


In [5]:
# Tabulate every grid-search candidate, best mean score first, with a
# fresh 0..n-1 index so row 0 is always the top-ranked configuration.
df_logistic_results = (
    pd.DataFrame(results_dict)
    .sort_values(by='means', ascending=False)
    .reset_index(drop=True)
)
df_logistic_results.head(10)

Unnamed: 0,means,stds,params
0,0.797216,0.040543,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
1,0.795352,0.036237,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
2,0.793854,0.037244,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
3,0.793854,0.037244,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
4,0.793854,0.037244,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
5,0.793854,0.037244,"{'C': 1.0, 'class_weight': None, 'penalty': 'l..."
6,0.793109,0.037031,"{'C': 100, 'class_weight': None, 'penalty': 'l..."
7,0.793104,0.037506,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
8,0.793104,0.037506,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."
9,0.793104,0.037506,"{'C': 10, 'class_weight': None, 'penalty': 'l2..."


In [6]:
# Keep the top-ranked row as a one-row frame. Selecting with a list
# (`iloc[[0]]`) preserves each column's dtype, whereas building a frame from
# the row Series and transposing it turns every column into `object`.
# `.copy()` makes the result independent of df_logistic_results before a
# column is added to it.
df_best_result_LR = df_logistic_results.iloc[[0]].copy()
df_best_result_LR['Model'] = "Logistic_Regression"
df_best_result_LR

Unnamed: 0,means,stds,params,Model
0,0.797216,0.040543,"{'C': 1.0, 'class_weight': None, 'penalty': 'l...",Logistic_Regression


In [45]:
# Rebuild the winning configuration directly from the grid search instead of
# retyping its values, so this cell cannot drift from the search result if
# the data or the parameter grid changes.
model = LogisticRegression(**grid_result.best_params_)

# Cross-validated accuracy / recall / F1 / ROC-AUC for the chosen model.
model_result = cross_validate(model, X_train, Y_train, cv = 10,
                             scoring = ('accuracy', 'recall', 'f1','roc_auc'))

# Out-of-fold predictions (same cv = 10 setting) for the confusion matrix;
# rows are the true classes, columns the predicted classes.
y_pred = cross_val_predict(model, X_train, Y_train, cv = 10)
pd.DataFrame(confusion_matrix(Y_train, y_pred))

Unnamed: 0,0,1
0,469,80
1,100,242


In [49]:
# Attach the fold-averaged cross-validation metrics and the iteration tag
# to the best-result row (columns appended in this order).
df_best_result_LR = df_best_result_LR.assign(
    Recall=model_result['test_recall'].mean(),
    F1_Score=model_result['test_f1'].mean(),
    Iteration=3,
    AUC=model_result['test_roc_auc'].mean(),
)