In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [10]:
titanic_df = pd.read_csv('datasets/titanic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,36.0,1,0,17.4,0,0,1
1,1,2,1,1.0,2,1,39.0,0,0,1
2,0,2,1,51.0,0,0,12.525,0,0,1
3,1,2,0,23.0,0,0,13.7917,1,0,0
4,0,2,1,27.0,0,0,26.0,0,0,1


In [15]:
X = titanic_df.drop('Survived', axis=1)
Y = titanic_df['Survived']
x_train, x_test, y_train,  y_test = train_test_split(X, Y, test_size=0.2)

In [16]:
def summarize_classification(y_test,y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize = False)
    
    prec =  precision_score(y_test, y_pred)
    recall =  recall_score(y_test, y_pred)
    
    print("Test data count: ",len(y_test))
    print("Accuracy count: ", num_acc)
    print("Accuracy score: ",  acc)
    print("Precision score: ", prec)
    print("Recall score: ", recall)
    print()

In [19]:
#Import gridSearch for hyperparam tuning
#Use crossvalidation
#cv=3 means split datasets into 3 parts
#select best max_depth hyperparam
from sklearn.model_selection import GridSearchCV

parameters = {'max_depth': [2, 4, 5, 7, 9, 10]} #use max depth to tune our model
grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 2}

In [20]:
#See the total gridSearch Result
for i in range(6):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7873479996287015
Rank:  1
Parameters:  {'max_depth': 4}
Mean Test Score:  0.7820477118722732
Rank:  2
Parameters:  {'max_depth': 5}
Mean Test Score:  0.7679197994987469
Rank:  4
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7574213311055417
Rank:  5
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7679847767567066
Rank:  3
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7328320802005012
Rank:  6


In [24]:
decision_tree_model = DecisionTreeClassifier( \
    max_depth = grid_search.best_params_['max_depth']).fit(x_train,y_train)

In [25]:
y_pred =  decision_tree_model.predict(x_test)

In [26]:
summarize_classification(y_test,y_pred)

Test data count:  143
Accuracy count:  114
Accuracy score:  0.7972027972027972
Precision score:  0.8333333333333334
Recall score:  0.5660377358490566



In [29]:
#now for logistic Regression Tuning
parameters = {'penalty': ['l1','l2'],
              'C': [0.1, 0.4, 0.8, 1, 2, 5]}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 5, 'penalty': 'l1'}

In [30]:
#See the total gridSearch Result of Logistic Regression
#12=2x6
for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7452056066091153
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7539868188990996
Rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7732850645131347
Rank:  9
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.77857606980414
Rank:  3
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7767938364429593
Rank:  4
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7732943469785575
Rank:  7
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7767938364429593
Rank:  4
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7750487329434698
Rank:  6
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7838206627680312
Rank:  2
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.7732943469785575
Rank:  7
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7838299452334541
Rank:  1
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Scor

In [31]:
logistic_model = LogisticRegression(solver='liblinear', \
    penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']). \
    fit(x_train, y_train)

In [32]:
y_pred =  logistic_model.predict(x_test)

In [33]:
summarize_classification(y_test, y_pred)

Test data count:  143
Accuracy count:  117
Accuracy score:  0.8181818181818182
Precision score:  0.8
Recall score:  0.6792452830188679

