# Hyperparameter Tuning

In [1]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the processed Titanic training data from the local datasets folder.
df = pd.read_csv('datasets/titanic_train_processed.csv')

# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,1,0,15.0,0,1,211.3375,False,False,True
1,1,1,0,24.0,3,2,263.0,False,False,True
2,1,2,1,62.0,0,0,10.5,False,False,True
3,0,2,1,28.0,0,0,13.0,False,False,True
4,0,1,1,70.0,1,1,71.0,False,False,True


In [4]:
# Separate the target column from the feature columns.
X = df.drop(columns='Survived')
Y = df['Survived']

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split repeatable, so the scores reported further down can be reproduced
# after a kernel restart instead of changing on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [3]:
def summarize_classification(y_test, y_pred):
    """Print a short metric report comparing predictions with true labels.

    Parameters
    ----------
    y_test : array-like
        True labels of the evaluated samples.
    y_pred : array-like
        Predicted labels for the same samples.

    The report lists the number of samples, the accuracy as a fraction,
    precision, recall, and the accuracy as a count of correct predictions,
    followed by a blank line. Nothing is returned.
    """
    # normalize=True gives the fraction of correct predictions,
    # normalize=False the raw number of correct predictions.
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    report_rows = (
        ('Test data count: ', len(y_test)),
        ('accuracy: ', fraction_correct),
        ('precision: ', prec),
        ('recall: ', rec),
        ('accuracy_count: ', n_correct),
    )
    for label, value in report_rows:
        print(label, value)
    print()

## Decision Tree Classifier

In [5]:
# Tree depths to compare; each candidate is scored with 3-fold
# cross-validation on the training split.
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

# A fixed random_state pins the tree's internal randomness, so for a given
# training split the search returns the same scores on every run.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 5}

In [6]:
# Report every evaluated candidate. Iterating over the result arrays directly
# (rather than a hard-coded count) keeps this cell correct when the number of
# entries in `parameters` changes.
cv_results = grid_search.cv_results_
for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print('Parameters: ', params)
    print('Mean Test Score: ', mean_score)
    print('Rank: ', rank)

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7698227049104243
Rank:  6
Parameters:  {'max_depth': 4}
Mean Test Score:  0.7944026733500418
Rank:  3
Parameters:  {'max_depth': 5}
Mean Test Score:  0.8013923698134224
Rank:  1
Parameters:  {'max_depth': 7}
Mean Test Score:  0.799647266313933
Rank:  2
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7820384294068505
Rank:  4
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7785296574770258
Rank:  5


In [8]:
# Train the final tree on the training split using the hyperparameters the
# grid search selected. Unpacking best_params_ keeps this cell valid if more
# hyperparameters are added to the grid; random_state makes the fit repeatable.
decision_tree_model = DecisionTreeClassifier(
    random_state=42, **grid_search.best_params_
).fit(x_train, y_train)

In [9]:
# Predict labels for the held-out test features with the tuned decision tree.
y_pred = decision_tree_model.predict(x_test)

In [10]:
# Print accuracy, precision and recall of the decision tree on the test split.
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy:  0.7972027972027972
precision:  0.7321428571428571
recall:  0.7454545454545455
accuracy_count:  114



## Logistic Regression Classifier

In [11]:
# Compare both penalty types across several values of C; each combination is
# scored with 3-fold cross-validation on the training split.
parameters = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.4, 0.8, 1, 2, 5],
}

# The liblinear solver shuffles the data internally, so a fixed random_state
# is needed for the search to give the same scores on every run.
grid_search = GridSearchCV(
    LogisticRegression(solver='liblinear', random_state=42),
    parameters,
    cv=3,
    return_train_score=True,
)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 5, 'penalty': 'l1'}

In [13]:
# Report every evaluated penalty/C combination. Iterating over the result
# arrays directly (rather than a hard-coded count) keeps this cell correct
# when the number of combinations in `parameters` changes.
cv_results = grid_search.cv_results_
for params, mean_score, rank in zip(cv_results['params'],
                                    cv_results['mean_test_score'],
                                    cv_results['rank_test_score']):
    print('Parameters: ', params)
    print('Mean Test Score: ', mean_score)
    print('Rank: ', rank)

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7662860855843311
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7698134224450014
Rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.789139515455305
Rank:  7
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.7821034066648102
Rank:  10
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7909124663510628
Rank:  5
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7873758470249698
Rank:  8
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7926668523159751
Rank:  4
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7873758470249698
Rank:  8
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7944119558154646
Rank:  2
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.7908939014202172
Rank:  6
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7961663417803768
Rank:  1
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Sc

In [14]:
# Train the final logistic regression on the training split using the
# hyperparameters the grid search selected. Unpacking best_params_ keeps this
# cell valid if the grid changes; random_state makes the liblinear fit repeatable.
log_reg_model = LogisticRegression(
    solver='liblinear', random_state=42, **grid_search.best_params_
).fit(x_train, y_train)

In [15]:
# Predict labels for the held-out test features with the tuned logistic regression.
y_pred = log_reg_model.predict(x_test)

In [16]:
# Print accuracy, precision and recall of the logistic regression on the test split.
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy:  0.7902097902097902
precision:  0.711864406779661
recall:  0.7636363636363637
accuracy_count:  113

