In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [3]:
titanic_df = pd.read_csv('c:/projects/datasets/titanic_processed.csv')

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,28.0,0,0,7.8958,0,0,1
1,0,3,1,26.0,1,2,20.575,0,0,1
2,1,2,0,25.0,1,1,30.0,0,0,1
3,0,3,1,28.0,0,0,7.8958,0,0,1
4,0,3,1,29.0,1,0,7.0458,0,0,1


In [7]:
# drop the survived column to set our Survived features
X = titanic_df.drop('Survived', axis=1) # axis=1 means drop the headings of the columns
# training labels in the survived columns
Y = titanic_df['Survived']
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [8]:
Y

0      0
1      0
2      1
3      0
4      0
      ..
707    1
708    0
709    0
710    0
711    1
Name: Survived, Length: 712, dtype: int64

In [10]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,28.0,0,0,7.8958,0,0,1
1,3,1,26.0,1,2,20.5750,0,0,1
2,2,0,25.0,1,1,30.0000,0,0,1
3,3,1,28.0,0,0,7.8958,0,0,1
4,3,1,29.0,1,0,7.0458,0,0,1
...,...,...,...,...,...,...,...,...,...
707,2,0,14.0,1,0,30.0708,1,0,0
708,1,1,21.0,0,1,77.2875,0,0,1
709,1,1,70.0,1,1,71.0000,0,0,1
710,3,1,20.0,0,0,9.2250,0,0,1


In [22]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)    
    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("Test data count: ", len(y_test))
    print("accuracy_count: ", num_acc)
    print("accuracy_score: ", acc)
    print("precision_score: ", prec)
    print("recall_score: ", recall)
    print()

In [23]:
from sklearn.model_selection import GridSearchCV
    
parameters = {'max_depth': [2, 4, 5, 7, 9, 10]}

grid_search = GridSearchCV(DecisionTreeClassifier(), parameters, cv=3, return_train_score=True)
# will build 6 models with different max depth of the decision tree
# cv = 3 ... cross validation used to validate a ML model - use 3-fold cross validation to find the best model - split the dataset into 3 parts
# return_train_score - compare models using the default scoring mechanism for this estimator ie accuracy

grid_search.fit(x_train, y_train)

grid_search.best_params_

{'max_depth': 5}

In [24]:
for i in range(6):
    
    print('Parameters: ', grid_search.cv_results_['params'][i])
    
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'max_depth': 2}
Mean Test Score:  0.7838856400259909
Rank:  3
Parameters:  {'max_depth': 4}
Mean Test Score:  0.7908753364893716
Rank:  2
Parameters:  {'max_depth': 5}
Mean Test Score:  0.7908939014202172
Rank:  1
Parameters:  {'max_depth': 7}
Mean Test Score:  0.7803768680961664
Rank:  4
Parameters:  {'max_depth': 9}
Mean Test Score:  0.7715121136173767
Rank:  5
Parameters:  {'max_depth': 10}
Mean Test Score:  0.7627401837928153
Rank:  6


In [25]:
# after you find the best parameters, then build the model using them
decision_tree_model = DecisionTreeClassifier(max_depth = grid_search.best_params_['max_depth']).fit(x_train, y_train)

In [26]:
y_pred = decision_tree_model.predict(x_test)

In [27]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy_count:  114
accuracy_score:  0.7972027972027972
precision_score:  0.7407407407407407
recall_score:  0.7272727272727273



In [29]:
parameters = {'penalty': ['l1','l2'], 'C': [0.1, 0.4, 0.8, 1, 2, 5]}
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), parameters, cv=3, return_train_score=True)
grid_search.fit(x_train, y_train)

grid_search.best_params_

{'C': 5, 'penalty': 'l1'}

In [30]:
for i in range(12):
    print('Parameters: ', grid_search.cv_results_['params'][i])
    print('Mean Test Score: ', grid_search.cv_results_['mean_test_score'][i])
    print('Rank: ', grid_search.cv_results_['rank_test_score'][i])

Parameters:  {'C': 0.1, 'penalty': 'l1'}
Mean Test Score:  0.7557133574677435
Rank:  12
Parameters:  {'C': 0.1, 'penalty': 'l2'}
Mean Test Score:  0.7663232154460223
Rank:  11
Parameters:  {'C': 0.4, 'penalty': 'l1'}
Mean Test Score:  0.7820941241993874
Rank:  8
Parameters:  {'C': 0.4, 'penalty': 'l2'}
Mean Test Score:  0.780339738234475
Rank:  10
Parameters:  {'C': 0.8, 'penalty': 'l1'}
Mean Test Score:  0.7926297224542839
Rank:  5
Parameters:  {'C': 0.8, 'penalty': 'l2'}
Mean Test Score:  0.7838485101642996
Rank:  7
Parameters:  {'C': 1, 'penalty': 'l1'}
Mean Test Score:  0.7978928803490207
Rank:  3
Parameters:  {'C': 1, 'penalty': 'l2'}
Mean Test Score:  0.7820941241993874
Rank:  8
Parameters:  {'C': 2, 'penalty': 'l1'}
Mean Test Score:  0.7979021628144435
Rank:  2
Parameters:  {'C': 2, 'penalty': 'l2'}
Mean Test Score:  0.787366564559547
Rank:  6
Parameters:  {'C': 5, 'penalty': 'l1'}
Mean Test Score:  0.7979207277452892
Rank:  1
Parameters:  {'C': 5, 'penalty': 'l2'}
Mean Test Sco

In [31]:
# instantiate a LogisticRegression estimator with the best parameters that we found with HyperParameterTuning
logistic_model = LogisticRegression(solver='liblinear',penalty=grid_search.best_params_['penalty'], C=grid_search.best_params_['C']).fit(x_train, y_train)

In [32]:
y_pred = logistic_model.predict(x_test)

In [33]:
summarize_classification(y_test, y_pred)

Test data count:  143
accuracy_count:  116
accuracy_score:  0.8111888111888111
precision_score:  0.7916666666666666
recall_score:  0.6909090909090909

