In [29]:
# Load libraries for this and next parts of the cases at once
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load the passenger table and keep only the target ('survived') followed by
# the three predictors used below. Selecting the wanted columns directly makes
# a separate drop of the unused columns unnecessary.
dataset = pd.read_excel("cata.xlsx")
dataset = dataset[['survived', 'pclass', 'sex', 'age']]

# Count missing values per column (kept in a variable so it can be inspected;
# a bare expression in the middle of a cell displays nothing), then drop every
# row that has a missing value in any of the kept columns.
missing_per_column = dataset.isna().sum()
dataset = dataset.dropna(axis=0)

# Encode the target as "no"/"yes" and sex as 0 (female) / 1 (male).
# The columns are reassigned explicitly: calling `.replace(..., inplace=True)`
# on `dataset.survived` operates on an intermediate Series and does not update
# `dataset` under pandas Copy-on-Write.
dataset = dataset.assign(
    survived=dataset['survived'].map({0: "no", 1: "yes"}),
    sex=dataset['sex'].map({"female": 0, "male": 1}),
)
dataset = dataset.astype({'survived': 'category', 'pclass': 'int', 'sex': 'int', 'age': 'float'})

# Separate the target (first column) from the predictors (remaining columns).
array = dataset.to_numpy()
y = array[:, 0]
X = array[:, 1:]

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
split_parts = train_test_split(X, y, test_size=0.20, random_state=1)
X_train, X_validation, Y_train, Y_validation = split_parts

# the goal is to check different models for prediction (linear and non-linear ones)
# and choose the most accurate one. the list of models is given below:
# 1. Logistic Regression (LR)
# 2. Linear Discriminant Analysis (LDA)
# 3. K-Nearest Neighbors (KNN)
# 4. Gaussian Naive Bayes (NB)
# 5. Support Vector Machines (SVM)
# 6. Classification and Regression Trees (CART)
# 7. Random Forest Classifier (RFC)

# (label, estimator) pairs to compare, in the order listed above.
# - LR: `multi_class='ovr'` is omitted because the parameter is deprecated in
#   recent scikit-learn releases; with the liblinear solver the fit is
#   one-vs-rest anyway, so the model is unchanged.
# - CART / RFC: seeded so that their scores are reproducible between runs.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('CART', DecisionTreeClassifier(random_state=1)),
    ('RFC', RandomForestClassifier(random_state=1)),
]

# evaluate each model in turn with 10-fold stratified cross-validation on the training set
# and print the mean accuracy with its standard deviation in parentheses (the metric is accuracy, not R2)
# in the run shown below most models score about 0.76-0.78; SVM with default settings is the clear outlier (about 0.61)
# Logistic Regression (LR) and Random Forest (RFC) have the highest mean accuracy (0.780)
# CART and RFC scores may differ slightly between runs unless a random_state is set
# the default SVM is the weakest model here, so it is the first one tuned and validated in the next part
# One seeded, shuffled 10-fold stratified splitter, built once outside the
# loop: it does not depend on the model, and the fixed random_state means
# every model is scored on exactly the same folds.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

results = []  # per-model arrays of the 10 fold accuracies
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # mean accuracy over the folds, standard deviation in parentheses
    print('%s: %.3f (%.2f)' % (name, cv_results.mean(), cv_results.std()))


LR: 0.780 (0.03)
LDA: 0.777 (0.03)
KNN: 0.762 (0.05)
NB: 0.775 (0.03)
SVM: 0.606 (0.03)
CART: 0.770 (0.04)
RFC: 0.780 (0.04)


In [30]:
# in this part I tune the hyperparameters of two models, the best one (LR) and the worst one (SVC),
# to see how much hyperparameter tuning can improve their performance

In [31]:
# Search space for the SVC (initially the worst model):
# 5 values of C x 4 kernels, with gamma fixed to 'scale' -> 20 candidates.
svc_c_values = [50, 10, 1.0, 0.1, 0.01]
svc_kernels = ['linear', 'poly', 'rbf', 'sigmoid']
param_grid = dict(C=svc_c_values, gamma=['scale'], kernel=svc_kernels)

# refit=True retrains the best candidate so `grid` can predict directly;
# verbose=3 logs the score of every individual fit.
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

In [32]:
# Run the grid search on the training split only; the validation split is
# not touched here. Every candidate in `param_grid` is cross-validated, and
# the per-fit scores appear in the log below.
grid.fit(X_train, Y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END ..C=50, gamma=scale, kernel=linear;, score=0.810 total time=   0.5s
[CV 2/5] END ..C=50, gamma=scale, kernel=linear;, score=0.808 total time=   0.4s
[CV 3/5] END ..C=50, gamma=scale, kernel=linear;, score=0.796 total time=   0.6s
[CV 4/5] END ..C=50, gamma=scale, kernel=linear;, score=0.737 total time=   0.3s
[CV 5/5] END ..C=50, gamma=scale, kernel=linear;, score=0.725 total time=   0.4s
[CV 1/5] END ....C=50, gamma=scale, kernel=poly;, score=0.798 total time=   0.0s
[CV 2/5] END ....C=50, gamma=scale, kernel=poly;, score=0.784 total time=   0.0s
[CV 3/5] END ....C=50, gamma=scale, kernel=poly;, score=0.772 total time=   0.0s
[CV 4/5] END ....C=50, gamma=scale, kernel=poly;, score=0.784 total time=   0.0s
[CV 5/5] END ....C=50, gamma=scale, kernel=poly;, score=0.754 total time=   0.0s
[CV 1/5] END .....C=50, gamma=scale, kernel=rbf;, score=0.815 total time=   0.0s
[CV 2/5] END .....C=50, gamma=scale, kernel=rbf

In [33]:
# Parameter combination with the highest mean cross-validated score
# found by the grid search.
grid.best_params_

{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}

In [34]:
# The estimator built from the best parameter combination found by the
# grid search; this is the model `grid.predict` uses.
grid.best_estimator_

In [35]:
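# print accuracy, confusion matrix & classification report on the hold-out validation set
# accuracy: 0.80 for the tuned SVC on the validation set, versus 0.61 for the default SVC (10-fold CV on the training set)
# weighted-average precision, recall and f1-score are all 0.80
# confusion matrix (rows = actual, columns = predicted; "yes" taken as the positive class):
# T1 error (FP) - 23 (11%), T2 error (FN) - 20 (9.5%)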
# Score the tuned model on the hold-out split and print three sections
# (accuracy, confusion matrix, per-class report) separated by a rule.
grid_predictions = grid.predict(X_validation)
rule = "________________________________________________________"
sections = [
    ("accuracy_score", '%.2f' % accuracy_score(Y_validation, grid_predictions)),
    ("confusion_matrix", confusion_matrix(Y_validation, grid_predictions)),
    ("classification_report", classification_report(Y_validation, grid_predictions)),
]
for position, (title, body) in enumerate(sections):
    if position:
        # the rule goes between sections, not before the first one
        print(rule)
    print(title)
    print(body)

accuracy_score 0.80
________________________________________________________
confusion_matrix
[[100  23]
 [ 20  67]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

          no       0.83      0.81      0.82       123
         yes       0.74      0.77      0.76        87

    accuracy                           0.80       210
   macro avg       0.79      0.79      0.79       210
weighted avg       0.80      0.80      0.80       210



In [52]:
# Hyperparameter tuning for the initially best model (LR).
# Search space: 3 solvers x 5 values of C, L2 penalty only -> 15 candidates.
param_grid = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
              'penalty': ['l2'],
              'C': [100, 10, 1.0, 0.1, 0.01]}

# Build the seeded 10-fold stratified splitter in this cell so that it does
# not depend on a `kfold` created in an earlier cell; the settings match the
# ones used for the model comparison.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# error_score=0 records a candidate whose fit fails as score 0 instead of
# raising, so a failing combination shows up as 0.0 in the summary.
grid = GridSearchCV(LogisticRegression(), param_grid, n_jobs=-1, cv=kfold, error_score=0)

In [53]:
# Run the LR grid search on the training split only; the validation split
# is not touched here.
grid.fit(X_train, Y_train)

In [58]:
# Summarise the grid search: the winning configuration first, then one line
# per candidate with its mean test score and standard deviation.
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
cv_table = grid.cv_results_
means, stds, params = (cv_table[key] for key in ('mean_test_score', 'std_test_score', 'params'))
for row in range(len(params)):
    print("%f (%f) with: %r" % (means[row], stds[row], params[row]))

Best: 0.779848 using {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.773867 (0.031274) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.773867 (0.031274) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.775072 (0.031375) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.773867 (0.031274) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.773867 (0.031274) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.775072 (0.031835) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.772662 (0.031588) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.772662 (0.031588) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.779848 (0.027770) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.773896 (0.028631) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.773896 (0.028631) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.775086 (0.028463) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.753571 (0.026356) wit

In [56]:
# The LogisticRegression built from the best parameter combination found by
# the grid search; this is the model `grid.predict` uses.
grid.best_estimator_

In [59]:
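# print accuracy, confusion matrix & classification report on the hold-out validation set
# the improvement of the LR model vs the initial one is negligible: the grid search selected
# C=1.0, penalty='l2', solver='liblinear', i.e. the settings the initial model already used
# (solver='liblinear' with the default C and penalty)
# 0.78 is the cross-validated accuracy on the training set, 0.80 is the accuracy on the validation set
# weighted-average precision, recall and f1-score are all 0.80
# confusion matrix (rows = actual, columns = predicted; "yes" taken as the positive class):
# T1 error (FP) - 24 (11.4%), T2 error (FN) - 18 (8.6%)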
# Score the tuned LR model on the hold-out split and emit the whole report
# (accuracy, confusion matrix, per-class report) as one newline-joined text.
grid_predictions = grid.predict(X_validation)
divider = "________________________________________________________"
report_lines = [
    "accuracy_score",
    '%.2f' % accuracy_score(Y_validation, grid_predictions),
    divider,
    "confusion_matrix",
    str(confusion_matrix(Y_validation, grid_predictions)),
    divider,
    "classification_report",
    classification_report(Y_validation, grid_predictions),
]
print("\n".join(report_lines))

accuracy_score
0.80
________________________________________________________
confusion_matrix
[[99 24]
 [18 69]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

          no       0.85      0.80      0.83       123
         yes       0.74      0.79      0.77        87

    accuracy                           0.80       210
   macro avg       0.79      0.80      0.80       210
weighted avg       0.80      0.80      0.80       210



In [None]:
# Hyperparameter tuning significantly improved the performance of the worst model (SVC),
# but the improvement for the LR model was negligible
# the explanation is that the LR model was initially fit with an already optimal parameter set (solver='liblinear', which works one-vs-rest)
# analogously, one could start the SVC from non-default settings to improve the starting model
# (the suggestion here is gamma='auto'; note that it was not tested above, where the grid search selected kernel='linear' with C=0.01)