In [23]:
# Load libraries for this and next parts of the cases at once
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load the passenger data and keep only the target ('survived') and the three
# predictors used below. Selecting the columns directly makes a separate drop() unnecessary.
dataset = pd.read_excel("cata.xlsx")
dataset = dataset[['survived', 'pclass', 'sex', 'age']]

# Show the number of missing values per column, then delete incomplete rows.
# (A bare expression in the middle of a cell is not displayed, hence the print.)
print(dataset.isna().sum())
dataset = dataset.dropna(axis=0)

# Encode the target as "no"/"yes" labels and sex as 0/1.
# The columns are reassigned instead of calling `dataset.col.replace(..., inplace=True)`:
# that form is chained assignment, which recent pandas versions warn about and which
# no longer updates the DataFrame once Copy-on-Write is enabled.
# Note: values outside a mapping become NaN, so an unexpected 'sex' value makes the
# int cast below fail loudly instead of slipping through.
dataset = dataset.assign(
    survived=dataset['survived'].map({0: "no", 1: "yes"}),
    sex=dataset['sex'].map({"female": 0, "male": 1}),
)
dataset = dataset.astype({'survived': 'category', 'pclass': 'int', 'sex': 'int', 'age': 'float'})

# Separate the predictors (pclass, sex, age) from the target (survived).
# `dataset.values` mixes a categorical label column with numeric columns, so the
# feature slice is cast to float explicitly instead of being left as generic objects.
array = dataset.values
X = array[:, 1:].astype(float)
y = array[:, 0]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_validation, Y_train, Y_validation = train_test_split(X, y, test_size=0.20, random_state=1)

In [24]:
# The goal is to compare several classifiers (linear and non-linear) and pick the
# most accurate one. Candidates, as (short name, estimator) pairs:
# 1. Logistic Regression (LR)
# 2. Linear Discriminant Analysis (LDA)
# 3. K-Nearest Neighbors (KNN)
# 4. Gaussian Naive Bayes (NB)
# 5. Support Vector Machines (SVM)
# 6. Classification and Regression Trees (CART)
# 7. Random Forest Classifier (RFC)
#
# Notes:
# - `multi_class='ovr'` is no longer passed to LogisticRegression: the argument is
#   deprecated in recent scikit-learn releases and makes no difference for a binary
#   target with the liblinear solver.
# - CART and RFC are randomised estimators, so they get a fixed random_state
#   (the same seed value, 1, that the notebook uses elsewhere) to make the
#   comparison repeatable.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('CART', DecisionTreeClassifier(random_state=1)),
    ('RFC', RandomForestClassifier(random_state=1)),
]

In [25]:
# Score every candidate with stratified 10-fold cross-validation on the training
# split and print "<name>: <mean accuracy> (<std of accuracy>)" for each one.
# The metric is classification accuracy (scoring='accuracy'), not R2.
# Scores of estimators that have no fixed random_state may vary slightly between runs.
# In the recorded run Logistic Regression had the highest mean accuracy
# - LR: 0.780 (0.03) - so LR is the model used for the validation step.
#
# The splitter is created once: with shuffle=True and an integer seed it produces
# the same folds for every model, so all candidates are compared on identical splits.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
results, names = [], []
for name, model in models:
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    names.append(name)
    results.append(cv_results)
    mean_accuracy, accuracy_std = cv_results.mean(), cv_results.std()
    print('%s: %.3f (%.2f)' % (name, mean_accuracy, accuracy_std))

LR: 0.780 (0.03)
LDA: 0.777 (0.03)
KNN: 0.762 (0.05)
NB: 0.775 (0.03)
SVM: 0.606 (0.03)
CART: 0.770 (0.04)
RFC: 0.778 (0.04)


In [30]:
# Fit Logistic Regression on the training split and predict the held-out
# validation (test) rows.
# `multi_class='ovr'` is not passed any more: the argument is deprecated in recent
# scikit-learn releases and has no effect for a binary target with liblinear.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)

In [36]:
# Evaluate the validation predictions.
# Reading of the recorded output (rows = actual class, columns = predicted class,
# class order "no", "yes"; "yes" = survived is taken as the positive class):
# - accuracy 0.80 = (TP + TN) / total = (69 + 99) / 210
# - weighted f1-score (combining precision and recall) 0.80
# - Type I errors (FP: predicted "yes", actually "no"):  24 (11.4%)
# - Type II errors (FN: predicted "no", actually "yes"): 18 (8.6%)
divider = "________________________________________________________"
print(
    "accuracy_score",
    '%.2f' % accuracy_score(Y_validation, predictions),
    divider,
    "confusion_matrix",
    confusion_matrix(Y_validation, predictions),
    divider,
    "classification_report",
    classification_report(Y_validation, predictions),
    sep="\n",
)

accuracy_score
0.80
________________________________________________________
confusion_matrix
[[99 24]
 [18 69]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

          no       0.85      0.80      0.83       123
         yes       0.74      0.79      0.77        87

    accuracy                           0.80       210
   macro avg       0.79      0.80      0.80       210
weighted avg       0.80      0.80      0.80       210



In [37]:
# Refit the LR model on the whole dataset and predict those same rows.
# NOTE: this is in-sample evaluation (the model is scored on the data it was fitted
# on), so these figures describe fit quality rather than performance on unseen data.
# The refit also replaces the `model` and `predictions` from the validation step.
predictions = model.fit(X, y).predict(X)

# Reading of the recorded output (rows = actual class, columns = predicted class,
# class order "no", "yes"; "yes" = survived is taken as the positive class):
# - accuracy 0.79 = (TP + TN) / total = (293 + 529) / 1046
# - weighted f1-score (combining precision and recall) 0.78
# - Type I errors (FP: predicted "yes", actually "no"):  90 (8.6%)
# - Type II errors (FN: predicted "no", actually "yes"): 134 (12.8%)
# Compared with the validation split, the FN share grew (8.6% -> 12.8%) while the
# FP share shrank (11.4% -> 8.6%).
divider = "________________________________________________________"
print(
    "accuracy_score",
    '%.2f' % accuracy_score(y, predictions),
    divider,
    "confusion_matrix",
    confusion_matrix(y, predictions),
    divider,
    "classification_report",
    classification_report(y, predictions),
    sep="\n",
)

accuracy_score
0.79
________________________________________________________
confusion_matrix
[[529  90]
 [134 293]]
________________________________________________________
classification_report
              precision    recall  f1-score   support

          no       0.80      0.85      0.83       619
         yes       0.77      0.69      0.72       427

    accuracy                           0.79      1046
   macro avg       0.78      0.77      0.77      1046
weighted avg       0.78      0.79      0.78      1046



In [29]:
# In the next part I'll take two models - the best one (LR) and the worst one (SVM, i.e. SVC) -
# and try to improve their performance with hyperparameter tuning.