In [130]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [146]:
# Load seaborn's "titanic" example dataset to use as the learning data
titanic = sns.load_dataset(name="titanic")

# Preprocess data
- We intend to predict which passengers survived the sinking of the Titanic

In [149]:
# Number of rows and columns in the loaded dataset
titanic.shape

(891, 15)

In [None]:
# Count the missing values in every column
titanic.isna().sum()

In [154]:
# 'deck' has many missing values; dropping that column is left disabled here:
#titanic.drop('deck', axis=1, inplace=True)
# Keep only the passengers whose age, embarkation code and embarkation town are all known
titanic = titanic.dropna(subset=['age', 'embarked', 'embark_town'])

In [366]:
# Check whether the missingness is fixed after filtering
titanic.isna().sum()

In [159]:
# Explore numeric predictors using pairwise correlation.
# Only the numeric and boolean columns are passed to corr(): recent pandas versions no
# longer drop text/categorical columns silently and raise on them instead, so the
# selection is made explicit. The resulting table covers the same columns as before.
r = titanic.select_dtypes(include=['number', 'bool']).corr()
r

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.356462,-0.082446,-0.015523,0.095265,0.2661,-0.551151,-0.199741
pclass,-0.356462,1.0,-0.365902,0.065187,0.023666,-0.552893,0.094635,0.150576
age,-0.082446,-0.365902,1.0,-0.307351,-0.187896,0.093143,0.286543,0.195766
sibsp,-0.015523,0.065187,-0.307351,1.0,0.383338,0.13986,-0.313016,-0.629408
parch,0.095265,0.023666,-0.187896,0.383338,1.0,0.206624,-0.36558,-0.577109
fare,0.2661,-0.552893,0.093143,0.13986,0.206624,1.0,-0.177446,-0.262799
adult_male,-0.551151,0.094635,0.286543,-0.313016,-0.36558,-0.177446,1.0,0.400718
alone,-0.199741,0.150576,0.195766,-0.629408,-0.577109,-0.262799,0.400718,1.0


In [160]:
# Square every correlation coefficient to get r^2 for each pair of variables
r2 = r ** 2
r2

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,0.127065,0.006797,0.000241,0.009075,0.070809,0.303767,0.039897
pclass,0.127065,1.0,0.133884,0.004249,0.00056,0.305691,0.008956,0.022673
age,0.006797,0.133884,1.0,0.094465,0.035305,0.008676,0.082107,0.038324
sibsp,0.000241,0.004249,0.094465,1.0,0.146948,0.019561,0.097979,0.396155
parch,0.009075,0.00056,0.035305,0.146948,1.0,0.042693,0.133648,0.333055
fare,0.070809,0.305691,0.008676,0.019561,0.042693,1.0,0.031487,0.069063
adult_male,0.303767,0.008956,0.082107,0.097979,0.133648,0.031487,1.0,0.160575
alone,0.039897,0.022673,0.038324,0.396155,0.333055,0.069063,0.160575,1.0


In [370]:
# Heatmap of the squared correlations
plt.figure(figsize=(8, 6))
sns.heatmap(r2, cmap=sns.cm.rocket_r)
# show() must be called; a bare `plt.show` only references the function object
plt.show()

In [373]:
# Outcome column plus the numeric/boolean predictors kept for modelling
titanic_final = titanic.loc[:, ['alive', 'pclass', 'fare', 'adult_male', 'alone']]

In [371]:
# Exploring categorical variables - first identify the non-numeric columns
titanic.select_dtypes(exclude=np.number).columns

In [372]:
# Draw one count plot per non-numeric variable, split by the outcome, to look for association
for var in titanic.select_dtypes(exclude='number').columns:
    if var == 'alive':
        continue  # the outcome is used as the hue, so it is not plotted as a variable
    sns.countplot(x=var, hue='alive', data=titanic)
    plt.show()

In [374]:
# Update the final data set: outcome, numeric/boolean predictors and categorical predictors.
# NOTE(review): 'adult_male' is listed twice, so this selection carries a duplicated
# 'adult_male' column; a later cell removes the extra copy. Keep the two in sync.
titanic_final = titanic[['alive', 'pclass', 'fare', 'adult_male', 'alone', 'sex', 'embarked', 'class', 'who', 'adult_male', 'embark_town']]

In [375]:
# Create dummy (indicator) columns for the categorical predictors;
# drop_first leaves out the first level of each variable
dummies = pd.get_dummies(
    data=titanic_final[['sex', 'embarked', 'class', 'who', 'embark_town']],
    drop_first=True,
)

In [173]:
# Place the dummy columns next to the outcome and the numeric/boolean predictors
titanic_analytical = pd.concat(
    [titanic_final[['alive', 'pclass', 'fare', 'adult_male', 'alone']], dummies],
    axis='columns',
)

In [192]:
# Remove repeated columns by name, keeping the first occurrence of each.
# Cutting out a fixed column position drops a different, wanted column whenever this
# cell is run a second time or the column order changes; the name-based mask gives the
# same result on the first run and is a no-op on every later run.
titanic_analytical = titanic_analytical.loc[:, ~titanic_analytical.columns.duplicated()]

In [376]:
# Inspect the assembled modelling table (outcome, predictors and dummy columns)
titanic_analytical

# Split Data

In [83]:
from sklearn.model_selection import train_test_split

In [194]:
# Separate the predictors from the outcome column 'alive'
features = titanic_analytical.drop(columns='alive')
label = titanic_analytical['alive']

In [397]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
feat_train, feat_test, lab_train, lab_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=101,
)

# Random Forest (Ensemble using the bagging method)
- This method trains many decision trees, each on a bootstrap sample of the data; every tree makes its own prediction and the majority vote becomes the final prediction

In [398]:
from sklearn.ensemble import RandomForestClassifier

In [399]:
# Random forest with 100 trees. The seed is fixed (same value as the train/test split)
# so that the randomness used while building the trees, and therefore the reported
# metrics, is identical on every run.
rfc = RandomForestClassifier(n_estimators=100, random_state=101)

In [400]:
# Train the forest on the training split
rfc.fit(X=feat_train, y=lab_train)

RandomForestClassifier()

In [401]:
# Predict the outcome for the held-out rows
predicted = rfc.predict(X=feat_test)

In [402]:
#predicted

In [403]:
# Put the true and predicted outcomes side by side for a quick assessment
prediction_matrix = pd.DataFrame(data=dict(actual=lab_test, predicted=predicted))

In [404]:
# Cross-tabulate actual (rows) against predicted (columns)
pd.crosstab(
    index=prediction_matrix['actual'],
    columns=prediction_matrix['predicted'],
)

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,64,20
1,14,45


# Evaluate model

In [405]:
from sklearn.metrics import classification_report, confusion_matrix

In [406]:
# Precision, recall and F1 per class for the random forest
print(classification_report(y_true=lab_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.82      0.76      0.79        84
           1       0.69      0.76      0.73        59

    accuracy                           0.76       143
   macro avg       0.76      0.76      0.76       143
weighted avg       0.77      0.76      0.76       143



# Compare to other algorithms for making this prediction

# KNN

In [407]:
from sklearn.neighbors import KNeighborsClassifier

In [408]:
# k-nearest-neighbours classifier that uses 27 neighbours
# NOTE(review): the features reach this distance-based model without any scaling step — confirm that is intended
knn = KNeighborsClassifier(n_neighbors=27)

In [409]:
# Train the k-nearest-neighbours model on the training split
knn.fit(X=feat_train, y=lab_train)

KNeighborsClassifier(n_neighbors=27)

In [410]:
# k-nearest-neighbours predictions for the held-out rows
kpred = knn.predict(X=feat_test)

In [411]:
# Precision, recall and F1 per class for k-nearest neighbours
print(classification_report(y_true=lab_test, y_pred=kpred))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79        84
           1       0.72      0.64      0.68        59

    accuracy                           0.75       143
   macro avg       0.74      0.73      0.74       143
weighted avg       0.75      0.75      0.75       143



# Support Vector Classifier

In [412]:
from sklearn.svm import SVC

In [413]:
# Support vector classifier created with the library's default settings
# NOTE(review): the features reach this model without any scaling step — confirm that is intended
svc = SVC()

In [414]:
# Train the support vector classifier on the training split
svc.fit(X=feat_train, y=lab_train)

SVC()

In [415]:
# Support vector classifier predictions for the held-out rows
spred = svc.predict(X=feat_test)

In [416]:
# Precision, recall and F1 per class for the support vector classifier
print(classification_report(y_true=lab_test, y_pred=spred))

              precision    recall  f1-score   support

           0       0.69      0.96      0.81        84
           1       0.88      0.39      0.54        59

    accuracy                           0.73       143
   macro avg       0.79      0.68      0.67       143
weighted avg       0.77      0.73      0.70       143



# Decision Tree Classifier

In [417]:
from sklearn.tree import DecisionTreeClassifier

In [418]:
# Single decision tree. The seed is fixed (same value as the train/test split) so the
# fitted tree and its reported metrics are identical on every run.
dtc = DecisionTreeClassifier(random_state=101)

In [419]:
# Train the decision tree on the training split
dtc.fit(X=feat_train, y=lab_train)

DecisionTreeClassifier()

In [420]:
# Decision tree predictions for the held-out rows
dpred = dtc.predict(X=feat_test)

In [421]:
# Precision, recall and F1 per class for the decision tree
print(classification_report(y_true=lab_test, y_pred=dpred))

              precision    recall  f1-score   support

           0       0.79      0.76      0.78        84
           1       0.68      0.71      0.69        59

    accuracy                           0.74       143
   macro avg       0.73      0.74      0.73       143
weighted avg       0.74      0.74      0.74       143



In [453]:
# Rebuild the outcome from the numeric 'survived' column; the predictors are unchanged.
# Presumably done because the models that follow need numeric class labels — confirm.
features = titanic_analytical.drop(columns='alive')
label = titanic['survived']

In [454]:
# Repeat the 80/20 split with the same seed, now using the re-derived outcome
feat_train, feat_test, lab_train, lab_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=101,
)

# XGBoost (Ensemble using the Boosting Method)
- This algorithm builds trees one after another, with each new tree trying to correct the errors of the trees before it, in order to improve predictive accuracy

In [455]:
from xgboost import XGBClassifier

In [456]:
# XGBoost classifier created with the library's default hyperparameters
xgb = XGBClassifier()

In [457]:
# Train the boosted model on the training split
xgb.fit(X=feat_train, y=lab_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [458]:
# XGBoost predictions for the held-out rows
xpred = xgb.predict(X=feat_test)

In [459]:
# Precision, recall and F1 per class for XGBoost
print(classification_report(y_true=lab_test, y_pred=xpred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80        84
           1       0.71      0.75      0.73        59

    accuracy                           0.77       143
   macro avg       0.76      0.77      0.76       143
weighted avg       0.77      0.77      0.77       143



# Logistic Regression

In [443]:
from sklearn.linear_model import LogisticRegression

In [444]:
# Logistic regression allowed up to 1000 solver iterations
# (presumably raised so the solver converges on this data — confirm)
logit = LogisticRegression(max_iter=1000)

In [445]:
# Train the logistic regression on the training split
logit.fit(X=feat_train, y=lab_train)

LogisticRegression(max_iter=1000)

In [446]:
# Logistic regression predictions for the held-out rows
lpred = logit.predict(X=feat_test)

In [447]:
# Precision, recall and F1 per class for logistic regression
print(classification_report(y_true=lab_test, y_pred=lpred))

              precision    recall  f1-score   support

           0       0.81      0.79      0.80        84
           1       0.71      0.75      0.73        59

    accuracy                           0.77       143
   macro avg       0.76      0.77      0.76       143
weighted avg       0.77      0.77      0.77       143



# Ranking of the models for predicting survival from this dataset
1. XGBoost (77%)
2. Logistic Regression (77%, tied with XGBoost)
3. Random Forest (76%) 
4. K Nearest Neighbors (75%) 
5. Decision Tree Classification (74%)
6. Support Vector Machine Classification (73%)