In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error as mae, confusion_matrix as cm, classification_report, f1_score

import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Dropout

In [3]:
# Read the three competition CSVs in one pass; the unpacking order matches the path order.
train, test, surv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
        '/kaggle/input/titanic/gender_submission.csv',
    )
)

In [4]:
# Preview the first three rows of the training frame.
train.head(3)

Name, Ticket, and Fare likely don't contribute to survival predictions, and would be difficult to set up numerically for the model. The only reason fare would matter is to compare the relative class of people embarking, but the Pclass field is a much better metric for this purpose.

The Embarked feature would be interesting to keep, to see whether passengers who boarded at certain ports had better survival rates. There are only three possibilities for this field, so I can one-hot encode it into three new columns. Lastly, the cabin would also likely contribute to the survival rates, were there an easy way to make the data numerical.

I'll start by ignoring cabin for now, and making all the adjustments mentioned above.

In [5]:
# Columns carried over unchanged from the raw training frame (target included).
features = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']
data = train.loc[:, features]

# One indicator column per embarkation code; rows with any other value get 0 in all three.
for port_code, port_name in (('C', 'Cherbourg'), ('Q', 'Queenstown'), ('S', 'Southampton')):
    data[port_name] = train.Embarked.eq(port_code).astype(int)

# Numeric sex flag: 'male' -> 0, anything else -> 1.
data['Sex'] = data['Sex'].ne('male').astype(int)

data.info()

There are still some null entries in the Age column. Since the number of affected rows is relatively small, in the long term I will try three approaches and compare the results: first, imputation of the missing values; second, dropping the rows with null values; and lastly, a regressor to predict age based on the available features.

In [6]:
# Approach 1: fill missing values with a default-configured SimpleImputer and
# rebuild a labelled frame from the resulting array.
imputer = SimpleImputer()
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

# Approach 2: discard every row that contains a missing value.
data_dropped = data.dropna()

I'll take a closer look at the cabin feature to see if it is usable in any capacity:

In [7]:
# How many rows have / lack a Cabin value, followed by the frequency of each cabin string.
cabin_is_missing = train.Cabin.isna()
print(cabin_is_missing.value_counts(), '\n ~~~~~~~~~')
train.Cabin.value_counts()

I'd first like to check if there is any correlation between the deck of the passenger and the Pclass feature. To do this, I will extract the deck letter from the first Cabin entry in each row, and use it to compare against the passenger class.

In [8]:
# The deck is the leading character of the first cabin listed for a passenger.
# Missing cabins turn into the string 'nan' after the cast, so their "deck" is 'n'.
cabin_text = train.Cabin.astype(str)
deck = cabin_text.map(lambda cabin: cabin.split(' ')[0][0])

# Mask of rows with a real cabin. Passengers whose cabins span several decks are
# represented only by the first one listed.
non_na_cabins = deck.ne('n')
deck.value_counts()

In [9]:
# Restrict both series to passengers with a known cabin before encoding.
known_cabin_class = train.Pclass.loc[non_na_cabins]
known_cabin_deck = deck.loc[non_na_cabins]

# One 0/1 indicator column per passenger class, then one per deck letter.
indicator_columns = {
    f'Class {class_id}': known_cabin_class.eq(class_id).astype(int)
    for class_id in [1, 2, 3]
}
indicator_columns.update({
    f'Deck {letter}': known_cabin_deck.eq(letter).astype(int)
    for letter in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
})
corr_data = pd.DataFrame(indicator_columns)

# Pairwise correlation of the indicators, drawn as a heatmap with tilted x labels.
corr = corr_data.corr()
ax = sns.heatmap(corr)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45);

It looks like there is some correlation, though it is not a clean one-to-one mapping. First class passengers were more often situated around decks A, B, and C. Second class were on decks D, E, and F. Lastly, third class were on decks E, F, and G. Deck T was likely a technical crew deck due to the very low passenger count and the large jump in alphabetical order (a quick Google search confirms that the T deck was used for engine and boiler crew). There is definitely some dependence between the deck and the passenger class; however, I am unsure whether removing one or the other would be beneficial.

Due to the large amount of missing cabin information, I will elect to drop the Cabin feature entirely: nearly 80% of the rows are missing a Cabin value. Combined with the relative correlation between Deck and Pclass, I feel this is a safe course of action. Since the Cabin feature was not pulled into the data frame built earlier, no extra work is necessary at this point.

In [10]:
# Preview the first rows of the imputed frame.
data_imputed.head()

At this point, the imputed data contains all numerical values, no missing or NaN. The last step is to scale the data for use in the training model.

In [11]:
# Per-column flag for "contains a negative value"; all False is the expected outcome.
print(data_imputed.lt(0).any())

In [12]:
# Per-column flag for "contains a negative value"; all False is the expected outcome.
print(data_dropped.lt(0).any())

Using two separate scalers for each data approach:

In [13]:
# Each data variant gets its own scaler; the scaled arrays are wrapped back into
# frames that keep the source column names.
# NOTE(review): both scalers are fitted on the full frame (target column included)
# before any train/test split is made — confirm that is acceptable for this comparison.
imputed_scaler = MinMaxScaler()
data_imp_scl = pd.DataFrame(
    imputed_scaler.fit_transform(data_imputed),
    columns=data_imputed.columns,
)

dropped_scaler = MinMaxScaler()
data_drp_scl = pd.DataFrame(
    dropped_scaler.fit_transform(data_dropped),
    columns=data_dropped.columns,
)


In [14]:
# Predictor columns fed to the models; 'Survived' is the target and is kept separate.
# NOTE(review): the Cherbourg/Queenstown/Southampton indicator columns exist in the
# scaled frames but are not part of this list — confirm that is intentional.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch']

# Features / target for the imputed variant and for the dropped-rows variant.
X_imp = data_imp_scl[features]
y_imp = data_imp_scl['Survived']
X_dr = data_drp_scl[features]
y_dr = data_drp_scl['Survived']

# Fix the shuffle seed so every run evaluates the models on the same held-out rows;
# without it the comparisons further down change from run to run. 31415 is the seed
# value already used elsewhere in this notebook.
X_tr_imp, X_te_imp, y_tr_imp, y_te_imp = train_test_split(X_imp, y_imp, random_state=31415)
X_tr_drp, X_te_drp, y_tr_drp, y_te_drp = train_test_split(X_dr, y_dr, random_state=31415)



In [15]:
# Show the shape of the imputed training features, then of the matching labels.
for split_part in (X_tr_imp, y_tr_imp):
    print(split_part.shape)

In [16]:
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier


def eval_model(model, X_tr, X_te, y_tr, y_te):
    """Fit ``model`` on the training split and print its classification report.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``.
    X_tr, y_tr : training features and labels handed to ``fit``.
    X_te, y_te : held-out features and the true labels they are scored against.

    Returns
    -------
    None. The report text is written to stdout.
    """
    model.fit(X_tr, y_tr)
    # Predictions are cast to int before being compared with the true labels.
    predictions = model.predict(X_te)
    report = classification_report(y_te, predictions.astype(int), zero_division=0)
    print(report)

# Candidate classifiers, keyed by the label printed above each report.
# NOTE(review): CategoricalNB is documented for categorically encoded features,
# while the inputs here were MinMax-scaled — confirm it belongs in this comparison.
models = {
    'KNN': KNeighborsClassifier(),
    'CategoricalNB': CategoricalNB(),
    'Radius Neighbors': RadiusNeighborsClassifier(radius=2),
    'SGD Classifier': SGDClassifier(),
    'Support Vector Machine': SVC(),
    'Gaussian Classifier': GaussianProcessClassifier(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
}

# Evaluate every candidate on the imputed split.
for model_name, estimator in models.items():
    print(model_name)
    # Re-seed before each fit so the runs are comparable and repeatable.
    np.random.seed(31415)
    eval_model(estimator, X_tr_imp, X_te_imp, y_tr_imp, y_te_imp)



The Decision Tree, KNN, and SVC algorithms performed in the top bracket based on F1 score. I will pick these three to move forward. Next I want to compare the performance of imputation versus dropping, as well as the performance of StandardScaler vs MinMaxScaler.

In [17]:
# Shortlisted classifiers, re-evaluated on the split built from the dropped-rows data.
models = {
    'KNN': KNeighborsClassifier(),
    'Tree': DecisionTreeClassifier(),
    'SVC': SVC(),
}

print('Dropped Values')
for model_name, estimator in models.items():
    print(model_name)
    # Same seed as the imputed-data comparison.
    np.random.seed(31415)
    eval_model(estimator, X_tr_drp, X_te_drp, y_tr_drp, y_te_drp)



When compared to the above results, the dropped values perform strongly ahead of the imputed values, particularly for the Decision Tree and KNN algorithms. Recall is still relatively low, at around 70% for both algorithms, but they both have a high precision ~90%. I'll set my focus on these two algorithms, starting with the KNN algorithm.

In [18]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized hyper-parameter search for KNN on the dropped-rows training split.
knn_model = KNeighborsClassifier()

# Candidate values the search samples from.
params = dict(
    n_neighbors=np.arange(5, 101, 5),
    weights=['uniform', 'distance'],
    algorithm=['kd_tree', 'ball_tree', 'brute'],
    leaf_size=np.arange(10, 151, 5),
    p=[1, 2, 3],
)

clf = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=params,
    n_iter=1000,
    cv=2,
    random_state=31415,
)
model = clf.fit(X_tr_drp, y_tr_drp)

# Report performance on the held-out rows.
y_pr_drp = model.predict(X_te_drp)
print(classification_report(y_te_drp, y_pr_drp))

In [19]:
from sklearn.model_selection import GridSearchCV
# Exhaustive hyper-parameter search for the decision tree on the dropped-rows split.
# The estimator is seeded explicitly: otherwise tree construction (the 'random'
# splitter in particular) draws from whatever global RNG state earlier cells left
# behind, so re-running this cell alone could pick a different winner.
dtc_model = DecisionTreeClassifier(random_state=31415)

# Every combination of these values is tried.
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_leaf_nodes': np.arange(10, 1001, 10),
}

clf = GridSearchCV(dtc_model, params, cv=2)
model = clf.fit(X_tr_drp, y_tr_drp)

# Report performance on the held-out rows.
y_pr_drp = model.predict(X_te_drp)
print(classification_report(y_te_drp, y_pr_drp))

Once again the two algorithms are neck-and-neck. Surprisingly, the initial parameter search didn't significantly improve the results, so optimizing this problem further may be beyond my current expertise.