# Titanic ML Streamlined Attempt

## Read in the data

In [206]:
# Imports

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import matplotlib.pyplot as plt # data visualization
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [207]:
# Load the training and test CSV files into pandas dataframes

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./inputs/train.csv', './inputs/test.csv')
)

In [208]:
# Look at the first few rows of the training data

train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [209]:
# Check the test set has the same layout as the training set
# (the same kind of preview, this time of test_df)

test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Pre-processing steps

- data manipulation as needed, e.g. convert formats, drop unnecessary columns
- remove / handle NaN
- categorical data one hot encoding
- scaling of numerics
- label encoding (not required with this data)

In [210]:
# Inspect the raw training data before any transformation is applied

# Column dtypes
dtype_summary = train_df.dtypes
print("Data Types:\n" + str(dtype_summary))

# Number of missing values per column
missing_per_column = train_df.isnull().sum()
print("\nMissing Values:\n" + str(missing_per_column))

# Category frequencies for the two categorical columns that contain gaps
print("\nValue Counts for Categorical Variables with Missing Data:")
for column_name in ('Embarked', 'Cabin'):
    print(train_df[column_name].value_counts())

Data Types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Value Counts for Categorical Variables with Missing Data:
Embarked
S    644
C    168
Q     77
Name: count, dtype: int64
Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64


## Pre-processing strategy

1) Impute missing ages with the mean (DANGER!)
2) OneHotEncode categorical data
3) Scale numeric data 
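4) Fill the 2 missing Embarked values with the most common port (the mode) - the port is simply unknown for these passengers, and filling it lets us keep the rows and still predict whether they likely would have survived
5) Create a new Unknown category for Cabin, as the missing values make up a very significant number of rows (687 of 891)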

Questions

1) Should we treat SibSp and Parch as categorical rather than numeric?

In [211]:
# Pre Processing

## Define the pre-processing function to be used

def preprocess(df, embarked_fill=None):
    """Return a feature-engineered copy of a passenger dataframe.

    The input frame is copied first and is never modified.  The returned
    copy differs from the input as follows:

    * ``Name`` is reduced to the text before the first comma (family name).
    * ``Ticket_number`` is added: the last space-separated token of ``Ticket``.
    * ``Ticket_item`` is added: the remaining leading tokens of ``Ticket``
      joined with ``_``, or ``"NONE"`` when the ticket is a single token.
    * ``Cabin`` is reduced to its first character; missing values become
      ``"Unknown"``.
    * ``Embarked`` is reduced to its first character; missing values are
      replaced by ``embarked_fill``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``Name``, ``Ticket``, ``Cabin`` and
        ``Embarked``.  ``Name`` and ``Ticket`` must not contain missing values.
    embarked_fill : str, optional
        Replacement for missing ``Embarked`` values.  When omitted, the most
        frequent ``Embarked`` value of ``df`` itself is used (the previous
        behaviour); if ``df`` has no non-missing ``Embarked`` value at all,
        ``"Unknown"`` is used.  Pass the value derived from the training frame
        when preprocessing a test frame so both are filled identically.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    df = df.copy()

    def ticket_number(x):
        # Last space-separated token, e.g. "A/5 21171" -> "21171"
        return x.split(" ")[-1]

    def ticket_item(x):
        # Everything before the last token, e.g. "A/5 21171" -> "A/5";
        # tickets that consist of a single token have no prefix at all.
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])

    def family_name(x):
        # Text before the first comma, e.g. "Braund, Mr. Owen Harris" -> "Braund"
        return x.split(",")[0]

    # Work out the Embarked replacement once, up front, instead of
    # recomputing the column mode for every missing row.
    if embarked_fill is None:
        embarked_modes = df["Embarked"].mode()
        # mode() is empty when every value is missing; fall back to the same
        # placeholder that is used for Cabin rather than failing on lookup.
        embarked_fill = embarked_modes.iloc[0] if not embarked_modes.empty else "Unknown"

    df["Name"] = df["Name"].apply(family_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    df["Cabin"] = df["Cabin"].apply(lambda x: "Unknown" if pd.isnull(x) else x[0])
    df["Embarked"] = df["Embarked"].apply(lambda x: embarked_fill if pd.isnull(x) else x[0])
    return df

## Run the training and the test frame through the same preprocessing function

prep_train_df, prep_test_df = map(preprocess, (train_df, test_df))

In [212]:
# Declare which columns feed the model and how each group is treated

# Columns that are never used as model inputs: the target itself plus
# identifier-like columns.
non_input_columns = ("Survived", "Ticket", "Ticket_number", "PassengerId")

all_features = [column for column in prep_train_df.columns]
input_features = [column for column in all_features if column not in non_input_columns]

# Columns to one-hot encode
categorical_features = [
    'Pclass',
    'Name',
    'Sex',
    'Cabin',
    'Embarked',
    'Ticket_item',
]
# Numeric columns to standardise
scalable_columns = ['Age', 'SibSp', 'Parch', 'Fare']
# Numeric columns with gaps to fill using the column mean
mean_imputable_columns = ['Age']

print(
    f"All         Features: {all_features}",
    f"Input       Features: {input_features}",
    f"Categorical Features: {categorical_features}",
    f"Scalable    Features: {scalable_columns}",
    f"Imputable   Features: {mean_imputable_columns}",
    sep="\n",
)



All         Features: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']
Input       Features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_item']
Categorical Features: ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket_item']
Scalable    Features: ['Age', 'SibSp', 'Parch', 'Fare']
Imputable   Features: ['Age']


In [213]:
# Separate the model inputs from the target.
# The training frame supplies both inputs and target; the test frame supplies inputs only.

y_train = prep_train_df["Survived"].to_numpy()
X_train, X_test = (
    frame[input_features] for frame in (prep_train_df, prep_test_df)
)

In [214]:
# Preprocess the data for categorical columns and scalable columns using scikit-learn

# Always start from the untransformed feature frames.  This cell assigns the
# transformed result to X_train / X_test, so reading its input from those same
# names would make a second run of the cell fail (the raw columns would be gone).
X_train_raw = prep_train_df[input_features]
X_test_raw = prep_test_df[input_features]

# Create a pipeline for imputing and scaling as we need to do these sequentially.
# Note: the mean imputer is applied to every column in scalable_columns.
impute_and_scale = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Use the pipeline and OHE in a ColumnTransformer.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# result can be wrapped in a DataFrame without depending on a sparse-only
# .toarray() call.
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('impute_and_scale', impute_and_scale, scalable_columns),
], remainder='passthrough', sparse_threshold=0)

# Print the shape of the training and testing data before changes
print(f"Training Data Shape: {X_train_raw.shape}")
print(f"Testing Data Shape: {X_test_raw.shape}")

# Fit on the training data only, then apply the fitted transform to both sets
X_train = pd.DataFrame(ct.fit_transform(X_train_raw), columns=ct.get_feature_names_out())
X_test = pd.DataFrame(ct.transform(X_test_raw), columns=ct.get_feature_names_out())

# Print the shape of the training and testing data after changes
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (891, 10)
Testing Data Shape: (418, 10)
Training Data Shape: (891, 732)
Testing Data Shape: (418, 732)


In [215]:
# Check preprocessing worked as expected on the training features

X_train.head()

Unnamed: 0,ohe__Pclass_1,ohe__Pclass_2,ohe__Pclass_3,ohe__Name_Abbing,ohe__Name_Abbott,ohe__Name_Abelson,ohe__Name_Adahl,ohe__Name_Adams,ohe__Name_Ahlin,ohe__Name_Aks,...,ohe__Ticket_item_STON/O_2.,ohe__Ticket_item_SW/PP,ohe__Ticket_item_W./C.,ohe__Ticket_item_W.E.P.,ohe__Ticket_item_W/C,ohe__Ticket_item_WE/P,impute_and_scale__Age,impute_and_scale__SibSp,impute_and_scale__Parch,impute_and_scale__Fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.592481,0.432793,-0.473674,-0.502445
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.638789,0.432793,-0.473674,0.786845
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.284663,-0.474545,-0.473674,-0.488854
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.407926,0.432793,-0.473674,0.42073
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.407926,-0.474545,-0.473674,-0.486337


In [216]:
# Check preprocessing worked as expected on the test features

X_test.head()

Unnamed: 0,ohe__Pclass_1,ohe__Pclass_2,ohe__Pclass_3,ohe__Name_Abbing,ohe__Name_Abbott,ohe__Name_Abelson,ohe__Name_Adahl,ohe__Name_Adams,ohe__Name_Ahlin,ohe__Name_Aks,...,ohe__Ticket_item_STON/O_2.,ohe__Ticket_item_SW/PP,ohe__Ticket_item_W./C.,ohe__Ticket_item_W.E.P.,ohe__Ticket_item_W/C,ohe__Ticket_item_WE/P,impute_and_scale__Age,impute_and_scale__SibSp,impute_and_scale__Parch,impute_and_scale__Fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.369449,-0.474545,-0.473674,-0.490783
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.331378,0.432793,-0.473674,-0.507479
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.485693,-0.474545,-0.473674,-0.453367
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.207709,-0.474545,-0.473674,-0.474005
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.592481,0.432793,0.76763,-0.401017


## Try some models

- Let's try some different models to see what the data tells us 

In [217]:
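# TODO: create a validation set from the training data (this cell currently contains no code, so the output shown below it was not produced by it)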


ohe__Pclass_1              0
ohe__Pclass_2              0
ohe__Pclass_3              0
ohe__Name_Abbing           0
ohe__Name_Abbott           0
                          ..
ohe__Ticket_item_WE/P      0
impute_and_scale__Age      0
impute_and_scale__SibSp    0
impute_and_scale__Parch    0
impute_and_scale__Fare     0
Length: 732, dtype: int64

### KNN (K-nearest neighbours)

In [218]:
# Train the model

# K-nearest-neighbours classifier using 5 neighbours and the Minkowski metric
# with p = 2 (i.e. Euclidean distance).
# KNeighborsClassifier is imported in the imports cell at the top of the
# notebook, so every dependency is declared in one place.
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)