# Titanic ML Streamlined Attempt

## Read in the data

In [25]:
# Imports and check file locations

# Fundamentals
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Model selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,  mean_absolute_error, confusion_matrix, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import VotingClassifier

In [26]:
# Load the train and test CSVs into DataFrames.
# The test PassengerId column is kept aside because the submission file needs it.

train_df = pd.read_csv('./inputs/train.csv')
test_df = pd.read_csv('./inputs/test.csv')
test_idx = test_df['PassengerId']

In [27]:
# Preview the first five rows of the training data

train_df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [28]:
# Preview the first five test rows to confirm the layout matches the training data

test_df.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Pre-processing steps

- data manipulation as needed, e.g. convert formatting, drop unnecessary columns
- remove / handle NaN
- categorical data one hot encoding
- scaling of numerics
- label encoding (not required with this data)

In [29]:
# Inspect the raw training data before any transformation is applied

# Column dtypes
print(f"Data Types:\n{train_df.dtypes}")

# Number of missing values per column
print(f"\nMissing Values:\n{train_df.isnull().sum()}")

# Category frequencies for the categorical columns that contain missing values
print("\nValue Counts for Categorical Variables with Missing Data:")
for column in ('Embarked', 'Cabin'):
    print(train_df[column].value_counts())

Data Types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

Missing Values:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Value Counts for Categorical Variables with Missing Data:
Embarked
S    644
C    168
Q     77
Name: count, dtype: int64
Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64


## Pre-processing strategy

1) Impute missing ages with the mean (DANGER!)
2) OneHotEncode categorical data
3) Scale numeric data 
4) Fill the 2 missing Embarked values with the most common port of embarkation, so those passengers can still be used by the models
5) Create a new Unknown category for Cabin, as a very significant number of rows (687 of 891) have no cabin recorded

Questions

1) Should we treat SibSp and Parch as categorical rather than numeric?

In [30]:
# Pre Processing

## Define the pre-processing function to be used

def preprocess(df):
    """Return a feature-engineered copy of a passenger DataFrame.

    The input frame is not modified. The returned copy has:

    - ``Name`` reduced to the text before the first comma,
    - ``Ticket_number``: the last space-separated token of ``Ticket``,
    - ``Ticket_item``: the remaining ``Ticket`` tokens joined with ``_``,
      or ``"NONE"`` when the ticket is a single token,
    - ``Cabin`` reduced to its first character, or ``"Unknown"`` when missing,
    - ``Embarked`` reduced to its first character, with missing values replaced
      by the most frequent value of the column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Name``, ``Ticket``, ``Cabin`` and ``Embarked``.
        ``Name`` and ``Ticket`` values must be strings.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.
    """
    df = df.copy()

    def ticket_number(x):
        return x.split(" ")[-1]

    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])

    def family_name(x):
        return x.split(",")[0]

    # The mode is the same for every row, so compute it once instead of once per
    # row inside the lambda. A column with no non-missing values has no mode.
    embarked_modes = df["Embarked"].mode()
    embarked_mode = embarked_modes.iloc[0] if len(embarked_modes) else None

    def fill_embarked(x):
        if pd.isnull(x):
            # With no mode available the missing marker is left untouched
            return x if embarked_mode is None else embarked_mode
        return x[0]

    df["Name"] = df["Name"].apply(family_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)
    df["Cabin"] = df["Cabin"].apply(lambda x: "Unknown" if pd.isnull(x) else x[0])
    df["Embarked"] = df["Embarked"].apply(fill_embarked)
    return df

## Apply the same preprocessing function to the training and the test frame

prep_train_df, prep_test_df = (preprocess(frame) for frame in (train_df, test_df))

In [31]:
# Declare which columns feed the models and how each group is treated

all_features = prep_train_df.columns.tolist()

# The target, the raw ticket text, the ticket number and the row id are not model inputs
excluded = {"Survived", "Ticket", "Ticket_number", "PassengerId"}
input_features = [c for c in all_features if c not in excluded]

# Columns to one-hot encode
categorical_features = ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket_item']

# Numeric columns to impute and scale
scalable_columns = ['Age', 'SibSp', 'Parch', 'Fare']

# Only printed in this cell; the later ColumnTransformer cell imputes every scalable column
mean_imputable_columns = ['Age']

print(f"All         Features: {all_features}")
print(f"Input       Features: {input_features}")
print(f"Categorical Features: {categorical_features}")
print(f"Scalable    Features: {scalable_columns}")
print(f"Imputable   Features: {mean_imputable_columns}")


All         Features: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Ticket_number', 'Ticket_item']
Input       Features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_item']
Categorical Features: ['Pclass', 'Name', 'Sex', 'Cabin', 'Embarked', 'Ticket_item']
Scalable    Features: ['Age', 'SibSp', 'Parch', 'Fare']
Imputable   Features: ['Age']


In [32]:
# Separate the model inputs from the target.
# Only the training frame has a "Survived" column, so the test set gets inputs only.

y_train = prep_train_df["Survived"].to_numpy()
X_train = prep_train_df.loc[:, input_features]
X_test = prep_test_df.loc[:, input_features]

In [33]:
X_train.head(n=5)

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_item
0,3,Braund,male,22.0,1,0,7.25,Unknown,S,A/5
1,1,Cumings,female,38.0,1,0,71.2833,C,C,PC
2,3,Heikkinen,female,26.0,0,0,7.925,Unknown,S,STON/O2.
3,1,Futrelle,female,35.0,1,0,53.1,C,S,NONE
4,3,Allen,male,35.0,0,0,8.05,Unknown,S,NONE


In [34]:
# Preprocess the data for categorical columns and scalable columns using scikit-learn

# Imputation has to run before scaling, so the two steps are chained in a Pipeline
impute_and_scale = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# One-hot encode the categorical columns and impute + scale the numeric ones.
# handle_unknown='ignore' lets categories that only occur in the test set pass through as all zeros.
ct = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('impute_and_scale', impute_and_scale, scalable_columns),
], remainder='passthrough')

# Always start from the untransformed feature frames. X_train / X_test are overwritten
# below, so reading them here would break this cell when it is run a second time.
X_train_raw = prep_train_df[input_features]
X_test_raw = prep_test_df[input_features]

# Print the shape of the training and testing data before changes
print(f"Training Data Shape: {X_train_raw.shape}")
print(f"Testing Data Shape: {X_test_raw.shape}")

# Fit the transforms on the training data only, then apply them to both sets.
# NOTE(review): the fit uses every training row, and the validation split is carved out of
# X_train afterwards, so validation rows contribute to the imputation mean, the scaling
# statistics and the encoder categories. Validation scores may therefore be optimistic.
X_train = pd.DataFrame(ct.fit_transform(X_train_raw).toarray(), columns=ct.get_feature_names_out())
X_test = pd.DataFrame(ct.transform(X_test_raw).toarray(), columns=ct.get_feature_names_out())

# Print the shape of the training and testing data after changes
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (891, 10)
Testing Data Shape: (418, 10)
Training Data Shape: (891, 732)
Testing Data Shape: (418, 732)


In [35]:
# Spot-check the encoded and scaled training features

X_train.head(n=5)

Unnamed: 0,ohe__Pclass_1,ohe__Pclass_2,ohe__Pclass_3,ohe__Name_Abbing,ohe__Name_Abbott,ohe__Name_Abelson,ohe__Name_Adahl,ohe__Name_Adams,ohe__Name_Ahlin,ohe__Name_Aks,...,ohe__Ticket_item_STON/O_2.,ohe__Ticket_item_SW/PP,ohe__Ticket_item_W./C.,ohe__Ticket_item_W.E.P.,ohe__Ticket_item_W/C,ohe__Ticket_item_WE/P,impute_and_scale__Age,impute_and_scale__SibSp,impute_and_scale__Parch,impute_and_scale__Fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.592481,0.432793,-0.473674,-0.502445
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.638789,0.432793,-0.473674,0.786845
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.284663,-0.474545,-0.473674,-0.488854
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.407926,0.432793,-0.473674,0.42073
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.407926,-0.474545,-0.473674,-0.486337


In [36]:
# Spot-check the encoded and scaled test features

X_test.head(n=5)

Unnamed: 0,ohe__Pclass_1,ohe__Pclass_2,ohe__Pclass_3,ohe__Name_Abbing,ohe__Name_Abbott,ohe__Name_Abelson,ohe__Name_Adahl,ohe__Name_Adams,ohe__Name_Ahlin,ohe__Name_Aks,...,ohe__Ticket_item_STON/O_2.,ohe__Ticket_item_SW/PP,ohe__Ticket_item_W./C.,ohe__Ticket_item_W.E.P.,ohe__Ticket_item_W/C,ohe__Ticket_item_WE/P,impute_and_scale__Age,impute_and_scale__SibSp,impute_and_scale__Parch,impute_and_scale__Fare
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.369449,-0.474545,-0.473674,-0.490783
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.331378,0.432793,-0.473674,-0.507479
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.485693,-0.474545,-0.473674,-0.453367
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.207709,-0.474545,-0.473674,-0.474005
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.592481,0.432793,0.76763,-0.401017


## Try some models


In [37]:
# Create a validation set from the training data so we can score it (we don't have y_test - need to submit competition to get that)

# X_train / y_train are overwritten below, so running this cell twice would silently split
# the already-reduced training set again. Fail loudly instead.
assert len(X_train) == len(y_train) == len(prep_train_df), (
    "X_train / y_train are not the full training set - "
    "re-run the notebook from the cell that separates features and target"
)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state = 0)
model_scores = {}

In [38]:
def score_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split, print validation diagnostics and return summary scores.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_train, y_train
        Training features and target passed to ``model.fit``.
    X_val, y_val
        Validation features and the true validation target used for scoring.

    Returns
    -------
    tuple
        ``(accuracy, precision, sensitivity, specificity, model)``. The three ratios
        are read from the confusion matrix with the class at index 1 treated as the
        positive class. A ratio whose denominator is zero is returned as NaN.
        ``model`` is the fitted estimator that was passed in.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    accuracy = accuracy_score(y_val, y_pred)
    print(f"Accuracy: {accuracy}")
    print("")

    mae = mean_absolute_error(y_val, y_pred)
    print(f"Mean Absolute Error: {mae}")
    print("")

    # Pin the matrix layout to the classes the model was fitted on, so the matrix keeps
    # one row/column per class even when y_val and y_pred contain only one of them.
    # Estimators without a classes_ attribute fall back to the default behaviour.
    cm = confusion_matrix(y_val, y_pred, labels=getattr(model, "classes_", None))
    print(f"Confusion Matrix:\n{cm}")
    print("")

    cr = classification_report(y_val, y_pred)
    print(f"Classification Report:\n{cr}")
    print("")

    # Rows are true classes, columns are predicted classes
    tn, fp = cm[0][0], cm[0][1]
    fn, tp = cm[1][0], cm[1][1]

    def ratio(numerator, denominator):
        # An empty denominator makes the metric undefined: report NaN without
        # triggering a NumPy divide warning
        return numerator / denominator if denominator else float("nan")

    precision = ratio(tp, tp + fp)
    sensitivity = ratio(tp, tp + fn)
    specificity = ratio(tn, tn + fp)

    # return a tuple of the accuracy, precision, sensitivity, and specificity plus the model for submission
    return (accuracy, precision, sensitivity, specificity, model)


## Logistic Regression

In [39]:
# Fit a logistic regression on the training split and record its validation scores
classifier = LogisticRegression(random_state=0)
model_scores["LogisticRegression"] = logreg = score_model(classifier, X_train, y_train, X_val, y_val)


Accuracy: 0.8251121076233184

Mean Absolute Error: 0.17488789237668162

Confusion Matrix:
[[123  16]
 [ 23  61]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       139
           1       0.79      0.73      0.76        84

    accuracy                           0.83       223
   macro avg       0.82      0.81      0.81       223
weighted avg       0.82      0.83      0.82       223




## Support Vector Machines

In [40]:
# Fit a linear-kernel SVM on the training split and record its validation scores
classifier = SVC(kernel='linear', random_state=0)
model_scores["SupportVectorMachines"] = svc = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.7937219730941704

Mean Absolute Error: 0.2062780269058296

Confusion Matrix:
[[119  20]
 [ 26  58]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       139
           1       0.74      0.69      0.72        84

    accuracy                           0.79       223
   macro avg       0.78      0.77      0.78       223
weighted avg       0.79      0.79      0.79       223




## KNN

In [41]:
# Fit a 2-nearest-neighbours classifier (Minkowski metric with p=2) and record its validation scores
classifier = KNeighborsClassifier(n_neighbors=2, metric='minkowski', p=2)
model_scores["KNearestNeighbours"] = knn = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.7668161434977578

Mean Absolute Error: 0.23318385650224216

Confusion Matrix:
[[130   9]
 [ 43  41]]

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.94      0.83       139
           1       0.82      0.49      0.61        84

    accuracy                           0.77       223
   macro avg       0.79      0.71      0.72       223
weighted avg       0.78      0.77      0.75       223




## Decision Tree Classifier

In [42]:
# Fit an entropy-criterion decision tree and record its validation scores
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
model_scores["DecisionTree"] = dtc = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.8161434977578476

Mean Absolute Error: 0.18385650224215247

Confusion Matrix:
[[122  17]
 [ 24  60]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       139
           1       0.78      0.71      0.75        84

    accuracy                           0.82       223
   macro avg       0.81      0.80      0.80       223
weighted avg       0.81      0.82      0.81       223




## Naive Bayes

In [43]:
# Fit a Gaussian naive Bayes classifier and record its validation scores
classifier = GaussianNB()
model_scores["NaiveBayes"] = nba = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.45739910313901344

Mean Absolute Error: 0.5426008968609866

Confusion Matrix:
[[ 34 105]
 [ 16  68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.24      0.36       139
           1       0.39      0.81      0.53        84

    accuracy                           0.46       223
   macro avg       0.54      0.53      0.44       223
weighted avg       0.57      0.46      0.42       223




## Kernel SVM

In [44]:
# Fit an RBF-kernel SVM and record its validation scores
classifier = SVC(kernel='rbf', random_state=0)
model_scores["KernelSVM"] = svm = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.8116591928251121

Mean Absolute Error: 0.18834080717488788

Confusion Matrix:
[[120  19]
 [ 23  61]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85       139
           1       0.76      0.73      0.74        84

    accuracy                           0.81       223
   macro avg       0.80      0.79      0.80       223
weighted avg       0.81      0.81      0.81       223




## Random Forest

In [45]:
# Fit a 10-tree, entropy-criterion random forest and record its validation scores
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
model_scores["RandomForest"] = rf = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.8251121076233184

Mean Absolute Error: 0.17488789237668162

Confusion Matrix:
[[124  15]
 [ 24  60]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       139
           1       0.80      0.71      0.75        84

    accuracy                           0.83       223
   macro avg       0.82      0.80      0.81       223
weighted avg       0.82      0.83      0.82       223




## Multi-layer Perceptron (Neural)

In [46]:
# Fit a multi-layer perceptron (capped at 300 iterations) and record its validation scores
classifier = MLPClassifier(random_state=0, max_iter=300)
model_scores["MLPerceptron"] = mlp = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.7847533632286996

Mean Absolute Error: 0.21524663677130046

Confusion Matrix:
[[113  26]
 [ 22  62]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.81      0.82       139
           1       0.70      0.74      0.72        84

    accuracy                           0.78       223
   macro avg       0.77      0.78      0.77       223
weighted avg       0.79      0.78      0.79       223






## Voting Ensemble

In [48]:
# Hard-voting ensemble over the logistic regression, decision tree and random forest.
# Index 4 of each score tuple is the estimator returned by score_model.
classifier = VotingClassifier(
    estimators=[
        ('lr', logreg[4]),
        ('dtc', dtc[4]),
        ('rf', rf[4]),
    ],
    voting='hard',
)
model_scores["VotingEnsemble"] = ve = score_model(classifier, X_train, y_train, X_val, y_val)

Accuracy: 0.8251121076233184

Mean Absolute Error: 0.17488789237668162

Confusion Matrix:
[[127  12]
 [ 27  57]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.87       139
           1       0.83      0.68      0.75        84

    accuracy                           0.83       223
   macro avg       0.83      0.80      0.81       223
weighted avg       0.83      0.83      0.82       223




## Results

In [49]:
# Print one fixed-width row per model from the model_scores dictionary.
# Each entry's first four items are accuracy, precision, sensitivity and specificity.
header = ("Model", "Accuracy", "Precision", "Sensitivity", "Specificity")
print("Model Scores:")
print("{:<25s}{:<15s}{:<15s}{:<15s}{:<15s}".format(*header))
for model, scores in model_scores.items():
    print("{:<25s}{:<15.2f}{:<15.2f}{:<15.2f}{:<15.2f}".format(model, *scores[:4]))

Model Scores:
Model                    Accuracy       Precision      Sensitivity    Specificity    
LogisticRegression       0.83           0.79           0.73           0.88           
SupportVectorMachines    0.79           0.74           0.69           0.86           
KNearestNeighbours       0.77           0.82           0.49           0.94           
DecisionTree             0.82           0.78           0.71           0.88           
NaiveBayes               0.46           0.39           0.81           0.24           
KernelSVM                0.81           0.76           0.73           0.86           
RandomForest             0.83           0.80           0.71           0.89           
MLPerceptron             0.78           0.70           0.74           0.81           
VotingEnsemble           0.83           0.83           0.68           0.91           


## Submission

In [50]:
# Predict the test set with the fitted voting ensemble (item 4 of its score tuple)
# and write the PassengerId / Survived pairs to the submission file.
ensemble = model_scores["VotingEnsemble"][4]
y_test = ensemble.predict(X_test)
output = pd.DataFrame({'PassengerId': test_idx, 'Survived': y_test})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
