# Titanic - Machine Learning from Disaster

Filename: titanic-model.ipynb \
Author: Timothy Holland \
Last updated: 17/05/2024 \
Kaggle competition: https://www.kaggle.com/competitions/titanic/data



## 1. Data Preprocessing
#### Uploading Data

In [124]:
import pandas as pd

# Loading dataset into DataFrames
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Split training into features and target variable
x_train = train_df.drop(['PassengerId', 'Survived'], axis=1)
y_train = train_df['Survived']
# Split test into features and target variable
x_test = test_df.drop(['PassengerId'], axis=1)


### 1.1 Feature Engineering
### Defining features

In [125]:
# Extract titles from the 'Name' column
x_train['Title'] = x_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
x_test['Title'] = x_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Drop 'Name' column
x_train.drop('Name', axis=1, inplace=True)
x_test.drop('Name', axis=1, inplace=True)

# Specify numeric and non-numeric columns
numeric_cols = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
non_numeric_cols = x_test.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

print(f"Numeric: {numeric_cols}")
print(f"Non-numeric: {non_numeric_cols}")
# # Missing numerical values filled with average
# train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].mean())
# test_df[numeric_cols] = test_df[numeric_cols].fillna(test_df[numeric_cols].mean())

# # Missing non-numeric values filled with mode
# for col in non_numeric_cols:
#     train_df[col].fillna(train_df[col].mode()[0], inplace=True)
#     test_df[col].fillna(test_df[col].mode()[0], inplace=True)

Numeric: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Non-numeric: ['Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']


### Analysing Features

#### Transforming Non-numeric Features

In [126]:
import matplotlib.pyplot as plt

# Examine the unique values, their frequencies, and missing values for non-numeric features
for feature in non_numeric_cols:
    print(f"Feature: {feature}")
    print(x_train[feature].value_counts(dropna=False))
    
    null_count = x_train[feature].isnull().sum()
    null_percentage = null_count / len(x_train) * 100
    print(f"Missing Values: {null_count} ({null_percentage:.2f}%)")
    print("Test distribution")
    print(x_test[feature].value_counts(dropna=False))
    print()

Feature: Sex
Sex
male      577
female    314
Name: count, dtype: int64
Missing Values: 0 (0.00%)
Test distribution
Sex
male      266
female    152
Name: count, dtype: int64

Feature: Ticket
Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64
Missing Values: 0 (0.00%)
Test distribution
Ticket
PC 17608    5
CA. 2343    4
113503      4
PC 17483    3
220845      3
           ..
349226      1
2621        1
4133        1
113780      1
2668        1
Name: count, Length: 363, dtype: int64

Feature: Cabin
Cabin
NaN            687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 148, dtype: int64
Missing Values: 687 (77.10%)
Test distribution
Cabin
NaN                327
B57 B59 B63 B66     

##### 'Ticket'
Feature is categorical and sparse, applying label encoding over one-hot to reduce dimensionality. Therefore, there is a potential problem for ordering to affect the outcome.

In [127]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Convert 'Cabin' values to strings
x_train['Ticket'] = x_train['Ticket'].astype(str)
x_test['Ticket'] = x_test['Ticket'].astype(str)

# Concatenate the training and test sets for fitting the label encoder
x_ticket = pd.concat([x_train['Ticket'], x_test['Ticket']])

# Fit the label encoder on the combined data
le = LabelEncoder()
le.fit(x_ticket)

x_train['Ticket'] = le.transform(x_train['Ticket'])
x_test['Ticket'] = le.transform(x_test['Ticket'])

##### 'Cabin'
Feature has majority missing values, categorical, and sparse.

In [128]:
from sklearn.preprocessing import LabelEncoder

# Create category for missing values
x_train['Cabin'] = x_train['Cabin'].fillna('Unknown')
x_test['Cabin'] = x_test['Cabin'].fillna('Unknown')

# Convert 'Cabin' values to strings
x_train['Cabin'] = x_train['Cabin'].astype(str)
x_test['Cabin'] = x_test['Cabin'].astype(str)

# Concatenate the training and test sets for fitting the label encoder
x_cabin = pd.concat([x_train['Cabin'], x_test['Cabin']])

# Fit the label encoder on the combined data
le = LabelEncoder()
le.fit(x_cabin)

# Transform the training and test sets separately
x_train['Cabin'] = le.transform(x_train['Cabin'])
x_test['Cabin'] = le.transform(x_test['Cabin'])

In [129]:
# Display 'Cabin' information
print(x_train['Cabin'].head())

0    186
1    106
2    186
3     70
4    186
Name: Cabin, dtype: int64


##### 'Embarked' 

In [130]:
from sklearn.preprocessing import OneHotEncoder

# Replace unknowns with mode
most_frequent_value = x_train['Embarked'].mode()[0]
x_train['Embarked'].fillna(most_frequent_value, inplace=True)
x_train['Embarked'].fillna(most_frequent_value, inplace=True)

# Reshape the training data to be 2D
x_train_embarked = x_train['Embarked'].values.reshape(-1, 1)

print(x_train['Embarked'].unique())

# Fit the encoder on the training data
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(x_train_embarked)

# Transform training and test data
x_train_embarked_encoded = ohe.transform(x_train_embarked).toarray()
x_test_embarked_encoded = ohe.transform(x_test['Embarked'].values.reshape(-1, 1)).toarray()
embarked_encoded_columns = ohe.get_feature_names_out(['Embarked'])

#  Drop embarked and replace with OHE
x_train = x_train.drop('Embarked', axis=1)
x_test = x_test.drop('Embarked', axis=1)
x_train = pd.concat([x_train, pd.DataFrame(x_train_embarked_encoded, columns=embarked_encoded_columns)], axis=1)
x_test = pd.concat([x_test, pd.DataFrame(x_test_embarked_encoded, columns=embarked_encoded_columns)], axis=1)

['S' 'C' 'Q']


In [131]:
print(x_train.keys())

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
       'Title', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


##### 'Sex'
Binary encoding of categorical data.

In [132]:
# Binary encoding of 'Sex'
x_train['Sex'] = x_train['Sex'].replace({'male': 0, 'female': 1})
x_test['Sex'] = x_test['Sex'].replace({'male': 0, 'female': 1})

##### 'Title'
One hot encoding of categorical data

In [133]:
# Regroup title categories

def group_titles(title):
    if title in ['Mr', 'Miss', 'Mrs', 'Master']:
        return title
    else:
        return 'Other'
    
x_train['Title'] = x_train['Title'].apply(group_titles)
x_test['Title'] = x_test['Title'].apply(group_titles)


# Apply one-hot-encoding
x_train = pd.get_dummies(x_train, columns=['Title'])
x_test = pd.get_dummies(x_test, columns=['Title'])


In [134]:
print(x_train.keys())
print(x_test.keys())

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Other'],
      dtype='object')
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Other'],
      dtype='object')


#### Transforming Numeric Features

In [135]:
# Calculate summary statistics and missing values for numeric features
print("Numeric Features:")
print(x_train[numeric_cols].describe().transpose())

null_counts = x_train[numeric_cols].isnull().sum()
null_percentages = null_counts / len(x_train) * 100
print("\nMissing Values:")
for feature, count, percentage in zip(numeric_cols, null_counts, null_percentages):
    print(f"{feature}: {count} ({percentage:.2f}%)")

Numeric Features:
        count       mean        std   min      25%      50%   75%       max
Pclass  891.0   2.308642   0.836071  1.00   2.0000   3.0000   3.0    3.0000
Age     714.0  29.699118  14.526497  0.42  20.1250  28.0000  38.0   80.0000
SibSp   891.0   0.523008   1.102743  0.00   0.0000   0.0000   1.0    8.0000
Parch   891.0   0.381594   0.806057  0.00   0.0000   0.0000   0.0    6.0000
Fare    891.0  32.204208  49.693429  0.00   7.9104  14.4542  31.0  512.3292

Missing Values:
Pclass: 0 (0.00%)
Age: 177 (19.87%)
SibSp: 0 (0.00%)
Parch: 0 (0.00%)
Fare: 0 (0.00%)


##### 'Pclass'

In [136]:
# No changes

##### 'Age'
KNN imputation of missing values (n=178). \
Last feature with missing values.\
Normalisation to [0, 1] range.

In [137]:
from sklearn.impute import KNNImputer

# Create imputer object
imputer = KNNImputer(n_neighbors=5) 

# Fix imputer on training data
x_train_imputed = imputer.fit_transform(x_train)
x_test_imputed = imputer.transform(x_test)

# Create dataframe
x_train = pd.DataFrame(x_train_imputed, columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(x_test_imputed, columns=x_test.columns, index=x_test.index)

In [138]:
print(x_train['Age'].describe())

count    891.000000
mean      29.471969
std       13.618072
min        0.420000
25%       21.000000
50%       28.200000
75%       36.700000
max       80.000000
Name: Age, dtype: float64


In [139]:
from sklearn.preprocessing import MinMaxScaler

# Create instance of scaler
scaler = MinMaxScaler()

# Normalise age column
x_train['Age'] = scaler.fit_transform(x_train[['Age']])
x_test['Age'] = scaler.transform(x_test[['Age']])

In [140]:
print(x_train['Age'].describe())

count    891.000000
mean       0.365066
std        0.171124
min        0.000000
25%        0.258608
50%        0.349083
75%        0.455893
max        1.000000
Name: Age, dtype: float64


##### 'Sibsp: # of siblings / spouses aboard the Titanic
Normalisation applied.

In [141]:
from sklearn.preprocessing import MinMaxScaler

# Create instance of scaler
scaler = MinMaxScaler()

# Normalise age column
x_train['SibSp'] = scaler.fit_transform(x_train[['SibSp']])
x_test['SibSp'] = scaler.transform(x_test[['SibSp']])

##### 'Parch': # of parents / children aboard the Titanic
Normalisation applied

In [142]:
from sklearn.preprocessing import MinMaxScaler

# Create instance of scaler
scaler = MinMaxScaler()

# Normalise age column
x_train['Parch'] = scaler.fit_transform(x_train[['Parch']])
x_test['Parch'] = scaler.transform(x_test[['Parch']])

##### 'Fare': cost of ticket
Normalisation applied.
Apply logarithmic transformation (right skewed).

In [143]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Create instance of scaler
scaler = MinMaxScaler()

# Apply logarithmic transformation
x_train['Fare'] = np.log1p(x_train['Fare'])
x_test['Fare'] = np.log1p(x_test['Fare'])

# Normalise age column
x_train['Fare'] = scaler.fit_transform(x_train[['Fare']])
x_test['Fare'] = scaler.transform(x_test[['Fare']])

### Split Dataset

In [144]:
from sklearn.model_selection import train_test_split

# Assuming you have your features (X) and target variable (y) defined

# Split the data into training and evaluation sets (80-20 split)
# x_train, x_evaluation, y_train, y_evaluation = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

## 2. Model

### 2.1 KNN

In [145]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Preprocess the data
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_evaluation_scaled = scaler.transform(x_evaluation)

# Define the expanded hyperparameter grid
param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2, 3, 4, 5],
    'leaf_size': [10, 20, 30, 40, 50]
}

# Create a KNN classifier
knn = KNeighborsClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(x_train_scaled, y_train)

# Rerieve best parameters
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print("Best hyperparameters:", best_params)
print("Best cross-validation score:", best_score)

# # Train model on hyperparameters
# best_knn = KNeighborsClassifier(**best_params)
# best_knn.fit(x_train_scaled, y_train)

# # Make predications and calculate accuracy
# y_pred = best_knn.predict(x_evaluation_scaled)
# knn_accuracy = accuracy_score(y_evaluation, y_pred)
# print("Accuracy of the best model:", knn_accuracy)

Best hyperparameters: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 21, 'p': 1, 'weights': 'distance'}
Best cross-validation score: 0.8271910112359551


### 2.2 Linear Regression

In [146]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Train the linear regression model
model = LinearRegression()
model.fit(x_train, y_train)

# Make predictions on the evaluation set
predictions = model.predict(x_evaluation)

# Define a range of threshold values to evaluate
thresholds = np.arange(0.1, 0.9, 0.1)

# Initialize variables to store the best threshold and accuracy
best_threshold = None
best_accuracy = 0

# Iterate over the threshold values
for threshold in thresholds:
    # Convert predictions to binary class labels using the current threshold
    binary_predictions = np.where(predictions >= threshold, 1, 0)
    
    # Calculate the accuracy for the current threshold
    accuracy = accuracy_score(y_evaluation, binary_predictions)
    
    # Update the best threshold and accuracy if the current threshold performs better
    if accuracy > best_accuracy:
        best_threshold = threshold
        best_accuracy = accuracy

# Print the best threshold and corresponding accuracy
print("Best threshold:", best_threshold)
print("Accuracy with best threshold:", best_accuracy)

# Make final predictions using the best threshold
final_predictions = np.where(predictions >= best_threshold, 1, 0)

Best threshold: 0.5
Accuracy with best threshold: 0.8268156424581006


In [147]:
# from sklearn.inspection import permutation_importance

# # Get the coefficients of the model
# coefficients = model.coef_

# # Get the feature names (assuming x_train is a DataFrame)
# feature_names = x_train.columns

# # Print the coefficients and their corresponding features
# for feature, coef in zip(feature_names, coefficients):
#     print(f"{feature}: {coef}")

# # Calculate permutation feature importance
# perm_importance = permutation_importance(model, x_evaluation, y_evaluation, n_repeats=10, random_state=42)

# # Get the mean feature importance scores
# importance_scores = perm_importance.importances_mean

# # Print the feature importance scores
# print("\nFeature importance")
# for feature, score in zip(feature_names, importance_scores):
#     print(f"{feature}: {score}")

### 2.3 Naive Bayes

In [148]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Create a Naive Bayes classifier
nb_classifier = GaussianNB()

# Perform 5-fold cross-validation on the training set
cv_scores = cross_val_score(nb_classifier, x_train, y_train, cv=5)

# Print the results
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Train the classifier on the entire training set
nb_classifier.fit(x_train, y_train)

# # Make predictions on the evaluation set
# y_pred = nb_classifier.predict(x_evaluation)

# # Calculate the accuracy on the evaluation set
# accuracy = accuracy_score(y_evaluation, y_pred)
# print("Accuracy on the evaluation set:", accuracy)

Cross-validation scores: [0.7877095  0.76966292 0.78651685 0.78651685 0.82022472]
Mean cross-validation score: 0.7901261691042623


### 2.4 Random Forest

In [155]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the random forest classifier
rf = RandomForestClassifier()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(x_train, y)

# Print the best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

NameError: name 'X' is not defined

### Neural Network

##### Hyperparameter Search

##### Cross validation