# Titanic - Machine Learning from Disaster

Filename: titanic-model.ipynb \
Author: Timothy Holland \
Last updated: 17/05/2024 \
Kaggle competition: https://www.kaggle.com/competitions/titanic/data



### 1. Data Preprocessing
#### Uploading Data

In [44]:
import pandas as pd

# Loading dataset into DataFrames
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Split training into features and target variable
x_train = train_df.drop(['PassengerId', 'Survived'], axis=1)
y_train = train_df['Survived']
# Split test into features and target variable
x_test = test_df.drop(['PassengerId'], axis=1)

print(non_numeric_cols)

['Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']


#### 1.1 Feature Engineering
##### Defining features

In [45]:
# Extract titles from the 'Name' column
train_df['Title'] = train_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Drop 'Name' column
train_df.drop('Name', axis=1, inplace=True)
test_df.drop('Name', axis=1, inplace=True)

# Specify numeric and non-numeric columns
numeric_cols = test_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
non_numeric_cols = test_df.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# # Missing numerical values filled with average
# train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].mean())
# test_df[numeric_cols] = test_df[numeric_cols].fillna(test_df[numeric_cols].mean())

# # Missing non-numeric values filled with mode
# for col in non_numeric_cols:
#     train_df[col].fillna(train_df[col].mode()[0], inplace=True)
#     test_df[col].fillna(test_df[col].mode()[0], inplace=True)

##### Analysing Features

In [48]:
# Examine the unique values, their frequencies, and missing values for non-numeric features
for feature in non_numeric_cols:
    print(f"Feature: {feature}")
    print(train_df[feature].value_counts(dropna=False))
    
    null_count = train_df[feature].isnull().sum()
    null_percentage = null_count / len(train_df) * 100
    print(f"Missing Values: {null_count} ({null_percentage:.2f}%)")
    print()

# Calculate summary statistics and missing values for numeric features
print("Numeric Features:")
print(train_df[numeric_cols].describe())

null_counts = train_df[numeric_cols].isnull().sum()
null_percentages = null_counts / len(train_df) * 100
print("\nMissing Values:")
for feature, count, percentage in zip(numeric_cols, null_counts, null_percentages):
    print(f"{feature}: {count} ({percentage:.2f}%)")

Feature: Sex
Sex
male      577
female    314
Name: count, dtype: int64
Missing Values: 0 (0.00%)

Feature: Ticket
Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64
Missing Values: 0 (0.00%)

Feature: Cabin
Cabin
NaN            687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 148, dtype: int64
Missing Values: 687 (77.10%)

Feature: Embarked
Embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64
Missing Values: 2 (0.22%)

Feature: Title
Title
Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady   

##### 'Ticket' Feature
Feature is categorical and sparse, applying label encoding over one-hot to reduce dimensionality. Therefore, there is a potential problem for ordering to affect the outcome.

In [90]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

le.fit(x_train['Ticket'])

x_train['Ticket'] = le.transform(x_train['Ticket'])
x_test['Ticket'] = le.transform(x_test['Ticket'])

886    101
887     14
888    675
889      8
890    466
Name: Ticket, dtype: int64
413    267
414    324
415    346
416    220
417    105
Name: Ticket, dtype: int64


##### 'Cabin' Feature
Feature has majority missing values, categorical, and sparse.

In [97]:
from sklearn.preprocessing import LabelEncoder

# Create category for missing values
x_train['Cabin'] = x_train['Cabin'].fillna('Unknown')
x_test['Cabin'] = x_test['Cabin'].fillna('Unknown')

# Categorically encode
le = LabelEncoder()
x_cabin = pd.concat([x_train['Cabin'], x_test['Cabin']], axis=0)
le.fit(x_cabin)

x_train['Cabin'] = le.transform(x_train['Cabin'])
x_test['Cabin'] = le.transform(x_test['Cabin'])


TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

##### Transforming Non-Numeric Features

In [77]:
# Binary encoding of 'Sex'
train_df['Sex'] = train_df['Sex'].replace({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].replace({'male': 0, 'female': 1})

# Calculate Mutual Information between 'Ticket' and 'Survived'
mi_score = mutual_info_classif(train_df_ticket[['Pclass']], train_df_ticket['Survived'])[0]

# Print the Mutual Information score
print(f"Mutual Information between 'Ticket' and 'Survived': {mi_score}")


KeyError: "None of [Index(['Pclass'], dtype='object')] are in the [columns]"

##### Transforming Numeric Features