# Titanic - Machine Learning from Disaster

Filename: titanic-model.ipynb \
Author: Timothy Holland \
Last updated: 17/05/2024 \
Kaggle competition: https://www.kaggle.com/competitions/titanic/data



### 1. Data Preprocessing
#### Uploading Data

In [260]:
import pandas as pd

# Loading dataset into DataFrames
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Split training into features and target variable
x_train = train_df.drop(['PassengerId', 'Survived'], axis=1)
y_train = train_df['Survived']
# Split test into features and target variable
x_test = test_df.drop(['PassengerId'], axis=1)

print(non_numeric_cols)

['Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']


## 1.1 Feature Engineering
### Defining features

In [261]:
# Extract titles from the 'Name' column
x_train['Title'] = x_train['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
x_test['Title'] = x_test['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

# Drop 'Name' column
x_train.drop('Name', axis=1, inplace=True)
x_test.drop('Name', axis=1, inplace=True)

# Specify numeric and non-numeric columns
numeric_cols = x_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
non_numeric_cols = x_test.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

print(f"Numeric: {numeric_cols}")
print(f"Non-numeric: {non_numeric_cols}")
# # Missing numerical values filled with average
# train_df[numeric_cols] = train_df[numeric_cols].fillna(train_df[numeric_cols].mean())
# test_df[numeric_cols] = test_df[numeric_cols].fillna(test_df[numeric_cols].mean())

# # Missing non-numeric values filled with mode
# for col in non_numeric_cols:
#     train_df[col].fillna(train_df[col].mode()[0], inplace=True)
#     test_df[col].fillna(test_df[col].mode()[0], inplace=True)

Numeric: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
Non-numeric: ['Sex', 'Ticket', 'Cabin', 'Embarked', 'Title']


### Analysing Features

#### Transforming Non-numeric Features

In [262]:
import matplotlib.pyplot as plt

# Examine the unique values, their frequencies, and missing values for non-numeric features
for feature in non_numeric_cols:
    print(f"Feature: {feature}")
    print(x_train[feature].value_counts(dropna=False))
    
    null_count = x_train[feature].isnull().sum()
    null_percentage = null_count / len(x_train) * 100
    print(f"Missing Values: {null_count} ({null_percentage:.2f}%)")
    print("Test distribution")
    print(x_test[feature].value_counts(dropna=False))
    print()

Feature: Sex
Sex
male      577
female    314
Name: count, dtype: int64
Missing Values: 0 (0.00%)
Test distribution
Sex
male      266
female    152
Name: count, dtype: int64

Feature: Ticket
Ticket
347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: count, Length: 681, dtype: int64
Missing Values: 0 (0.00%)
Test distribution
Ticket
PC 17608    5
CA. 2343    4
113503      4
PC 17483    3
220845      3
           ..
349226      1
2621        1
4133        1
113780      1
2668        1
Name: count, Length: 363, dtype: int64

Feature: Cabin
Cabin
NaN            687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: count, Length: 148, dtype: int64
Missing Values: 687 (77.10%)
Test distribution
Cabin
NaN                327
B57 B59 B63 B66     

##### 'Ticket' Feature
Feature is categorical and sparse, applying label encoding over one-hot to reduce dimensionality. Therefore, there is a potential problem for ordering to affect the outcome.

In [263]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Convert 'Cabin' values to strings
x_train['Ticket'] = x_train['Ticket'].astype(str)
x_test['Ticket'] = x_test['Ticket'].astype(str)

# Concatenate the training and test sets for fitting the label encoder
x_ticket = pd.concat([x_train['Ticket'], x_test['Ticket']])

# Fit the label encoder on the combined data
le = LabelEncoder()
le.fit(x_ticket)

x_train['Ticket'] = le.transform(x_train['Ticket'])
x_test['Ticket'] = le.transform(x_test['Ticket'])

##### 'Cabin' Feature
Feature has majority missing values, categorical, and sparse.

In [264]:
from sklearn.preprocessing import LabelEncoder

# Create category for missing values
x_train['Cabin'] = x_train['Cabin'].fillna('Unknown')
x_test['Cabin'] = x_test['Cabin'].fillna('Unknown')

# Convert 'Cabin' values to strings
x_train['Cabin'] = x_train['Cabin'].astype(str)
x_test['Cabin'] = x_test['Cabin'].astype(str)

# Concatenate the training and test sets for fitting the label encoder
x_cabin = pd.concat([x_train['Cabin'], x_test['Cabin']])

# Fit the label encoder on the combined data
le = LabelEncoder()
le.fit(x_cabin)

# Transform the training and test sets separately
x_train['Cabin'] = le.transform(x_train['Cabin'])
x_test['Cabin'] = le.transform(x_test['Cabin'])

In [265]:
# Display 'Cabin' information
print(x_train['Cabin'].head())

0    186
1    106
2    186
3     70
4    186
Name: Cabin, dtype: int64


##### 'Embarked' Feature

In [266]:
from sklearn.preprocessing import OneHotEncoder

# Replace unknowns with mode
most_frequent_value = x_train['Embarked'].mode()[0]
x_train['Embarked'].fillna(most_frequent_value, inplace=True)
x_train['Embarked'].fillna(most_frequent_value, inplace=True)

# Reshape the training data to be 2D
x_train_embarked = x_train['Embarked'].values.reshape(-1, 1)

print(x_train['Embarked'].unique())

# Fit the encoder on the training data
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(x_train_embarked)

# Transform training and test data
x_train_embarked_encoded = ohe.transform(x_train_embarked).toarray()
x_test_embarked_encoded = ohe.transform(x_test['Embarked'].values.reshape(-1, 1)).toarray()
embarked_encoded_columns = ohe.get_feature_names_out(['Embarked'])

#  Drop embarked and replace with OHE
x_train = x_train.drop('Embarked', axis=1)
x_test = x_test.drop('Embarked', axis=1)
x_train = pd.concat([x_train, pd.DataFrame(x_train_embarked_encoded, columns=embarked_encoded_columns)], axis=1)
x_test = pd.concat([x_test, pd.DataFrame(x_test_embarked_encoded, columns=embarked_encoded_columns)], axis=1)

['S' 'C' 'Q']


In [267]:
print(x_train.keys())

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
       'Title', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


##### 'Sex' Feature
Binary encoding of categorical data.

In [268]:
# Binary encoding of 'Sex'
train_df['Sex'] = train_df['Sex'].replace({'male': 0, 'female': 1})
test_df['Sex'] = test_df['Sex'].replace({'male': 0, 'female': 1})

##### 'Title' Feature
One hot encoding of categorical data

In [269]:
# Regroup title categories

def group_titles(title):
    if title in ['Mr', 'Miss', 'Mrs', 'Master']:
        return title
    else:
        return 'Other'
    
x_train['Title'] = x_train['Title'].apply(group_titles)
x_test['Title'] = x_test['Title'].apply(group_titles)


# Apply one-hot-encoding
x_train = pd.get_dummies(x_train, columns=['Title'])
x_test = pd.get_dummies(x_test, columns=['Title'])


In [270]:
print(x_train.keys())
print(x_test.keys())

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Other'],
      dtype='object')
Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Title_Other'],
      dtype='object')


#### Transforming Numeric Features

In [272]:
# Calculate summary statistics and missing values for numeric features
print("Numeric Features:")
print(x_train[numeric_cols].describe())

null_counts = x_train[numeric_cols].isnull().sum()
null_percentages = null_counts / len(x_train) * 100
print("\nMissing Values:")
for feature, count, percentage in zip(numeric_cols, null_counts, null_percentages):
    print(f"{feature}: {count} ({percentage:.2f}%)")

Numeric Features:
           Pclass         Age       SibSp       Parch        Fare
count  891.000000  714.000000  891.000000  891.000000  891.000000
mean     2.308642   29.699118    0.523008    0.381594   32.204208
std      0.836071   14.526497    1.102743    0.806057   49.693429
min      1.000000    0.420000    0.000000    0.000000    0.000000
25%      2.000000   20.125000    0.000000    0.000000    7.910400
50%      3.000000   28.000000    0.000000    0.000000   14.454200
75%      3.000000   38.000000    1.000000    0.000000   31.000000
max      3.000000   80.000000    8.000000    6.000000  512.329200

Missing Values:
Pclass: 0 (0.00%)
Age: 177 (19.87%)
SibSp: 0 (0.00%)
Parch: 0 (0.00%)
Fare: 0 (0.00%)
