In [1]:
# Kaggle Titanic Machine Learning Model: Basic Implementation
# Studying and possibly tweaking this sample solution
# https://medium.com/@sinha.raunak/kaggle-titanic-machine-learning-model-basic-implementation-363c7f073d70

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
import torch
from torch import tensor
import time

In [2]:
# Create and load train and test datasets from csv files

train_file_path = "../data/titanic_train.csv"
test_file_path = "../data/titanic_test.csv"

train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

print(f"TRAIN DF\n{train_df.head().to_string()}\n")
# print(f"TEST DF\n{test_df.head().to_string()}")

TRAIN DF
   PassengerId  Survived  Pclass                                                 Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                              Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                               Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                             Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S



In [None]:
# Get info on the features

train_df.info()

In [None]:
# Summary plot for sex

num_male_pasengers = len(train_df[(train_df['Sex'] == 'male')])
num_male_survivors = train_df[(train_df['Survived'] == 1) & (train_df['Sex'] == 'male')]['Survived'].sum()
male_survival_rate = num_male_survivors / num_male_pasengers
print(f"  Male Survival Rate: {male_survival_rate:.8f} ({round(male_survival_rate, 2)})")

num_female_pasengers = len(train_df[(train_df['Sex'] == 'female')])
num_female_survivors = train_df[(train_df['Survived'] == 1) & (train_df['Sex'] == 'female')]['Survived'].sum()
female_survival_rate = num_female_survivors / num_female_pasengers
print(f"Female Survival Rate: {female_survival_rate:.8f} ({round(female_survival_rate, 2)})")

plt.bar(['Male Survival Rate', 'Femal Survival Rate'], [male_survival_rate, female_survival_rate])

In [None]:
# Summary plot for Passenger Class

# get passenger counts for each class
num_pclass1 = len(train_df[(train_df['Pclass'] == 1)])
num_pclass2 = len(train_df[(train_df['Pclass'] == 2)])
num_pclass3 = len(train_df[(train_df['Pclass'] == 3)])

print("Number of Pclass1:", num_pclass1)
print("Number of Pclass1:", num_pclass2)
print("Number of Pclass3:", num_pclass3)
print("--")

# get survival counts for each class
survived_pclass1 = train_df[(train_df['Survived'] == 1) & (train_df['Pclass'] == 1)]['Survived'].sum()
survived_pclass2 = train_df[(train_df['Survived'] == 1) & (train_df['Pclass'] == 2)]['Survived'].sum()
survived_pclass3 = train_df[(train_df['Survived'] == 1) & (train_df['Pclass'] == 3)]['Survived'].sum()

print("Survived Pclass 1:", survived_pclass1)
print("Survived Pclass 2:", survived_pclass2)
print("Survived Pclass 3:", survived_pclass3)
print("--")

# get average survival rate for each class
survive_rate_pc1 = survived_pclass1 / num_pclass1
survive_rate_pc2 = survived_pclass2 / num_pclass2
survive_rate_pc3 = survived_pclass3 / num_pclass3

print(f"Survival rate Pclass1: {survive_rate_pc1:.3f}")
print(f"Survival rate Pclass2: {survive_rate_pc2:.3f}")
print(f"Survival rate Pclass3: {survive_rate_pc3:.3f}")

plt.bar(['Survival Rate Pclass1', 'Survival Rate Pclass2', 'Survival Rate Pclass3'], \
        [survive_rate_pc1, survive_rate_pc2, survive_rate_pc3])

In [None]:
# Data Cleansing

# what do the numerical variables tell us?
train_df.describe()

In [3]:
# Impute values for missing data in AGE variable

# let's use age "mode" (24.0) to fill in the null values
age_mode = train_df['Age'].mode().iloc[0]
print(type(age_mode))
print(age_mode)

train_df['Age'] = train_df['Age'].fillna(age_mode)
print(train_df.head(10).to_string())

<class 'numpy.float64'>
24.0
   PassengerId  Survived  Pclass                                                 Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1         0       3                              Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2         1       1  Cumings, Mrs. John Bradley (Florence Briggs Thayer)  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3         1       3                               Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4         1       1         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5         0       3                             Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
5            6         0       3   

In [4]:
# Impute values for missing data in EMBARKED variable

# let's use the embarked mode to fill in the two missing values
embarked_mode = train_df['Embarked'].mode().iloc[0]
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)

In [None]:
# Take a fresh look at the dataframe stats
# Looks like all variables have full set of values (count = 891)

train_df.describe()

In [None]:
# Just for kicks, let's see if this cleansing and imputing helps our original score
# Let's do some AI/ML stuff - building a random forest model
# Same score (0.77511)

y = train_df["Survived"]                        # target variable (did the passenger survive)
features = ["Pclass", "Sex", "SibSp", "Parch", "Fare"]  # attributes from the input dataset

# transform the dataset into a machine learning friendly DataFrame format
X = pd.get_dummies(train_df[features])          # features dataframe (after one-hot encoding)
X_test = pd.get_dummies(test_df[features])      # features dataframe (after one-hot encoding)

model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerID': test_df.PassengerId, 'Survived': predictions})
output.to_csv('../output/titanic_20250918_02.csv', index=False)
print("Your submission was successfully saved!")

In [5]:
# Next up, Normalization of certain columns
# To-Do: learn more about normalization

# normalize age & fare for both datasets
max_age_train = train_df['Age'].max()
train_df['N_Age'] = train_df['Age']/max_age_train
train_df['N_Fare'] = np.log(train_df['Fare'] + 1) # add 1 so that N_Fare is never negative

max_age_test = train_df['Age'].max()
test_df['N_Age'] = test_df['Age']/max_age_test
test_df['N_Fare'] = np.log(test_df['Fare'] + 1) # add 1 so that N_Fare is never negative

In [None]:
# Try again using the normalized age and normalized fare

# Just for kicks, let's see if this cleansing and imputing helps our original score
# Let's do some AI/ML stuff - building a random forest model
# Improved score (0.78221)

y = train_df["Survived"]                        # target variable (did the passenger survive)
features = ["Pclass", "Sex", "SibSp", "Parch", "N_Fare", "N_Age"]  # attributes from the input dataset

# transform the dataset into a machine learning friendly DataFrame format
X = pd.get_dummies(train_df[features])          # features dataframe (after one-hot encoding)
X_test = pd.get_dummies(test_df[features])      # features dataframe (after one-hot encoding)

model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerID': test_df.PassengerId, 'Survived': predictions})
output.to_csv('../output/titanic_20250918_03.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
print(train_df.columns)

In [None]:
# One Hot Coding for Categoricals
# Categorical variables such as 'Sex', 'Pclass', and 'Embarked' need to be encoded into numerical format.
# We can achieve this by using the get_dummies function from pandas, which creates binary columns for
# each unique level of these categorical variables, converting them into a series of 0/1 indicators.

# create numeric vars for categorical vars
cat_columns = ['Sex', 'Pclass', 'Embarked']
train_df = pd.get_dummies(data=train_df, columns=cat_columns, dtype=int)
test_df  = pd.get_dummies(data=test_df,  columns=cat_columns, dtype=int)

print(train_df)

In [None]:
print(train_df.columns)
print(test_df.columns)

In [None]:
# Try again, including "Embarked" variable
# Skip the above one-hot encoding, since our submission code below does that
# Improved score (0.78709)

y = train_df["Survived"]                        # target variable (did the passenger survive)
features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked", "N_Fare", "N_Age"]  # attributes from the input dataset

# transform the dataset into a machine learning friendly DataFrame format
X = pd.get_dummies(train_df[features])          # features dataframe (after one-hot encoding)
X_test = pd.get_dummies(test_df[features])      # features dataframe (after one-hot encoding)

model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerID': test_df.PassengerId, 'Survived': predictions})
output.to_csv('../output/titanic_20250919_01.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
