In [236]:
# Task 1: Data Loading and Initial Exploration

import pandas as pd

titanic_data = pd.read_excel("/Users/clairestewart/ML-fundamentals-2025/titanic3.xls")

# Statistical description of the data to see trends in the dataset 
print(titanic_data.describe())

# need to figure out ways to visualize data 

            pclass     survived          age        sibsp        parch  \
count  1309.000000  1309.000000  1046.000000  1309.000000  1309.000000   
mean      2.294882     0.381971    29.881135     0.498854     0.385027   
std       0.837836     0.486055    14.413500     1.041658     0.865560   
min       1.000000     0.000000     0.166700     0.000000     0.000000   
25%       2.000000     0.000000    21.000000     0.000000     0.000000   
50%       3.000000     0.000000    28.000000     0.000000     0.000000   
75%       3.000000     1.000000    39.000000     1.000000     0.000000   
max       3.000000     1.000000    80.000000     8.000000     9.000000   

              fare        body  
count  1308.000000  121.000000  
mean     33.295479  160.809917  
std      51.758668   97.696922  
min       0.000000    1.000000  
25%       7.895800   72.000000  
50%      14.454200  155.000000  
75%      31.275000  256.000000  
max     512.329200  328.000000  


In [237]:
# Task 2: Managing Missing Values

missing_values = titanic_data.isnull()
# print(missing_values.sum())

# Before any data is changed the columns with missing data are: age, fare, cabin, embarked, boat, body, and home.dest 

# Age:
# Filling in the missing values with the mean age. Median is similar to mean, so either would work. 
mean_age = titanic_data['age'].mean()
titanic_data['age'] = titanic_data['age'].fillna(mean_age)

# Fare
# Filling in fares with the mode 
mean_fare =  titanic_data['fare'].mean()
titanic_data['fare'] = titanic_data['fare'].fillna(mean_fare)

# Home.Dest
home_mode = titanic_data['home.dest'].mode()
titanic_data['home.dest'] = titanic_data['home.dest'].fillna(str(home_mode))

# Embarked
embark_mode = titanic_data['embarked'].mode()[0]
titanic_data['embarked'] = titanic_data['embarked'].fillna(str(embark_mode))

# Boat
titanic_data['boat'] = titanic_data['boat'].fillna(-1)


# Dropping values
titanic_data.drop(columns=['body', 'cabin'], inplace=True, errors='ignore')

print(missing_values.sum())



pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64


In [238]:
# Task 3: Encoding Categorical Variables

from sklearn.preprocessing import OneHotEncoder

# Identifying Categorical Variables:
# Columns: pclass, sex, embarked, boat, home.dest

# Performing One Hot Encoding on pclass, sex, and embarked 
pd.get_dummies(titanic_data, columns=['pclass', 'sex', 'embarked'])

# Observe the transformation and discuss its impact on machine learning models

Unnamed: 0,survived,name,age,sibsp,parch,ticket,fare,boat,home.dest,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,"Allen, Miss. Elisabeth Walton",29.000000,0,0,24160,211.3375,2,"St Louis, MO",True,False,False,True,False,False,False,True
1,1,"Allison, Master. Hudson Trevor",0.916700,1,2,113781,151.5500,11,"Montreal, PQ / Chesterville, ON",True,False,False,False,True,False,False,True
2,0,"Allison, Miss. Helen Loraine",2.000000,1,2,113781,151.5500,-1,"Montreal, PQ / Chesterville, ON",True,False,False,True,False,False,False,True
3,0,"Allison, Mr. Hudson Joshua Creighton",30.000000,1,2,113781,151.5500,-1,"Montreal, PQ / Chesterville, ON",True,False,False,False,True,False,False,True
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.000000,1,2,113781,151.5500,-1,"Montreal, PQ / Chesterville, ON",True,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,0,"Zabour, Miss. Hileni",14.500000,1,0,2665,14.4542,-1,"0 New York, NY\nName: home.dest, dtype: object",False,False,True,True,False,True,False,False
1305,0,"Zabour, Miss. Thamine",29.881135,1,0,2665,14.4542,-1,"0 New York, NY\nName: home.dest, dtype: object",False,False,True,True,False,True,False,False
1306,0,"Zakarian, Mr. Mapriededer",26.500000,0,0,2656,7.2250,-1,"0 New York, NY\nName: home.dest, dtype: object",False,False,True,False,True,True,False,False
1307,0,"Zakarian, Mr. Ortin",27.000000,0,0,2670,7.2250,-1,"0 New York, NY\nName: home.dest, dtype: object",False,False,True,False,True,True,False,False


In [239]:
# Task 4: Feature Scaling

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Numerical Columns:
# age, fare, sibsp, parch

# Apply StandardScaler to numerical columns
standard = StandardScaler()
titanic_data[['age', 'fare', 'sibsp', 'parch']] = standard.fit_transform(titanic_data[['age', 'fare', 'sibsp', 'parch']])

titanic_data.head()








Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,boat,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,-0.06842,-0.479087,-0.445,24160,3.44248,S,2,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,-2.249092,0.481288,1.866526,113781,2.286476,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,-2.164974,0.481288,1.866526,113781,2.286476,S,-1,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,0.00923,0.481288,1.866526,113781,2.286476,S,-1,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,-0.379021,0.481288,1.866526,113781,2.286476,S,-1,"Montreal, PQ / Chesterville, ON"


In [240]:
# Apply MinMaxScaler to numerical columns
min_max = MinMaxScaler()
titanic_data[['age', 'fare', 'sibsp', 'parch']] = min_max.fit_transform(titanic_data[['age', 'fare', 'sibsp', 'parch']])

titanic_data.head()

# Discuss the differences and importance of both

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,embarked,boat,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,0.361169,0.0,0.0,24160,0.412503,S,2,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.009395,0.125,0.222222,113781,0.295806,S,11,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,0.022964,0.125,0.222222,113781,0.295806,S,-1,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,0.373695,0.125,0.222222,113781,0.295806,S,-1,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,0.311064,0.125,0.222222,113781,0.295806,S,-1,"Montreal, PQ / Chesterville, ON"


In [254]:
# Task 5: Data Splitting 

import numpy as np
from sklearn.model_selection import train_test_split

# x = titanic_data.drop(columns=['survived'])
y = titanic_data['survived']

# First split: split into training data and others
titanic_train, titanic_others = train_test_split(titanic_data, 
                                           test_size=0.4,  
                                           stratify=y, 
                                           random_state=42)

# Second split: split others into test and validation
titanic_validation, titanic_test = train_test_split(titanic_others, 
                                              test_size=0.5,  
                                              stratify=titanic_others['survived'], 
                                              random_state=42)

print("Training set size:", titanic_train.shape)
print("Validation set size:", titanic_validation.shape)
print("Test set size:", titanic_test.shape)


Training set size: (785, 12)
Validation set size: (262, 12)
Test set size: (262, 12)


In [None]:
# Task 6: Addressing Class Imbalance 
