In [1]:
# Record the interpreter version this notebook was executed with.
!python --version

Python 3.10.12


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing dataset

In [3]:
# Load the dataset with read_csv(); it ships already split into a labelled
# training file, a test-features file, and a file used here as test labels.
# NOTE(review): 'gender_submission.csv' looks like a sample-submission file
# rather than ground-truth labels — confirm before scoring against y_test.
train, X_test, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [4]:
# Report the dimensions of every loaded frame as a quick sanity check.
train_shape = train.shape
test_shapes = (X_test.shape, y_test.shape)
print('Training dataset shape: {}'.format(train_shape))
print('X_test shape: {}, y_test shape: {}'.format(*test_shapes))

Training dataset shape: (891, 12)
X_test shape: (418, 11), y_test shape: (418, 2)


In [5]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Separate the target column ('Survived') from the predictor columns.
y = train['Survived']
X = train.drop(columns=['Survived'])

In [7]:
from sklearn.model_selection import train_test_split

# Split the labelled data into training and validation sets (20% held out);
# the fixed random_state keeps the split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Preprocessing dataset

In [8]:
# Get familiar with the features: column dtypes and non-null counts
# (reveals which columns contain missing values).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          572 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        159 non-null    object 
 10  Embarked     710 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [9]:
# Columns passed on to the model; all remaining columns are considered not useful.
num_features = [  # useful numerical features
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]
cat_features = [  # useful categorical features
    'Sex',
    'Embarked',
]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Numerical branch: replace missing values with the column mean,
# then standardise each feature.
num_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' makes transform() encode a category that was not
# seen during fit as all zeros for that feature instead of raising, so the
# fitted pipeline can be applied to validation/test data without crashing.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

In [11]:
from sklearn.compose import ColumnTransformer

# Route each feature group through its own pipeline and concatenate the results.
# Columns not listed in either group are dropped (ColumnTransformer's default remainder).
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_features),
        ('cat', cat_pipeline, cat_features),
    ]
)

## Creating a classifier

In [16]:
from sklearn.tree import DecisionTreeClassifier

# Fit the preprocessing on the training split only, then apply the fitted
# transformers to the validation split.
X_train_preprocessed = preprocessing.fit_transform(X_train)
X_val_preprocessed = preprocessing.transform(X_val)

# Baseline model: a single decision tree with a fixed seed.
dt = DecisionTreeClassifier(random_state=42, criterion='gini')

# fit() returns the estimator itself, so training and prediction can be chained.
dt_prediction = dt.fit(X_train_preprocessed, y_train).predict(X_val_preprocessed)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the decision tree on the validation split, one metric per line.
for template, metric in (
    ('Accuracy: {:.2f}', accuracy_score),
    ('Precision: {:.2f}', precision_score),
    ('Recall: {:.2f}', recall_score),
    ('F1: {:.2f}', f1_score),
):
    print(template.format(metric(y_val, dt_prediction)))

Accuracy: 0.79
Precision: 0.75
Recall: 0.76
F1: 0.75


In [14]:
# This result is reasonable, considering we did not tune any hyperparameters

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive search over random-forest hyper-parameters, selecting by F1
# with 4-fold cross-validation on the preprocessed training split.
# NOTE(review): X_train_preprocessed was imputed/scaled using the whole
# training split, so the CV folds share those statistics; search over a
# Pipeline that includes the preprocessing if a stricter estimate is needed.
param_grid = {
    'n_estimators': [50, 100, 200],
    'criterion': ['gini', 'log_loss'],
    'max_depth': [None, 3, 5, 7],
}
forest = RandomForestClassifier(random_state=42)

grid = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring='f1',
    cv=4,
)
grid.fit(X_train_preprocessed, y_train)
print(f'Best params: {grid.best_params_}\nBest F1 score: {grid.best_score_}')

Best params: {'criterion': 'gini', 'max_depth': 5, 'n_estimators': 50}
Best F1 score: 0.7571863901383408


In [26]:
# Use the model selected by the grid search instead of re-typing its
# hyper-parameters by hand. With GridSearchCV's default refit=True,
# best_estimator_ has already been refit on the full training split with the
# best parameters and keeps random_state=42 from the searched estimator, so
# the validation results are reproducible and cannot drift out of sync with
# the search output.
forest = grid.best_estimator_

# Predict on the held-out validation split.
forest_prediction = forest.predict(X_val_preprocessed)

In [27]:
# Score the random forest on the validation split.
forest_accuracy = accuracy_score(y_val, forest_prediction)
print('Accuracy: {:.2f}'.format(forest_accuracy))

forest_precision = precision_score(y_val, forest_prediction)
print('Precision: {:.2f}'.format(forest_precision))

forest_recall = recall_score(y_val, forest_prediction)
print('Recall: {:.2f}'.format(forest_recall))

forest_f1 = f1_score(y_val, forest_prediction)
print('F1: {:.2f}'.format(forest_f1))

Accuracy: 0.82
Precision: 0.84
Recall: 0.69
F1: 0.76


In [None]:
# The performance of this model is reasonably good, considering how small and noisy the Titanic dataset is