In [58]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

## Exploratory Analysis

### Data Structure and Target Identification

The dataset's features and datatypes (along with their descriptions via Kaggle) are as follows:

> 1. **PassengerId (int)**: passenger id
> 2. **Survived (int)**: passenger survival status (0 = No, 1 = Yes)
> 3. **Pclass (int)**: ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)
> 4. **Name (string)**: passenger name
> 5. **Sex (string)**: sex
> 6. **Age (float)**: age in years
> 7. **SibSp (int)**: # of siblings / spouses aboard the Titanic
> 8. **Parch (int)**: # of parents / children aboard the Titanic
> 9. **Ticket (string)**: ticket number
> 10. **Fare (float)**: passenger fare
> 11. **Cabin (string)**: cabin number 	
> 12. **Embarked (string)**: port of Embarkation 

The target feature is `Survived`. Since it is discrete (can only be 0 or 1) we are building a classification model.

---

In [59]:
# Loading the Kaggle training and test CSV files into DataFrames
df_train, df_test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

### Removing Irrelevant Columns

The columns that I chose to remove for this model were `Name`, `Ticket`, and `Cabin`. There are probably some valuable insights that the model can gain from properly using these columns, but my goal for this iteration was to build a relatively straightforward model that would predict whether a passenger survived.

We can also drop `PassengerId` from the training set since it carries no predictive information, but it needs to stay in the test set because the submission file pairs each prediction with its `PassengerId`.

In [60]:
# Free-text / identifier columns that this first model does not use
text_columns = ['Name', 'Ticket', 'Cabin']

# PassengerId is removed from the training set only; the test set keeps it
# because the submission file needs it.
df_train = df_train.drop(columns=['PassengerId', *text_columns])
df_test = df_test.drop(columns=text_columns)

### Removing Empty Rows

Other strategies could be used to keep these problematic entries under consideration (e.g., imputation), but for this first model I will simply remove any training rows that contain empty values.

In [61]:
# Keep only the training rows in which every remaining column has a value
df_train = df_train.dropna(axis='index', how='any')

### Splitting the Features and the Target

We can only use `train.csv` to train and evaluate our model. `test.csv` has no `Survived` column, so it can only be used for the final submission. We will therefore hold out 20% of `df_train` for validation and train on the remaining 80%.

In [62]:
# Target vector (Survived) and feature matrix (every other column)
y = df_train['Survived']
X = df_train.drop(columns='Survived')

# 80/20 train/validation split; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Data Preprocessing

The remaining features (excluding the target) are `Pclass`, `Sex`, `Age`, `SibSp`, `Parch`, `Fare`, and `Embarked`. (`PassengerId` was dropped from the training set and is kept in the test set only to label the submission rows.)

### Encoding Categorical Variables

Of the remaining features, `Sex`, `Pclass`, and `Embarked` are categorical. Because `Sex` and `Embarked` have no natural order, they will be one-hot encoded. Because `Pclass` is ordered (i.e., presumably higher-class passengers received preferential treatment), it will be treated as ordinal data and kept as a single integer column.

In [63]:
# Pclass is treated as an ordinal feature, so make sure it is stored as an integer.
# `assign` returns new frames instead of writing a column into the objects handed
# back by train_test_split; depending on the pandas / scikit-learn versions, that
# in-place write can raise SettingWithCopyWarning. The resulting values and the
# column order are unchanged.
X_train = X_train.assign(Pclass=X_train['Pclass'].astype(int))
X_val = X_val.assign(Pclass=X_val['Pclass'].astype(int))

# Nominal features (no natural order) that get one-hot encoded
categorical_features = ['Sex', 'Embarked']

# The encoder is fitted on the training split only, to prevent data leakage.
# handle_unknown='ignore' encodes categories unseen during fitting as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(X_train[categorical_features])

# Names of the generated indicator columns
encoded_cols = encoder.get_feature_names_out(categorical_features)

# Encode both splits; the original row index is kept so the indicator columns
# line up with the remaining features when concatenated below.
X_train_encoded = pd.DataFrame(
    encoder.transform(X_train[categorical_features]),
    columns=encoded_cols,
    index=X_train.index,
)
X_val_encoded = pd.DataFrame(
    encoder.transform(X_val[categorical_features]),
    columns=encoded_cols,
    index=X_val.index,
)

# Replace the raw categorical columns with their one-hot encoded counterparts
X_train = pd.concat([X_train.drop(columns=categorical_features), X_train_encoded], axis=1)
X_val = pd.concat([X_val.drop(columns=categorical_features), X_val_encoded], axis=1)

In [64]:
# Store Pclass as an integer in the test set as well
df_test = df_test.assign(Pclass=df_test['Pclass'].astype(int))

# One-hot encode the test set with the encoder fitted on the training data
test_encoded = encoder.transform(df_test[categorical_features])
df_test_encoded = pd.DataFrame(test_encoded, index=df_test.index, columns=encoded_cols)

# Swap the raw categorical columns for their encoded counterparts
test_without_categoricals = df_test.drop(columns=categorical_features)
df_test = pd.concat([test_without_categoricals, df_test_encoded], axis=1)

### Normalizing/Standardizing Numeric Features

The features that need to be scaled are `Age`, `Fare`, `Pclass`, `SibSp`, and `Parch`. Because `Age` and `Fare` have more potential to contain outliers, we will standardize them. `Pclass`, `SibSp`, and `Parch` are much more contained so they will simply be normalized. 

The standardization will be accomplished with a `StandardScaler` and the normalization will be done through a `MinMaxScaler`.

In [65]:
# Column groups: Age/Fare are standardized, the small-range counts are min-max normalized
standardize_features = ['Age', 'Fare']
normalize_features = ['Pclass', 'SibSp', 'Parch']

# One scaler per column group
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Each scaler is fitted on the training split only and then applied to the validation split
for scaler, columns in (
    (standard_scaler, standardize_features),
    (minmax_scaler, normalize_features),
):
    X_train[columns] = scaler.fit_transform(X_train[columns])
    X_val[columns] = scaler.transform(X_val[columns])

In [66]:
# Scaling the numeric features in the test set with the scalers fitted on the training data
# (both scalers pass NaN values through unchanged in transform)
df_test[standardize_features] = standard_scaler.transform(df_test[standardize_features])
df_test[normalize_features] = minmax_scaler.transform(df_test[normalize_features])

# Rows with missing values were dropped from the training data only; the test set
# has to keep every passenger for the submission. Any gaps in its numeric features
# are therefore filled explicitly with the training median of the same feature
# (X_train is already scaled at this point, so the medians are in scaled units).
# Without this, NaNs would reach predict(), which raises on scikit-learn < 1.3 and
# is silently routed to the larger child node on newer versions.
numeric_features = standardize_features + normalize_features
df_test[numeric_features] = df_test[numeric_features].fillna(X_train[numeric_features].median())

## Model Development and Optimization

### Picking the Model

The decision tree model had a fractionally better F1 score than K-NN and Logistic Regression models so I decided to move forward with it.

In [67]:
# Baseline: a decision tree with default hyperparameters (seeded for reproducibility),
# trained on the preprocessed training split
dt_model = DecisionTreeClassifier(random_state=1)
dt_model.fit(X=X_train, y=y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Hyperparameter Optimization

I decided to do a 15-fold cross-validation due to the relatively small size of the data I was working with. When I ran the tests on my computer the best hyperparameters turned out to be:

`{'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 5, 'min_samples_split': 20}`

The result may differ when the notebook is run on another computer.

In [68]:
# Hyperparameter values explored for the decision tree
param_grid = dict(
    criterion=['gini', 'entropy', 'log_loss'],
    max_depth=range(1, 21),
    min_samples_split=[2, 5, 10, 20, 50],
    min_samples_leaf=[1, 2, 4, 5, 10, 20],
)

# Exhaustive search over the grid: 15-fold CV, F1 as the selection metric,
# all available CPU cores
base_tree = DecisionTreeClassifier(random_state=1)
grid_search = GridSearchCV(base_tree, param_grid, scoring='f1', cv=15, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning combination and its mean cross-validated F1 score
print("Best hyperparameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

# Evaluate the refitted best tree on the held-out validation split
best_dt_model = grid_search.best_estimator_
y_val_pred = best_dt_model.predict(X_val)
validation_report = classification_report(y_val, y_val_pred)
print("Validation Performance of Best Decision Tree:", validation_report, sep="\n")

Best hyperparameters: {'criterion': 'entropy', 'max_depth': 11, 'min_samples_leaf': 5, 'min_samples_split': 20}
Best CV score: 0.759244835452659
Validation Performance of Best Decision Tree:
              precision    recall  f1-score   support

           0       0.81      0.84      0.82        85
           1       0.75      0.71      0.73        58

    accuracy                           0.78       143
   macro avg       0.78      0.77      0.77       143
weighted avg       0.78      0.78      0.78       143



## Generation of Submission File

At this point we have the best decision tree that the grid search could find within the hyperparameter ranges and features we allowed it to use.

The code below runs the optimized model on the test data, and generates a CSV with its predictions on the unseen data.

In [69]:
# PassengerId only identifies the rows; it is not a model feature, so it is
# removed before predicting.
test_features = df_test.drop(columns=['PassengerId'])
test_pred = best_dt_model.predict(test_features)

# Pair every PassengerId with its predicted Survived label
submission = df_test[['PassengerId']].assign(Survived=test_pred)

# Write the submission file without the DataFrame index
submission.to_csv('data/submission.csv', index=False)