In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Recreate the preprocessing step, keeping only the feature logic used by the best model so far

def preprocess_features_best(df):
    """Build the model-ready feature frame from a raw passenger DataFrame.

    The input must provide the columns ``Pclass``, ``Sex``, ``Age``, ``Fare``,
    ``SibSp``, ``Parch``, ``Embarked`` and ``Name``. Any other columns are
    passed through unless they appear in the drop list at the end.

    Steps, in order:
      1. Fill missing ``Age`` / ``Fare`` with the median of the row's
         (``Pclass``, ``Sex``) group.
      2. Add ``Fare_log`` = log(Fare + 1).
      3. Add ``FamilySize`` = SibSp + Parch + 1 and bin it into
         ``FamilySizeGroup`` ('Alone' / 'Small' / 'Large').
      4. Fill missing ``Embarked`` with the most frequent value.
      5. Extract ``Title`` from ``Name`` and fold uncommon titles together.
      6. Add ``SoloTraveller``, ``Fare_per_person_log``, ``Pclass_Fare`` and
         ``Pclass_Title``.
      7. One-hot encode the categorical columns (``drop_first=True``) and
         drop the raw columns that are no longer needed.

    Notes:
      * All medians and the ``Embarked`` mode are computed from the frame
        passed in, NOT from a separate training set.
      * The one-hot columns are derived from the categories present in
        ``df``, so two different frames can yield different column sets;
        callers must align them before predicting.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the engineered features.
    """
    # Work on a private copy so the caller's frame is never altered.
    df = df.copy()

    group_keys = ['Pclass', 'Sex']

    # 1. Fill missing Age and Fare (Fare must be filled BEFORE the log transform)
    df['Age'] = _fill_with_group_median(df, 'Age', group_keys)
    df['Fare'] = _fill_with_group_median(df, 'Fare', group_keys)

    # 2. Log transform Fare
    df['Fare_log'] = np.log(df['Fare'] + 1)

    # 3. Create FamilySize (the passenger counts as one member)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # 4. Create FamilySizeGroup (manual binning): 1 -> Alone, 2..4 -> Small, else Large
    df['FamilySizeGroup'] = np.where(
        df['FamilySize'] == 1,
        'Alone',
        np.where(df['FamilySize'] <= 4, 'Small', 'Large'),
    )

    # 5. Fill missing Embarked (required for one-hot encoding).
    #    mode() is empty when every value is missing, so guard the lookup.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # 6. Extract Title from Name: the text between the comma and the first period
    df['Title'] = df['Name'].str.extract(r',\s*([^\.]*)\s*\.', expand=False).str.strip()

    # Consolidate rare titles. This has to run on the raw titles (with their
    # spaces), otherwise 'the Countess' would no longer match its key.
    df['Title'] = df['Title'].replace({
        'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
        'Lady': 'Rare', 'the Countess': 'Rare', 'Capt': 'Rare', 'Col': 'Rare',
        'Don': 'Rare', 'Dr': 'Rare', 'Major': 'Rare', 'Rev': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare', 'Dona': 'Rare'
    })

    # Only now replace spaces inside any remaining title. Series.replace would
    # compare whole cell values, so the substring-aware .str.replace is needed.
    df['Title'] = df['Title'].str.replace(' ', '_', regex=False)

    # 7. Create SoloTraveller
    df['SoloTraveller'] = (df['FamilySize'] == 1).astype(int)

    # 8. Create Fare_per_person, treating infinities as missing before filling
    df['Fare_per_person'] = (df['Fare'] / df['FamilySize']).replace(
        [np.inf, -np.inf], np.nan
    )
    df['Fare_per_person'] = _fill_with_group_median(df, 'Fare_per_person', group_keys)

    # Log transform to reduce skew
    df['Fare_per_person_log'] = np.log(df['Fare_per_person'] + 1)

    # 9. Create PClass x Fare (log)
    df['Pclass_Fare'] = df['Pclass'] * df['Fare_per_person_log']
    df['Pclass_Fare'] = _fill_with_group_median(df, 'Pclass_Fare', group_keys)

    # 10. Create PClass x Title
    df['Pclass_Title'] = (
        df['Pclass'].astype(str) + "_" + df['Title']
    ).str.replace(' ', '_', regex=False)

    # One-hot encode the categorical fields
    df = pd.get_dummies(
        df,
        columns=['Sex', 'Embarked', 'FamilySizeGroup', 'Title', 'SoloTraveller', 'Pclass_Title'],
        drop_first=True,
    )

    # Convert any boolean columns to 0/1 integers
    bool_cols = df.select_dtypes(include='bool').columns
    df[bool_cols] = df[bool_cols].astype(int)

    # Drop raw columns that are no longer needed; errors='ignore' skips absent ones
    drop_cols = ['Name', 'Ticket', 'Cabin', 'Fare', 'SibSp', 'Parch', 'PassengerId', 'Fare_per_person']
    df = df.drop(columns=drop_cols, errors='ignore')

    return df


def _fill_with_group_median(df, column, keys):
    """Return ``df[column]`` with missing values filled by group medians.

    Each missing value is replaced by the median of its ``keys`` group. If a
    group has no observed value at all, its median is NaN, so whatever is
    still missing afterwards is filled with the overall median of the column.
    """
    filled = df.groupby(list(keys))[column].transform(
        lambda grp: grp.fillna(grp.median())
    )
    return filled.fillna(filled.median())

In [3]:
# Read the raw train/test CSV files from the sibling "data" directory.
data_dir = os.path.join(os.pardir, "data")
train_raw = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))

# Run the feature engineering on copies so the raw frames stay available as loaded.
train_cleaned, test_cleaned = (
    preprocess_features_best(frame.copy()) for frame in (train_raw, test)
)

In [4]:
from sklearn.model_selection import train_test_split

# Separate the target column from the engineered features.
y = train_cleaned['Survived']
X = train_cleaned.drop(columns=['Survived'])

# Hold out 20% of the rows for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyper-parameters: 100 trees, depth capped at 5 to control overfitting,
# fixed seed for reproducible results.
rf_params = {
    "n_estimators": 100,
    "max_depth": 5,
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit on the training split, then predict the held-out validation rows.
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_valid)

In [6]:
# Score the validation predictions once, then report each metric.
valid_accuracy = accuracy_score(y_valid, rf_preds)
valid_confusion = confusion_matrix(y_valid, rf_preds)
valid_report = classification_report(y_valid, rf_preds)

print("Accuracy:", valid_accuracy)
print("Confusion Matrix:\n", valid_confusion)
print("Classification Report:\n", valid_report)

Accuracy: 0.8100558659217877
Confusion Matrix:
 [[96 14]
 [20 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [7]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the same estimator configuration on the full
# training matrix; report the mean of the per-fold scores.
cv_scores = cross_val_score(rf_model, X, y, cv=5)
cv_mean_accuracy = cv_scores.mean()
print("CV Accuracy (mean):", cv_mean_accuracy)

CV Accuracy (mean): 0.8271420500910175


In [8]:
# Predict on test data
test_df = pd.read_csv(os.path.join(os.pardir, "data", "test.csv"))

# Align test features with the training matrix. The one-hot columns are built
# separately for each frame, so the test frame may lack a dummy column that
# train has (or carry an extra one). reindex() fills absent columns with 0,
# drops extras and enforces the training column order, where plain column
# selection would raise a KeyError.
X_test = test_cleaned.reindex(columns=X.columns, fill_value=0)

test_preds_rf = rf_model.predict(X_test)

# The predictions must line up one-to-one with the passenger ids.
assert len(test_preds_rf) == len(test_df), "prediction / PassengerId length mismatch"

submission_rf = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_preds_rf
})

# Create the output directory if needed so to_csv does not fail on a fresh checkout.
submission_path = r'../models/random-forest/submission_rf.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_rf.to_csv(submission_path, index=False)