In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
train_df = pd.read_csv(os.path.join(os.pardir,"data","train_cleaned.csv"))

# Define features and target
X = train_df.drop(columns='Survived')
y = train_df['Survived']

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

In [5]:
rf = RandomForestClassifier(random_state=42)

rs = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=50,               # Try ~50 combinations
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

rs.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


90 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
59 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/ebixg/Documents/Development/evan-gloria/git_repo/titanic-survival-prediction/.ds_venv/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ebixg/Documents/Development/evan-gloria/git_repo/titanic-survival-prediction/.ds_venv/lib/python3.13/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/ebixg/Documents/Development/evan-gl

In [6]:
print("Best Parameters:", rs.best_params_)
best_rf = rs.best_estimator_

# Evaluate on validation set
y_pred = best_rf.predict(X_valid)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_valid, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred))
print("Classification Report:\n", classification_report(y_valid, y_pred))

Best Parameters: {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 497}
Accuracy: 0.8100558659217877
Confusion Matrix:
 [[96 14]
 [20 49]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [7]:
# Predict on test data
test_cleaned = pd.read_csv(os.path.join(os.pardir,"data","test_cleaned.csv"))
test_df = pd.read_csv(os.path.join(os.pardir,"data","test.csv"))

In [9]:
test_preds = best_rf.predict(test_cleaned[X.columns])

submission_rf = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_preds
})
submission_rf.to_csv(r'../models/random-forest/submission_rf.csv', index=False)