In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Read train_cleaned.csv from the data directory one level above the
# current working directory.
train_path = os.path.join(os.pardir, "data", "train_cleaned.csv")
train_df = pd.read_csv(train_path)

# Split the frame into the target column (y) and every other column (X).
target_col = 'Survived'
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is stratified on y and
# uses a fixed seed so the same partition is produced on every run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyperparameters: 100 trees, depth capped at 5 to control overfitting,
# and a fixed seed for the forest's randomness.
rf_params = {"n_estimators": 100, "max_depth": 5, "random_state": 42}

# Build and train in one expression (fit() hands back the fitted estimator).
rf_model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Class predictions for the held-out validation rows.
rf_preds = rf_model.predict(X_valid)

In [5]:
# Print each validation metric under its heading, in order: overall accuracy,
# the confusion matrix, then the per-class classification report.
validation_report = (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
)
for heading, metric in validation_report:
    print(heading, metric(y_valid, rf_preds))

Accuracy: 0.7932960893854749
Confusion Matrix:
 [[101   9]
 [ 28  41]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.92      0.85       110
           1       0.82      0.59      0.69        69

    accuracy                           0.79       179
   macro avg       0.80      0.76      0.77       179
weighted avg       0.80      0.79      0.79       179



In [6]:
from sklearn.model_selection import cross_val_score

# Score the random forest with 5-fold cross-validation on the full X / y,
# then report the average of the per-fold scores.
cv_scores = cross_val_score(estimator=rf_model, X=X, y=y, cv=5)
print("CV Accuracy (mean):", np.mean(cv_scores))

CV Accuracy (mean): 0.8193333751804657


## Random Forest Model Evaluation

### Accuracy

- **Validation Accuracy**: `79.3%`
- **Cross-Validation Accuracy (mean)**: `81.9%`

---

### Confusion Matrix

| Actual / Predicted | Predicted 0 | Predicted 1 |
|--------------------|-------------|-------------|
| Actual 0 (Died)    | 101         | 9           |
| Actual 1 (Survived)| 28          | 41          |

---

### Classification Report

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| 0 (Died)     | 0.78      | **0.92** | 0.85     | 110     |
| 1 (Survived) | **0.82**  | 0.59     | 0.69     | 69      |

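- **High precision** for survivors (0.82): only 9 of the 50 passengers predicted to survive actually died (few false positives)
- **Lower recall** for survivors (0.59): 28 of the 69 actual survivors were predicted as died (many missed survivors)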

---

### Comparison with Logistic Regression

| Metric         | Logistic Regression | Random Forest | Notes                             |
|----------------|---------------------|----------------|-----------------------------------|
| Accuracy       | **81.0%**           | 79.3%          | LR slightly better on validation  |
| CV Accuracy    | 81.4%               | **81.9%**      | RF marginally higher in cross-validation |
| Precision (1)  | 0.78                | **0.82**       | RF makes fewer false positives    |
| Recall (1)     | **0.71**            | 0.59           | LR detects more survivors         |

In [7]:
# Predict on test data
# test_cleaned is the feature frame passed to the model; test.csv is loaded
# only for its PassengerId column, which the submission file needs.
test_cleaned = pd.read_csv(os.path.join(os.pardir,"data","test_cleaned.csv"))
test_df = pd.read_csv(os.path.join(os.pardir,"data","test.csv"))

# Run inference once. test_preds is kept as an alias of the same array in
# case other cells refer to that name.
test_preds_rf = rf_model.predict(test_cleaned)
test_preds = test_preds_rf

# Pair each passenger id with its predicted Survived label.
submission_rf = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_preds_rf
})

# to_csv does not create missing directories, so make sure the output
# directory exists before writing (no-op when it is already there).
submission_path = r'../models/random-forest/submission_rf.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_rf.to_csv(submission_path, index=False)