In [None]:
from google.colab import files
# Open the Colab file-upload prompt. The data-loading cell below reads
# 'Titanic-Dataset.csv' from the working directory, so that file is presumably
# what should be uploaded here (TODO confirm). The return value is kept in
# `uploads` but is not referenced again in the visible cells.
uploads=files.upload()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# --- Load -------------------------------------------------------------------
# The CSV is expected in the current working directory.
df = pd.read_csv('Titanic-Dataset.csv')

# --- Clean ------------------------------------------------------------------
# Keep the target plus the six model features. The explicit .copy() yields an
# independent frame, so the column assignments below do not write into a slice
# of the raw data (which is what triggers pandas' SettingWithCopyWarning).
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

# Fill missing numeric values with the column median.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Encode sex as an integer flag. Any label other than 'male'/'female' maps to
# NaN and is therefore removed by the dropna() that follows.
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})
df = df.dropna()

# --- Features / target ------------------------------------------------------
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df['Survived']

print(f"Data ready: {X.shape[0]} passengers")
print("Starting hyperparameter tuning...")

In [None]:
# Hyperparameter grid: GridSearchCV evaluates every combination of these values.
param_grid = {
    'n_estimators': [50, 100, 200],      # number of trees
    'max_depth': [5, 10, 15, None],      # tree depth (None = no limit)
    'min_samples_split': [2, 5, 10],     # min samples to split
    'min_samples_leaf': [1, 2, 4],       # min samples in leaf
}

# Number of candidates = product of the option counts of *every* grid entry,
# so the figure stays correct if a parameter is added to or removed from the grid.
total_combinations = 1
for candidate_values in param_grid.values():
    total_combinations *= len(candidate_values)
print(f"Testing {total_combinations} different combinations...")
print("This will take a minute...\n")

# Reference score for the comparison printed at the end of this cell.
# NOTE(review): this is a hard-coded figure, not computed in this cell --
# confirm it was measured on the same data with the same 5-fold CV.
DEFAULT_RF_ACCURACY = 0.81

# Base model; the fixed seed makes each fit reproducible.
rf = RandomForestClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, scored by accuracy.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,                 # 5 fold cross validation
    scoring='accuracy',
    n_jobs=-1,            # use all cpu cores (was 1, i.e. a single core)
    verbose=1,            # show progress
)
grid_search.fit(X, y)

# Report the winning setting and how it compares to the reference score.
separator = "=" * 60
print("\n" + separator)
print("BEST HYPERPARAMETERS FOUND:")
print(separator)
for param, value in grid_search.best_params_.items():
    print(f"{param}: {value}")
print("\n" + separator)
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")
print(f"Default Random Forest Accuracy: {DEFAULT_RF_ACCURACY:.4f}")
print(f"Improvement: {(grid_search.best_score_ - DEFAULT_RF_ACCURACY) * 100:.2f}%")

In [None]:
# Collect every grid-search result (one row per parameter combination).
results = pd.DataFrame(grid_search.cv_results_)

# max_depth=None is a missing value as far as pandas is concerned: an
# `== None` comparison never matches and groupby() drops the group. Build
# explicit string labels so the unlimited-depth runs show up in the plots.
depth_labels = results['param_max_depth'].map(
    lambda depth: 'None' if pd.isna(depth) else str(int(depth))
)

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# 1. n_estimators effect: one line per max_depth, averaged over the other parameters.
ax = axes[0, 0]
for label in depth_labels.unique():
    per_depth = results[depth_labels == label]
    data = per_depth.groupby('param_n_estimators')['mean_test_score'].mean()
    ax.plot(data.index, data.values, marker='o', label=f'depth={label}')
ax.set_xlabel('n_estimators')
ax.set_ylabel('Accuracy')
ax.set_title('Effect of Number of Trees')
ax.legend()
ax.grid(alpha=0.3)

# 2. max_depth effect: sort=False keeps the bars in the order the depths
#    first appear in the results instead of sorting the string labels.
ax = axes[0, 1]
depth_scores = results.groupby(depth_labels, sort=False)['mean_test_score'].mean()
ax.bar(range(len(depth_scores)), depth_scores.values)
ax.set_xticks(range(len(depth_scores)))
ax.set_xticklabels(depth_scores.index)
ax.set_xlabel('max_depth')
ax.set_ylabel('Accuracy')
ax.set_title('Effect of Tree Depth')
ax.grid(alpha=0.3)

# 3. min_samples_split effect, averaged over the other parameters.
ax = axes[1, 0]
split_scores = results.groupby('param_min_samples_split')['mean_test_score'].mean()
ax.plot(split_scores.index, split_scores.values, marker='o', color='green')
ax.set_xlabel('min_samples_split')
ax.set_ylabel('Accuracy')
ax.set_title('Effect of Min Samples Split')
ax.grid(alpha=0.3)

# 4. Top configurations: size the axis from the rows actually returned so the
#    panel still works if the grid has fewer than 10 candidates.
ax = axes[1, 1]
top_10 = results.nlargest(10, 'mean_test_score')[['params', 'mean_test_score']]
n_top = len(top_10)
ax.barh(range(n_top), top_10['mean_test_score'].values)
ax.set_yticks(range(n_top))
ax.set_yticklabels([f"Config {i+1}" for i in range(n_top)])
ax.set_xlabel('Accuracy')
ax.set_title('Top 10 Configurations')
# 0.81 is the hard-coded reference accuracy labelled 'Default' in the legend.
ax.axvline(x=0.81, color='r', linestyle='--', alpha=0.5, label='Default')
ax.legend()
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("\nTop 5 configurations:")
print(top_10.head())

In [None]:
# Summary comparison of the models.
# NOTE(review): the accuracies are hard-coded here rather than read from fitted
# models -- confirm they match the runs they are meant to summarize.
models_comparison = {
    'Model': [
        'Logistic Regression',
        'Decision Tree',
        'Random Forest (Default)',
        'Random Forest (Tuned)'
    ],
    'Accuracy': [0.7900, 0.8045, 0.8100, 0.8384],
    'Method': ['Baseline', 'Single Tree', 'Ensemble', 'Optimized Ensemble']
}

comparison_df = pd.DataFrame(models_comparison)

# Derive the headline figures from the table instead of re-typing them, so the
# chart annotation and the printed summary cannot drift from the data above.
best_row = comparison_df.loc[comparison_df['Accuracy'].idxmax()]
best_accuracy = best_row['Accuracy']
baseline_accuracy = comparison_df.loc[
    comparison_df['Method'] == 'Baseline', 'Accuracy'
].iloc[0]

# Visualize
plt.figure(figsize=(10, 6))
bars = plt.bar(comparison_df['Model'], comparison_df['Accuracy'],
               color=['blue', 'orange', 'green', 'red'])
plt.ylim([0.75, 0.85])  # zoomed y-range; differences look larger than on a 0-1 axis
plt.ylabel('Accuracy')
plt.title('Titanic Survival Prediction - Model Evolution')
plt.xticks(rotation=15, ha='right')

# Add value labels on top of each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.2%}',
            ha='center', va='bottom', fontweight='bold')

# Highlight best
plt.axhline(y=best_accuracy, color='red', linestyle='--', alpha=0.3, label='Best Model')
plt.legend()
plt.tight_layout()
plt.show()

separator = "=" * 60
print("\n" + separator)
print("COMPLETE MODEL COMPARISON")
print(separator)
print(comparison_df.to_string(index=False))
print(separator)
print(f"\nWINNER: {best_row['Model']} - {best_accuracy:.2%}")
print(f"Total improvement from baseline: {(best_accuracy - baseline_accuracy) * 100:.2f}%")
print(separator)

# Hyperparameter Tuning - Optimizing Random Forest

## Problem
Can we improve Random Forest performance beyond default settings?

## Approach: Grid Search Cross-Validation

Tested 108 hyperparameter combinations:
- n_estimators: [50, 100, 200]
- max_depth: [5, 10, 15, None]
- min_samples_split: [2, 5, 10]
- min_samples_leaf: [1, 2, 4]

## Optimal Hyperparameters Found
```python
{
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 1
}
```

## Results

- **Before Tuning:** 81.00% accuracy
- **After Tuning:** 83.84% accuracy
- **Improvement:** +2.84 percentage points

## Model Evolution Summary

| Model | Accuracy | Notes |
|-------|----------|-------|
| Logistic Regression | 79.00% | Baseline |
| Decision Tree | 80.45% | Single tree |
| Random Forest (Default) | 81.00% | Ensemble |
| **Random Forest (Tuned)** | **83.84%** | **Optimized ✓** |

## Key Learnings

1. **max_depth=10 prevents overfitting** - Unlimited depth performed worse
2. **100 trees is sufficient** - More trees didn't help
3. **Grid Search found a 2.84 percentage-point improvement** - Worth the computational cost
4. **Systematic tuning beats guessing** - Tested all combinations

## Conclusion

Hyperparameter tuning provided a meaningful accuracy improvement. The optimized Random Forest model achieves 83.84% cross-validated accuracy, which corresponds to correctly predicting survival for roughly 747 of the 891 passengers.