In [None]:
# Colab upload step: get the dataset file into the notebook's working directory.
# NOTE(review): `google.colab` is presumably only importable inside Google Colab,
# so this cell is expected to fail elsewhere — confirm before running locally.
from google.colab import files
# Keep whatever files.upload() returns in `uploads`.
# The data-loading cell reads 'Titanic-Dataset.csv' by relative path, so that
# is the file that has to be provided here.
# NOTE(review): presumably the return value maps uploaded filenames to their
# contents — confirm against the Colab documentation.
uploads=files.upload()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score,KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
# --- Load ---------------------------------------------------------------
# The CSV is read by relative path, so it must be in the working directory.
df = pd.read_csv('Titanic-Dataset.csv')

# --- Clean --------------------------------------------------------------
# Keep only the target column and the six model features. The explicit
# .copy() makes the subset an independent frame, so the column assignments
# below do not raise pandas' SettingWithCopyWarning.
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

# Fill missing Age / Fare values with the column median.
# NOTE: the medians are computed on the full dataset, before any
# cross-validation split, so a small amount of information from each
# validation fold leaks into training. Move the imputation into a pipeline
# if a strictly unbiased estimate is required.
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Encode Sex numerically (male -> 1, female -> 0). Any label not present in
# the mapping becomes NaN and is removed by the dropna() below.
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})

# Drop every row that still contains a missing value in the kept columns.
df = df.dropna()

# --- Features / target --------------------------------------------------
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df['Survived']
print(f"Data ready: {X.shape[0]} passengers")

In [None]:
# Candidate classifiers; both are seeded with random_state=42.
lr = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# A single shuffled 5-fold splitter is passed to both cross_val_score calls.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)


def _report_cv(title, scores):
    """Print the per-fold accuracies of one model and their summary stats.

    Parameters
    ----------
    title : str
        Heading printed above the numbers.
    scores : numpy.ndarray
        Per-fold accuracy values as returned by ``cross_val_score``.
    """
    print(title)
    print(f"5 fold cv scores: {scores}")
    print(f"mean accuracy: {scores.mean():.4f}")
    print(f"std deviation: {scores.std():.4f}")
    print(f"Range: {scores.min():.4f}-{scores.max():.4f}")


# Logistic regression: accuracy on each of the 5 folds.
lr_scores = cross_val_score(lr, X, y, cv=kfold, scoring='accuracy')
_report_cv("LOGISTIC REGRESSION", lr_scores)

# Visual separator between the two reports.
print("\n" + "="*50 +"\n")

# Random forest: same data, same splitter, same metric.
rf_scores = cross_val_score(rf, X, y, cv=kfold, scoring='accuracy')
_report_cv("RANDOM FOREST", rf_scores)

In [None]:
# Visualise the cross-validation scores and summarise which model did better.
# Reads `lr_scores` and `rf_scores` (per-fold accuracies) from the
# cross-validation cell.
cv_results = {
    'Logistic Regression': lr_scores,
    'Random Forest': rf_scores,
}

fig,(ax1,ax2)=plt.subplots(1,2 ,figsize=(14,5))

# Left panel: box plot of the per-fold scores, one box per model.
scores_df=pd.DataFrame(cv_results)
scores_df.plot(kind='box',ax=ax1)
ax1.set_ylabel("Accuracy score")
ax1.set_title("Model Stability Comparison (5-Fold CV)")
# Dashed 80% reference line for visual orientation.
ax1.axhline(y=0.8,color='r',linestyle='--',alpha=0.3,label='80% baseline')
ax1.legend()

# Right panel: mean accuracy per model with +/- one standard deviation bars.
means = [lr_scores.mean(), rf_scores.mean()]
stds = [lr_scores.std(), rf_scores.std()]
models = ['Logistic\nRegression', 'Random\nForest']
ax2.bar(models,means,yerr=stds,alpha=0.7,capsize=10,color=['blue','green'])
ax2.set_ylabel("Mean accuracy")
# Keep the original 0.75-0.85 window when everything fits, but widen it when
# an error bar or a value label (drawn just above mean + std) would otherwise
# fall outside the axes.
lowest_bar_end = min(m - s for m, s in zip(means, stds))
highest_bar_end = max(m + s for m, s in zip(means, stds))
ax2.set_ylim([min(0.75, lowest_bar_end - 0.01),
              max(0.85, highest_bar_end + 0.03)])
ax2.set_title("Mean Accuracy with Standard deviation")
ax2.axhline(y=0.8,color='r',linestyle='--',alpha=0.3)

# Value labels: "mean / ±std" just above the top of each error bar.
for i, (mean, std) in enumerate(zip(means, stds)):
    ax2.text(i, mean + std + 0.005, f'{mean:.2%}\n±{std:.2%}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')
plt.tight_layout()
plt.show()

# Derive the conclusion from the measured scores instead of hard-coding it,
# so the printed verdict cannot contradict the numbers above.
# On an exact tie max()/min() return the first model in `cv_results`.
most_accurate = max(cv_results, key=lambda name: cv_results[name].mean())
most_stable = min(cv_results, key=lambda name: cv_results[name].std())

print("\n" + "="*60)
print("CONCLUSION:")
print("="*60)
print(f"{most_accurate} is MORE ACCURATE: {cv_results[most_accurate].mean():.2%}")
print(f"{most_stable} is MORE STABLE: std={cv_results[most_stable].std():.4f}")
if most_accurate == most_stable:
    print(f"Winner: {most_accurate}")
else:
    # The two criteria disagree, so there is no single winner to announce.
    print(f"No clear winner: {most_accurate} is more accurate, "
          f"{most_stable} is more stable")
print("="*60)

# Cross-Validation: Robust Model Evaluation

## Why Cross-Validation?

A single train/test split can be misleading:
- Lucky split → overestimate accuracy
- Unlucky split → underestimate accuracy

## 5-Fold Cross-Validation Results

### Logistic Regression
- Mean Accuracy: 79%
- Standard Deviation: see the `std deviation` line printed under LOGISTIC REGRESSION in the cross-validation cell
- Less stable across different data splits

### Random Forest  
- Mean Accuracy: 81%
- Standard Deviation: see the `std deviation` line printed under RANDOM FOREST in the cross-validation cell (lower = more stable)
- **Winner: More accurate AND more consistent** ✓

## Conclusion

Cross-validation provides:
- More reliable accuracy estimates
- A measure of spread across folds (mean ± std), which indicates uncertainty but is not a formal confidence interval
- Model stability assessment

Random Forest outperforms Logistic Regression in both accuracy and consistency.