# Advanced Titanic Survival Analysis

This notebook demonstrates advanced data analysis and machine learning techniques using the Titanic dataset.

In [None]:
import sys
sys.path.append('..')

from src.data_processing import DataProcessor
from src.features import FeatureEngineering
from src.models import ModelTrainer

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, roc_curve, auc

%matplotlib inline
plt.style.use('seaborn')

## 1. Enhanced Data Analysis

In [None]:
# Load the raw passenger data through the project's DataProcessor.
processor = DataProcessor()
df = processor.load_data('../data/raw/titanic.csv')

# Structural overview. DataFrame.info() writes its report straight to stdout
# and returns None, so it must not be wrapped in print() -- doing so appends
# a stray "None" line after the report.
print("Dataset Info:")
df.info()

# Count of null entries per column.
print("\nMissing Values:")
print(df.isnull().sum())

# Summary statistics as produced by DataFrame.describe().
print("\nBasic Statistics:")
print(df.describe())

## 2. Advanced Visualizations

In [None]:
# Set up the plotting style.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and later
# removed; use the renamed 'seaborn-v0_8' sheet when available and fall back
# to the old name on older matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# One 15x10 inch figure holding a 2x2 grid of exploratory panels.
fig = plt.figure(figsize=(15, 10))

# 1. Age distribution by survival.
#    common_norm=False asks seaborn to normalise each 'Survived' group's
#    density independently rather than scaling by group size.
plt.subplot(2, 2, 1)
sns.kdeplot(data=df, x='Age', hue='Survived', common_norm=False)
plt.title('Age Distribution by Survival')

# 2. Fare distribution by class.
plt.subplot(2, 2, 2)
sns.boxplot(data=df, x='Pclass', y='Fare')
plt.title('Fare Distribution by Class')

# 3. Survival rate by class and gender.
#    barplot aggregates 'Survived' per group with its default estimator;
#    presumably 'Survived' is 0/1 so the bar height reads as a rate -- confirm.
plt.subplot(2, 2, 3)
sns.barplot(data=df, x='Pclass', y='Survived', hue='Sex')
plt.title('Survival Rate by Class and Gender')

# 4. Family size analysis.
#    FamilySize = siblings/spouses + parents/children + the passenger.
#    Note: this adds a column to `df` itself; re-running the cell recomputes
#    the same values, so the cell stays idempotent.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
plt.subplot(2, 2, 4)
sns.barplot(data=df, x='FamilySize', y='Survived')
plt.title('Survival Rate by Family Size')

plt.tight_layout()
plt.show()

## 3. Advanced Feature Engineering

In [None]:
# Clean data with advanced features.
# Missing 'Age' values are filled with the median age of the loaded frame and
# missing 'Embarked' values with 'S'.
# NOTE(review): clean_data receives no frame argument, so it presumably works
# on the data the processor loaded earlier -- confirm in DataProcessor.
df_cleaned = processor.clean_data(
    columns=['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
    drop_duplicates=True,
    fill_na={'Age': df['Age'].median(), 'Embarked': 'S'}
)

# Create advanced features.
# FamilySize counts the passenger plus SibSp and Parch; IsAlone is 1 when
# FamilySize is exactly 1, else 0.
df_cleaned['FamilySize'] = df_cleaned['SibSp'] + df_cleaned['Parch'] + 1
df_cleaned['IsAlone'] = (df_cleaned['FamilySize'] == 1).astype(int)

# Quantile-based bins (up to 5). duplicates='drop' keeps qcut from raising
# "Bin edges must be unique" when several quantile edges coincide -- likely
# here because imputation piles many rows onto a single Age value. When edges
# collapse, fewer than 5 bins are produced; otherwise the result is unchanged.
df_cleaned['AgeBin'] = pd.qcut(df_cleaned['Age'], 5, duplicates='drop')
df_cleaned['FareBin'] = pd.qcut(df_cleaned['Fare'], 5, duplicates='drop')

# Feature engineering.
# NOTE(review): 'AgeBin' and 'FareBin' appear in neither column group below;
# whether create_features keeps, encodes or discards them is not visible
# from this notebook -- verify before relying on them as model inputs.
fe = FeatureEngineering()
numeric_cols = ['Age', 'Fare', 'FamilySize']
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'IsAlone']

df_features = fe.create_features(
    df_cleaned,
    numeric_columns=numeric_cols,
    categorical_columns=categorical_cols
)

print("Final features shape:", df_features.shape)
df_features.head()

## 4. Model Comparison

In [None]:
# Target vector and feature matrix (everything except the target column).
y = df_features['Survived']
X = df_features.drop(columns='Survived')

# Candidate estimators, keyed by the label shown in the results table.
# Both use 100 estimators and a fixed random_state.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
}

# For every candidate: split, fit and score through ModelTrainer, then add a
# 5-fold cross-validation estimate over the full X / y.
results = []
for label, estimator in models.items():
    trainer = ModelTrainer(estimator)
    X_train, X_test, y_train, y_test = trainer.split_data(X, y)
    trainer.train_model(X_train, y_train)
    metrics = trainer.evaluate_model(X_test, y_test)

    cv_scores = cross_val_score(estimator, X, y, cv=5)

    row = {
        'Model': label,
        'Test Accuracy': metrics['accuracy'],
        'CV Mean': cv_scores.mean(),
        'CV Std': cv_scores.std(),
    }
    results.append(row)

# One row per model, in the order the models were declared.
results_df = pd.DataFrame(results)
print("Model Comparison:")
print(results_df)

## 5. Best Model Analysis

In [None]:
# Take the Gradient Boosting estimator as the best model, then split and fit
# it again through a fresh ModelTrainer for the diagnostics below.
best_model = models['Gradient Boosting']
trainer = ModelTrainer(best_model)
X_train, X_test, y_train, y_test = trainer.split_data(X, y)
trainer.train_model(X_train, y_train)

# Test-set predictions: hard labels for the confusion matrix, and column 1 of
# predict_proba (the second class in the estimator's class ordering) as the
# score for the ROC curve.
fitted = trainer.model
y_pred = fitted.predict(X_test)
y_pred_proba = fitted.predict_proba(X_test)[:, 1]

# --- Confusion matrix -------------------------------------------------------
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# --- ROC curve --------------------------------------------------------------
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
roc_label = f'ROC curve (AUC = {roc_auc:.2f})'

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

## 6. Feature Importance and Insights

In [None]:
# Get feature importance: pair each column of X with the fitted model's
# feature_importances_ value and rank from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': trainer.model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Plot feature importance as horizontal bars, largest first.
plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=feature_importance)
plt.title('Feature Importance (Gradient Boosting)')
plt.show()

# Evaluate the trainer analysed in this section on its own test split.
# Previously this cell read `metrics` left over from the last iteration of the
# model-comparison loop, i.e. hidden state from a different trainer and split
# that only matched this model because of the dict's iteration order.
metrics = trainer.evaluate_model(X_test, y_test)

# Print key insights
print("\nKey Insights:")
print("1. Top 3 most important features:")
print(feature_importance.head(3))

print("\n2. Model Performance:")
print(f"Accuracy: {metrics['accuracy']:.2f}")
print("\n3. Classification Report:")
print(metrics['classification_report'])