import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
from sklearn.model_selection import cross_val_score

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12,6)

print("Libraries imported successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Setup and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
from sklearn.model_selection import cross_val_score

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12,6)

print("Libraries imported successfully!")
print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [None]:
# Load the pre-split feature matrices and label vectors written by the
# preprocessing step.
X_train = pd.read_csv('../data/processed/X_train.csv')
X_test = pd.read_csv('../data/processed/X_test.csv')

# squeeze('columns') turns the single-column label frame into a Series.
# A bare squeeze() would additionally collapse a one-row file to a scalar,
# which breaks the .shape / .mean() calls below and model fitting later on.
y_train = pd.read_csv('../data/processed/y_train.csv').squeeze('columns')
y_test = pd.read_csv('../data/processed/y_test.csv').squeeze('columns')

print("Data loaded:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# The mean of the label column is reported as the delay rate
# (presumably labels are 0/1 with 1 = late -- confirm against preprocessing).
print("\nClass distribution:")
print(f"Train - Delay rate: {y_train.mean():.2%}")
print(f"Test - Delay rate: {y_test.mean():.2%}")

## 2. Helper Functions

In [None]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Score a fitted classifier on the train and test splits.

    Calls ``model.predict`` and ``model.predict_proba`` on both splits and
    computes accuracy, precision, recall, F1 and ROC-AUC for each.

    Parameters:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X_train, y_train: training features and labels.
        X_test, y_test: test features and labels.
        model_name: label stored under the ``'Model'`` key of the result.

    Returns:
        A tuple ``(metrics, y_test_pred, y_test_proba)`` where ``metrics`` is
        a dict keyed ``'Model'``, ``'Train_<Metric>'`` and ``'Test_<Metric>'``,
        ``y_test_pred`` is the output of ``model.predict(X_test)`` and
        ``y_test_proba`` is column 1 of ``model.predict_proba(X_test)``.
    """
    # Hard class labels for each split
    train_labels = model.predict(X_train)
    test_labels = model.predict(X_test)

    # Column 1 of predict_proba is used as the score for ROC-AUC
    train_scores = model.predict_proba(X_train)[:, 1]
    test_scores = model.predict_proba(X_test)[:, 1]

    # (key suffix, scoring function, train-side input, test-side input)
    scoring_plan = (
        ('Accuracy', accuracy_score, train_labels, test_labels),
        ('Precision', precision_score, train_labels, test_labels),
        ('Recall', recall_score, train_labels, test_labels),
        ('F1', f1_score, train_labels, test_labels),
        ('ROC_AUC', roc_auc_score, train_scores, test_scores),
    )

    # Insertion order (Model, then Train/Test pairs per metric) determines
    # the column order when these dicts are later turned into a DataFrame.
    metrics = {'Model': model_name}
    for suffix, scorer, train_output, test_output in scoring_plan:
        metrics[f'Train_{suffix}'] = scorer(y_train, train_output)
        metrics[f'Test_{suffix}'] = scorer(y_test, test_output)

    return metrics, test_labels, test_scores

def print_metrics(metrics):
    """
    Pretty-print the metrics dict produced by ``evaluate_model``.

    Prints a banner with the model name, then one section per split
    (Train, Test) listing accuracy, precision, recall, F1 and ROC-AUC,
    each formatted to four decimal places.

    Parameters:
        metrics: dict with a ``'Model'`` key plus ``'Train_<Metric>'`` and
            ``'Test_<Metric>'`` keys for Accuracy, Precision, Recall, F1
            and ROC_AUC.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"Model: {metrics['Model']}")
    print(rule)

    # (padded caption, key suffix) -- captions are padded so values line up
    rows = (
        ('Accuracy:  ', 'Accuracy'),
        ('Precision: ', 'Precision'),
        ('Recall:    ', 'Recall'),
        ('F1 Score:  ', 'F1'),
        ('ROC-AUC:   ', 'ROC_AUC'),
    )

    for split in ('Train', 'Test'):
        print(f"\n{split} Metrics:")
        for caption, suffix in rows:
            value = metrics[f'{split}_{suffix}']
            print(f"  {caption}{value:.4f}")

print("Helper functions defined")

## 3. Baseline Model 1: Logistic Regression

In [None]:
# Train Logistic Regression
# This cell defines lr_model, lr_metrics, lr_pred and lr_proba, which the
# confusion-matrix, comparison, ROC, model-saving and prediction-saving
# cells further down all rely on.
print("Training Logistic Regression...")
start_time = time.time()

# NOTE(review): the original hyperparameters are not recoverable from this
# file. Library defaults are used with max_iter raised so the solver has
# room to converge -- confirm against the intended baseline configuration.
lr_model = LogisticRegression(
    max_iter=1000,
    random_state=42
)

lr_model.fit(X_train, y_train)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Evaluate on both splits; the test-set labels/probabilities are kept for
# the later comparison and export cells.
lr_metrics, lr_pred, lr_proba = evaluate_model(lr_model, X_train, y_train, X_test, y_test, 'Logistic Regression')
print_metrics(lr_metrics)

In [None]:
# Confusion matrix for the Logistic Regression test-set predictions,
# drawn as an annotated heatmap (rows = actual, columns = predicted).
cm_lr = confusion_matrix(y_test, lr_pred)

class_labels = ['On-Time', 'Late']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_lr,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix - Logistic Regression')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

# Per-class precision / recall / F1 for the same predictions
print("\nClassification Report:")
print(classification_report(y_test, lr_pred, target_names=class_labels))

## 4. Baseline Model 2: Decision Tree

In [None]:
# Fit the depth-limited Decision Tree baseline and time the fit
print("Training Decision Tree...")
start_time = time.time()

# fit() returns the estimator itself, so construction and fitting are chained
dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=10,
    min_samples_leaf=10,
    min_samples_split=20,
).fit(X_train, y_train)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Score on both splits; dt_pred / dt_proba are reused by later cells
dt_metrics, dt_pred, dt_proba = evaluate_model(
    model=dt_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Decision Tree',
)
print_metrics(dt_metrics)

In [None]:
# Feature importance from Decision Tree: pair each training column with the
# fitted tree's importance score and keep the 20 highest.
feature_importance_dt = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': dt_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(20)
)

print("\nTop 20 Important Features (Decision Tree):")
print(feature_importance_dt)

# Visualize as a horizontal bar chart
bar_positions = range(len(feature_importance_dt))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance_dt['importance'].values)
plt.yticks(bar_positions, feature_importance_dt['feature'].values, fontsize=9)
# barh draws position 0 at the bottom; flip the axis so the top-ranked
# feature is at the top of the chart, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.title('Top 20 Features - Decision Tree')
plt.tight_layout()
plt.show()

## 5. Advanced Model 1: Random Forest

In [None]:
# Fit the Random Forest (200 trees, all cores) and time the fit
print("Training Random Forest...")
start_time = time.time()

# fit() returns the estimator itself, so construction and fitting are chained
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_features='sqrt',
    max_depth=15,
    min_samples_leaf=5,
    min_samples_split=10,
    n_jobs=-1,
    random_state=42,
    verbose=0,
).fit(X_train, y_train)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Score on both splits; rf_pred / rf_proba are reused by later cells
rf_metrics, rf_pred, rf_proba = evaluate_model(
    model=rf_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Random Forest',
)
print_metrics(rf_metrics)

In [None]:
# Feature importance from Random Forest: pair each training column with the
# fitted forest's importance score and keep the 20 highest.
feature_importance_rf = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': rf_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(20)
)

print("\nTop 20 Important Features (Random Forest):")
print(feature_importance_rf)

# Visualize as a horizontal bar chart
bar_positions = range(len(feature_importance_rf))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance_rf['importance'].values)
plt.yticks(bar_positions, feature_importance_rf['feature'].values, fontsize=9)
# barh draws position 0 at the bottom; flip the axis so the top-ranked
# feature is at the top of the chart, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.title('Top 20 Features - Random Forest')
plt.tight_layout()
plt.show()

## 6. Advanced Model 2: XGBoost

In [None]:
# Fit the XGBoost classifier (300 rounds, learning rate 0.05) and time the fit
print("Training XGBoost...")
start_time = time.time()

xgb_model = XGBClassifier(
    # boosting schedule
    n_estimators=300,
    learning_rate=0.05,
    # tree shape
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularization
    reg_alpha=0,
    reg_lambda=1,
    # runtime
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Score on both splits; xgb_pred / xgb_proba are reused by later cells
xgb_metrics, xgb_pred, xgb_proba = evaluate_model(
    model=xgb_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='XGBoost',
)
print_metrics(xgb_metrics)

In [None]:
# Feature importance from XGBoost: pair each training column with the
# fitted model's importance score and keep the 20 highest.
feature_importance_xgb = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': xgb_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(20)
)

print("\nTop 20 Important Features (XGBoost):")
print(feature_importance_xgb)

# Visualize as a horizontal bar chart
bar_positions = range(len(feature_importance_xgb))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance_xgb['importance'].values)
plt.yticks(bar_positions, feature_importance_xgb['feature'].values, fontsize=9)
# barh draws position 0 at the bottom; flip the axis so the top-ranked
# feature is at the top of the chart, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.title('Top 20 Features - XGBoost')
plt.tight_layout()
plt.show()

## 7. Advanced Model 3: LightGBM

In [None]:
# Fit the LightGBM classifier (300 rounds, learning rate 0.05) and time the fit
print("Training LightGBM...")
start_time = time.time()

# NOTE(review): LightGBM's documentation states that row subsampling
# (subsample) only takes effect when subsample_freq > 0; subsample_freq is
# left at its default here -- confirm whether bagging was intended.
lgbm_model = LGBMClassifier(
    # boosting schedule
    n_estimators=300,
    learning_rate=0.05,
    # tree shape
    max_depth=6,
    num_leaves=31,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=0.8,
    # regularization
    reg_alpha=0,
    reg_lambda=1,
    # runtime
    n_jobs=-1,
    random_state=42,
    verbose=-1,
)
lgbm_model.fit(X_train, y_train)

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

# Score on both splits; lgbm_pred / lgbm_proba are reused by later cells
lgbm_metrics, lgbm_pred, lgbm_proba = evaluate_model(
    model=lgbm_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='LightGBM',
)
print_metrics(lgbm_metrics)

In [None]:
# Feature importance from LightGBM: pair each training column with the
# fitted model's importance score and keep the 20 highest.
# NOTE(review): LightGBM's default importance type is presumably split
# counts rather than normalized gain, so the scale may differ from the
# other models' charts -- confirm before comparing across models.
feature_importance_lgbm = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': lgbm_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
    .head(20)
)

print("\nTop 20 Important Features (LightGBM):")
print(feature_importance_lgbm)

# Visualize as a horizontal bar chart
bar_positions = range(len(feature_importance_lgbm))
plt.figure(figsize=(10, 8))
plt.barh(bar_positions, feature_importance_lgbm['importance'].values)
plt.yticks(bar_positions, feature_importance_lgbm['feature'].values, fontsize=9)
# barh draws position 0 at the bottom; flip the axis so the top-ranked
# feature is at the top of the chart, matching the printed table.
plt.gca().invert_yaxis()
plt.xlabel('Feature Importance')
plt.title('Top 20 Features - LightGBM')
plt.tight_layout()
plt.show()

## 8. Model Comparison

In [None]:
# Compile all metrics: one row per model, columns taken from the dicts
# returned by evaluate_model.
from pathlib import Path

all_metrics = pd.DataFrame([
    lr_metrics,
    dt_metrics,
    rf_metrics,
    xgb_metrics,
    lgbm_metrics
])

print("\nModel Comparison - Test Set Performance:")
print("="*100)
test_columns = ['Model', 'Test_Accuracy', 'Test_Precision', 'Test_Recall', 'Test_F1', 'Test_ROC_AUC']
print(all_metrics[test_columns].to_string(index=False))

# Save comparison. to_csv does not create missing directories, so make sure
# the output folder exists first (no-op when it already does).
comparison_path = Path('../models/model_metadata/model_comparison.csv')
comparison_path.parent.mkdir(parents=True, exist_ok=True)
all_metrics.to_csv(comparison_path, index=False)
print("\nModel comparison saved to: ../models/model_metadata/model_comparison.csv")

In [None]:
# Bar-chart comparison of four test metrics, one subplot per metric (2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

metrics_to_plot = ['Test_Accuracy', 'Test_Precision', 'Test_Recall', 'Test_ROC_AUC']
titles = ['Accuracy', 'Precision', 'Recall', 'ROC-AUC']

# axes.flat walks the grid row by row, so metric k lands in cell (k // 2, k % 2)
for ax, metric, title in zip(axes.flat, metrics_to_plot, titles):
    scores = all_metrics[metric]

    ax.bar(all_metrics['Model'], scores)
    ax.set_title(f'Test {title} Comparison')
    ax.set_ylabel(title)
    ax.tick_params(axis='x', rotation=45)
    ax.set_ylim([0, 1])

    # Write each bar's value just above it
    for bar_index, score in enumerate(scores):
        ax.text(bar_index, score + 0.02, f'{score:.3f}', ha='center', fontsize=9)

plt.tight_layout()
plt.savefig('../models/model_metadata/model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

print("Comparison plot saved")

In [None]:
# Overlay the test-set ROC curve of every model on a single figure
plt.figure(figsize=(10, 8))

# (legend name, test-set positive-class probabilities)
models_data = [
    ('Logistic Regression', lr_proba),
    ('Decision Tree', dt_proba),
    ('Random Forest', rf_proba),
    ('XGBoost', xgb_proba),
    ('LightGBM', lgbm_proba)
]

for model_name, y_proba in models_data:
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    curve_label = f'{model_name} (AUC = {auc:.3f})'
    plt.plot(fpr, tpr, linewidth=2, label=curve_label)

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')

plt.title('ROC Curves - All Models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('../models/model_metadata/roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Save Trained Models

In [None]:
# Save all models: each fitted estimator is serialized with joblib to
# ../models/saved_models/<key>_model.pkl
from pathlib import Path

models_to_save = {
    'logistic_regression': lr_model,
    'decision_tree': dt_model,
    'random_forest': rf_model,
    'xgboost': xgb_model,
    'lightgbm': lgbm_model
}

# joblib.dump does not create missing directories, so make sure the target
# folder exists before the first write (no-op when it already does).
Path('../models/saved_models').mkdir(parents=True, exist_ok=True)

for model_name, model in models_to_save.items():
    filename = f'../models/saved_models/{model_name}_model.pkl'
    joblib.dump(model, filename)
    print(f"Saved: {filename}")

print("\nAll models saved successfully!")

In [None]:
# Collect the true test labels plus every model's test-set predictions and
# probabilities into one table and write it to CSV for later analysis.
per_model_outputs = (
    ('lr', lr_pred, lr_proba),
    ('dt', dt_pred, dt_proba),
    ('rf', rf_pred, rf_proba),
    ('xgb', xgb_pred, xgb_proba),
    ('lgbm', lgbm_pred, lgbm_proba),
)

# Column order: y_true first, then <prefix>_pred / <prefix>_proba per model
prediction_columns = {'y_true': y_test}
for prefix, predicted_labels, predicted_scores in per_model_outputs:
    prediction_columns[f'{prefix}_pred'] = predicted_labels
    prediction_columns[f'{prefix}_proba'] = predicted_scores

predictions_df = pd.DataFrame(prediction_columns)

predictions_df.to_csv('../models/model_metadata/all_predictions.csv', index=False)
print("Predictions saved to: ../models/model_metadata/all_predictions.csv")

## 10. Summary

In [None]:
# Final report: models trained, best model per test metric, score ranges,
# saved artifacts and follow-up work. Reads the all_metrics table only.
banner = "=" * 100

print(banner)
print("SUMMARY - ML MODELS")
print(banner)

print("\n1. Models Trained:")
print("   Baseline Models:")
for trained_name in ("Logistic Regression", "Decision Tree"):
    print(f"   - {trained_name}")
print("   \n   Advanced Models:")
for trained_name in ("Random Forest", "XGBoost", "LightGBM"):
    print(f"   - {trained_name}")

print("\n2. Best Model by Metric:")
# (padded caption, all_metrics column); idxmax picks the first row holding the maximum
best_by_metric = (
    ("Best Accuracy:  ", "Test_Accuracy"),
    ("Best Precision: ", "Test_Precision"),
    ("Best Recall:    ", "Test_Recall"),
    ("Best F1 Score:  ", "Test_F1"),
    ("Best ROC-AUC:   ", "Test_ROC_AUC"),
)
for caption, column in best_by_metric:
    column_scores = all_metrics[column]
    winner = all_metrics.loc[column_scores.idxmax(), 'Model']
    print(f"   - {caption}{winner} ({column_scores.max():.4f})")

print("\n3. Model Performance Range:")
score_ranges = (
    ("Test Accuracy:  ", "Test_Accuracy"),
    ("Test ROC-AUC:   ", "Test_ROC_AUC"),
)
for caption, column in score_ranges:
    column_scores = all_metrics[column]
    print(f"   - {caption}{column_scores.min():.4f} - {column_scores.max():.4f}")

print("\n4. Saved Artifacts:")
for artifact in (
    "5 trained models (.pkl files)",
    "Model comparison CSV",
    "All predictions CSV",
    "Comparison plots (PNG)",
):
    print(f"   - {artifact}")

print("\n5. Next Steps:")
for step in (
    "Notebook 5: Model Evaluation & Selection",
    "Error analysis and SHAP values",
    "Final model selection",
    "Deployment recommendations",
):
    print(f"   - {step}")
print(banner)