# Flight Delay Prediction - Model Training and Evaluation

This notebook focuses on training and evaluating multiple machine learning models for flight delay prediction.

## Objectives
1. Load engineered features from the previous notebook (`02_feature_engineering.ipynb`)
2. Prepare data for machine learning (train/test split, handling imbalance)
3. Train multiple models (Logistic Regression, Random Forest, XGBoost, LightGBM)
4. Evaluate and compare model performance
5. Perform hyperparameter tuning
6. Analyze feature importance
7. Save best model for deployment

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import sys
import json
from pathlib import Path

# Add src to path
sys.path.append('../src')

from models.train_model import FlightDelayPredictor, train_all_models
from visualization.plots import FlightDelayVisualizer

# Machine learning imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_curve, average_precision_score

# Silence every warning for the rest of the session. NOTE(review): this is a
# blanket filter, so warnings raised by the imported libraries are hidden too.
warnings.filterwarnings('ignore')
# Reset matplotlib to its default style, then make seaborn's "husl" palette the
# default colour cycle for subsequent plots.
plt.style.use('default')
sns.set_palette("husl")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load Engineered Data

In [None]:
# Load engineered dataset
# Locations of the engineered feature table and its optional metadata sidecar.
data_path = '../data/processed/flight_features_engineered.csv'
metadata_path = '../data/processed/feature_metadata.json'

if not Path(data_path).exists():
    # No engineered file on disk: tell the user, then rebuild features from raw data.
    print(f"Engineered data not found at {data_path}")
    print("Please run 02_feature_engineering.ipynb first")

    # Project-local helpers are only needed on this fallback path.
    from data.download_data import load_airline_data
    from features.feature_engineering import FlightFeatureEngineer

    print("Loading and engineering features as fallback...")
    df_raw = load_airline_data(year=2023, sample_size=50000)

    if df_raw is None:
        raise ValueError("Could not load data")

    # Binary target: 1 when ARR_DELAY exceeds 15 (presumably minutes - confirm
    # against the data dictionary).
    df_raw['delayed'] = (df_raw['ARR_DELAY'] > 15).astype(int)

    # Keep rows that are not flagged as cancelled and that have an ARR_DELAY value.
    # When the CANCELLED column is absent, `.get` yields 0 and nothing is dropped.
    not_cancelled = df_raw.get('CANCELLED', 0) != 1
    has_arrival_delay = df_raw['ARR_DELAY'].notna()
    df_raw = df_raw[not_cancelled & has_arrival_delay].copy()

    engineer = FlightFeatureEngineer()
    df = engineer.engineer_all_features(df_raw)
    print(f"Engineered features: {df.shape}")
else:
    df = pd.read_csv(data_path)
    print(f"Loaded engineered dataset: {df.shape}")

    # The metadata file is optional; report its contents only when it is present.
    if Path(metadata_path).exists():
        with open(metadata_path, 'r') as meta_file:
            metadata = json.load(meta_file)
        print(f"Feature categories: {metadata['feature_categories']}")
        print(f"Delay rate: {metadata['delay_rate']*100:.1f}%")

# Quick sanity summary of whichever frame was loaded or built above.
target = df['delayed']
print("\nDataset info:")
print(f"Shape: {df.shape}")
print(f"Target distribution: {target.value_counts()}")
print(f"Delay rate: {target.mean()*100:.1f}%")

## 2. Data Preparation for Modeling

In [None]:
# Initialize predictor
# One helper object drives training/evaluation, the other produces plots.
predictor = FlightDelayPredictor()
viz = FlightDelayVisualizer()

# Build the train/test partitions from the engineered frame.
print("Preparing data for modeling...")
X_train, X_test, y_train, y_test = predictor.prepare_data(
    df,
    target_col='delayed',
    test_size=0.2,
    random_state=42,
)

n_features = len(predictor.feature_names)
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Feature columns: {n_features}")

# Class balance of the training target (0 = on-time, 1 = delayed).
is_on_time = (y_train == 0)
is_delayed = (y_train == 1)
n_on_time = is_on_time.sum()
n_delayed = is_delayed.sum()

print("\nTraining set class distribution:")
print(f"On-time: {n_on_time:,} ({is_on_time.mean()*100:.1f}%)")
print(f"Delayed: {n_delayed:,} ({is_delayed.mean()*100:.1f}%)")
print(f"Class imbalance ratio: {n_on_time / n_delayed:.1f}:1")

# Peek at the first few feature names.
print("\nSample features (first 10):")
print(predictor.feature_names[:10])

## 3. Baseline Model Training

In [None]:
# Train baseline logistic regression
# The baseline is fit on the full training partition and scored on the test set.
print("Training baseline logistic regression model...")
lr_model = predictor.train_logistic_regression(X_train, y_train)

baseline_label = "Logistic Regression (Baseline)"
lr_result = predictor.evaluate_model(lr_model, X_test, y_test, baseline_label)

# Precision-recall curve drawn from the baseline's predicted probabilities.
baseline_probabilities = lr_result['probabilities']
predictor.plot_precision_recall_curve(y_test, baseline_probabilities, "Logistic Regression")

## 4. Advanced Model Training

In [None]:
# Create validation set for tree-based models
# 20% of the training partition is held out (stratified on the target) and handed
# to the XGBoost and LightGBM trainers below.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

print(f"Training set (for tree models): {X_train_split.shape}")
print(f"Validation set: {X_val.shape}")

# --- Random Forest ---------------------------------------------------------
# NOTE(review): fit on the full training partition, whereas the two boosted
# models below are fit on the reduced split - confirm this is intentional.
print("\nTraining Random Forest...")
rf_model = predictor.train_random_forest(X_train, y_train)
rf_result = predictor.evaluate_model(rf_model, X_test, y_test, "Random Forest")

# --- XGBoost ---------------------------------------------------------------
print("\nTraining XGBoost...")
xgb_model = predictor.train_xgboost(X_train_split, y_train_split, X_val, y_val)
xgb_result = predictor.evaluate_model(xgb_model, X_test, y_test, "XGBoost")

# --- LightGBM --------------------------------------------------------------
print("\nTraining LightGBM...")
lgb_model = predictor.train_lightgbm(X_train_split, y_train_split, X_val, y_val)
lgb_result = predictor.evaluate_model(lgb_model, X_test, y_test, "LightGBM")

## 5. Model Comparison and Analysis

In [None]:
# Compare all models
banner = "=" * 50
print("\n" + banner)
print("MODEL COMPARISON SUMMARY")
print(banner)

results = predictor.compare_models(X_test, y_test)

# One row per evaluated model: its display name and test-set ROC-AUC.
comparison_rows = [
    {'Model': entry['model_name'], 'ROC_AUC': entry['roc_auc']}
    for entry in results
]
comparison_df = pd.DataFrame(comparison_rows, columns=['Model', 'ROC_AUC'])

print("\nModel Performance Ranking:")
ranked_df = comparison_df.sort_values('ROC_AUC', ascending=False)
display(ranked_df)

# Visual comparison of the same results.
viz.plot_model_comparison(results)

In [None]:
# ROC Curves for all models
fig_roc, ax_roc = plt.subplots(figsize=(10, 8))

for entry in results:
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, entry['probabilities'])
    roc_label = f"{entry['model_name']} (AUC = {entry['roc_auc']:.3f})"
    ax_roc.plot(false_pos_rate, true_pos_rate, label=roc_label)

# Diagonal reference line for a classifier with no skill.
ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curves - Model Comparison')
ax_roc.legend()
ax_roc.grid(True, alpha=0.3)
plt.show()

# Precision-Recall Curves
fig_pr, ax_pr = plt.subplots(figsize=(10, 8))

for entry in results:
    scores = entry['probabilities']
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, scores)
    avg_precision = average_precision_score(y_test, scores)
    pr_label = f"{entry['model_name']} (AP = {avg_precision:.3f})"
    ax_pr.plot(pr_recall, pr_precision, label=pr_label)

ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curves - Model Comparison')
ax_pr.legend()
ax_pr.grid(True, alpha=0.3)
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Analyze feature importance for best performing tree-based model
top_entry = max(results, key=lambda entry: entry['roc_auc'])
best_model_name = top_entry['model_name']
print(f"Best performing model: {best_model_name}")

best_model = predictor.best_model

if not hasattr(best_model, 'feature_importances_'):
    # Estimators without a `feature_importances_` attribute are skipped.
    print(f"Feature importance not available for {best_model_name}")
else:
    print("\nAnalyzing feature importance...")

    # Plot the top 20 and keep the returned (feature, importance) pairs.
    feature_importance = predictor.plot_feature_importance(best_model, best_model_name, top_n=20)

    importance_df = pd.DataFrame(feature_importance, columns=['Feature', 'Importance'])

    print("\nTop 15 Most Important Features:")
    display(importance_df.head(15))

    # Keyword lists used to bucket feature names into coarse categories.
    # Matching is a case-insensitive substring search, so a feature can fall
    # into more than one category.
    feature_categories = {
        'Time': ['hour', 'day', 'month', 'weekend', 'sin', 'cos'],
        'Airport/Route': ['origin', 'dest', 'route', 'major', 'departures', 'arrivals'],
        'Aircraft': ['prev_flight', 'aircraft', 'tail', 'hours_since'],
        'Airline': ['airline', 'carrier'],
        'Weather': ['weather', 'temperature', 'wind', 'visibility', 'precipitation']
    }

    print("\nFeature Importance by Category:")
    for category, keywords in feature_categories.items():
        keyword_pattern = '|'.join(keywords)
        in_category = importance_df['Feature'].str.contains(keyword_pattern, case=False)
        category_features = importance_df[in_category]
        if len(category_features) == 0:
            continue
        total_importance = category_features['Importance'].sum()
        print(f"{category}: {total_importance:.3f} ({len(category_features)} features)")

## 7. Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for best model type
print("Performing hyperparameter tuning...")
print("Note: This may take several minutes")

# Look the untuned winner up once instead of re-scanning `results` on every use.
best_result = max(results, key=lambda x: x['roc_auc'])

# Name fragment -> (model_type passed to the tuner, label for the tuned evaluation).
# Insertion order is the match priority.
tunable_models = {
    'xgboost': ('xgboost', "XGBoost (Tuned)"),
    'random forest': ('random_forest', "Random Forest (Tuned)"),
}
tuning_spec = next(
    (spec for fragment, spec in tunable_models.items() if fragment in best_model_name.lower()),
    None,
)
was_tuned = tuning_spec is not None

if was_tuned:
    tuning_model_type, tuned_label = tuning_spec
    tuned_model = predictor.hyperparameter_tuning(
        X_train_split, y_train_split, model_type=tuning_model_type
    )
    tuned_result = predictor.evaluate_model(tuned_model, X_test, y_test, tuned_label)
else:
    # No tuner for this model family: carry the untuned winner forward so that
    # later cells can keep referring to `tuned_model` / `tuned_result`.
    print(f"Hyperparameter tuning not implemented for {best_model_name}")
    tuned_model = best_model
    tuned_result = best_result

print("\nTuning Results:")
print(f"Original {best_model_name}: {best_result['roc_auc']:.4f}")
if was_tuned:
    print(f"Tuned model: {tuned_result['roc_auc']:.4f}")
    improvement = tuned_result['roc_auc'] - best_result['roc_auc']
    print(f"Improvement: {improvement:.4f}")
else:
    # Avoid reporting a meaningless "tuned" score and 0.0000 improvement.
    improvement = 0.0
    print("No tuning was performed; the original model is carried forward unchanged.")

## 8. Model Interpretability and Error Analysis

In [None]:
# Confusion matrix for best model
best_result = max(results, key=lambda x: x['roc_auc'])
best_predictions = best_result['predictions']
best_probabilities = best_result['probabilities']

# Confusion Matrix
# Pin the label order so the matrix is always 2x2 (rows/cols: 0 = on-time,
# 1 = delayed), even if one class is missing from y_test or the predictions.
cm = confusion_matrix(y_test, best_predictions, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['On-Time', 'Delayed'],
            yticklabels=['On-Time', 'Delayed'])
plt.title(f'Confusion Matrix - {best_model_name}')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


# Calculate additional metrics
# Plain Python ints/floats keep the values JSON-serialisable, and the guarded
# division yields 0.0 instead of NaN when a class is never predicted or present.
tn, fp, fn, tp = (int(count) for count in cm.ravel())
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
specificity = _safe_ratio(tn, tn + fp)
f1 = _safe_ratio(2 * precision * recall, precision + recall)
accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

print(f"\nDetailed Performance Metrics for {best_model_name}:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall (Sensitivity): {recall:.3f}")
print(f"Specificity: {specificity:.3f}")
print(f"F1-Score: {f1:.3f}")
print(f"ROC-AUC: {best_result['roc_auc']:.3f}")

In [None]:
# Prediction probability distribution
from sklearn.calibration import calibration_curve

fig, (ax_hist, ax_calib) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: predicted delay probability, split by the true class.
on_time_scores = best_probabilities[y_test == 0]
delayed_scores = best_probabilities[y_test == 1]
ax_hist.hist(on_time_scores, bins=50, alpha=0.7, label='Actual On-Time', color='green')
ax_hist.hist(delayed_scores, bins=50, alpha=0.7, label='Actual Delayed', color='red')
ax_hist.set_xlabel('Predicted Delay Probability')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Prediction Probability Distribution')
ax_hist.legend()

# Right panel: calibration curve over 10 bins against the ideal diagonal.
fraction_of_positives, mean_predicted_value = calibration_curve(
    y_test, best_probabilities, n_bins=10
)
ax_calib.plot(mean_predicted_value, fraction_of_positives, "s-", label=best_model_name)
ax_calib.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
ax_calib.set_xlabel('Mean Predicted Probability')
ax_calib.set_ylabel('Fraction of Positives')
ax_calib.set_title('Calibration Plot')
ax_calib.legend()

plt.tight_layout()
plt.show()

## 9. Cross-Validation

In [None]:
# Perform cross-validation on best model
print("Performing 5-fold cross-validation...")

# `tuned_model` is also bound when tuning was skipped (it is then the very same
# object as `best_model`), so existence alone does not mean "tuned"; compare
# identities to label the run correctly.
cv_uses_tuned = 'tuned_model' in locals() and tuned_model is not best_model
if cv_uses_tuned:
    cv_model = tuned_model
    cv_name = f"{best_model_name} (Tuned)"
else:
    cv_model = best_model
    cv_name = best_model_name

# Use a sample for cross-validation if dataset is large
sample_size = min(10000, len(X_train))
if len(X_train) > sample_size:
    X_cv_sample = X_train.sample(sample_size, random_state=42)
    # Select the targets that belong to the sampled rows.
    y_cv_sample = y_train[X_cv_sample.index]
    print(f"Using sample of {sample_size} for cross-validation")
else:
    X_cv_sample = X_train
    y_cv_sample = y_train

cv_scores = cross_val_score(cv_model, X_cv_sample, y_cv_sample, cv=5, scoring='roc_auc', n_jobs=-1)

# Report the held-out score of the same model that was cross-validated, so the
# two numbers are comparable.
if cv_uses_tuned and 'tuned_result' in locals():
    cv_test_roc_auc = tuned_result['roc_auc']
else:
    cv_test_roc_auc = max(results, key=lambda x: x['roc_auc'])['roc_auc']

print(f"\nCross-Validation Results for {cv_name}:")
print(f"ROC-AUC Scores: {cv_scores}")
print(f"Mean ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"Test Set ROC-AUC: {cv_test_roc_auc:.4f}")

# Plot cross-validation scores
plt.figure(figsize=(8, 5))
plt.boxplot(cv_scores)
plt.ylabel('ROC-AUC Score')
plt.title(f'Cross-Validation Scores - {cv_name}')
plt.grid(True, alpha=0.3)
plt.show()

## 10. Model Persistence

In [None]:
# Save the best model
final_model = tuned_model if 'tuned_model' in locals() else best_model
final_model_name = f"best_flight_delay_model_{best_model_name.lower().replace(' ', '_')}"

predictor.save_model(final_model, final_model_name)

# The metadata must describe the model that was actually saved. When a tuned
# evaluation exists, use it; otherwise fall back to the best untuned result.
final_result = (
    tuned_result if 'tuned_result' in locals()
    else max(results, key=lambda x: x['roc_auc'])
)

# Evaluation results are expected to carry hard predictions under 'predictions'
# (assumed from how `results` entries are used elsewhere - verify); if they do
# not, recompute them from the saved estimator.
final_predictions = final_result.get('predictions')
if final_predictions is None:
    final_predictions = final_model.predict(X_test)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


# Fixed label order guarantees a 2x2 matrix (0 = on-time, 1 = delayed).
final_cm = confusion_matrix(y_test, final_predictions, labels=[0, 1])
final_tn, final_fp, final_fn, final_tp = (int(count) for count in final_cm.ravel())
final_precision = _safe_ratio(final_tp, final_tp + final_fp)
final_recall = _safe_ratio(final_tp, final_tp + final_fn)
final_f1 = _safe_ratio(2 * final_precision * final_recall, final_precision + final_recall)
final_accuracy = _safe_ratio(final_tp + final_tn, final_tp + final_tn + final_fp + final_fn)

# Save model metadata
model_metadata = {
    'model_type': best_model_name,
    # True only when the saved estimator differs from the untuned best model.
    'tuned': bool(final_model is not best_model),
    'performance_metrics': {
        'roc_auc': float(final_result['roc_auc']),
        'precision': final_precision,
        'recall': final_recall,
        'f1_score': final_f1,
        'accuracy': final_accuracy
    },
    'cross_validation': {
        'mean_cv_score': float(cv_scores.mean()),
        'cv_std': float(cv_scores.std()),
        'cv_scores': cv_scores.tolist()
    },
    'training_info': {
        'training_samples': len(X_train),
        'test_samples': len(X_test),
        'num_features': len(predictor.feature_names),
        # list(...) keeps this JSON-serialisable even if feature_names is not a plain list.
        'feature_names': list(predictor.feature_names),
        'class_distribution': {
            'on_time': int((y_train == 0).sum()),
            'delayed': int((y_train == 1).sum())
        }
    }
}

# Add feature importance if available
if 'feature_importance' in locals():
    model_metadata['feature_importance'] = {
        feature: float(importance) for feature, importance in feature_importance[:20]
    }

# Save metadata
# A dedicated name avoids clobbering `metadata_path` (the feature-metadata file
# used when loading the data).
model_metadata_path = f'../models/{final_model_name}_metadata.json'
Path(model_metadata_path).parent.mkdir(parents=True, exist_ok=True)
with open(model_metadata_path, 'w') as f:
    json.dump(model_metadata, f, indent=2)

print(f"\n✅ Model and metadata saved:")
print(f"Model: ../models/{final_model_name}.joblib")
print(f"Metadata: {model_metadata_path}")

## 11. Final Summary and Recommendations

In [None]:
rule = "=" * 60
# Test-set ROC-AUC of the top-ranked entry in `results`, looked up once.
best_auc = max(results, key=lambda entry: entry['roc_auc'])['roc_auc']

print(rule)
print("FLIGHT DELAY PREDICTION - FINAL SUMMARY")
print(rule)

# --- Dataset ---------------------------------------------------------------
print("\n📊 DATASET SUMMARY:")
print(f"Training samples: {len(X_train):,}")
print(f"Test samples: {len(X_test):,}")
print(f"Features engineered: {len(predictor.feature_names)}")
print(f"Delay rate: {y_train.mean()*100:.1f}%")

# --- Best model ------------------------------------------------------------
print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"ROC-AUC Score: {best_auc:.4f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")
print(f"Cross-validation mean: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# --- Feature importance (only when it was computed earlier) ------------------
if 'feature_importance' in locals():
    print("\n🎯 TOP 5 MOST IMPORTANT FEATURES:")
    for rank, (feature, importance) in enumerate(feature_importance[:5], start=1):
        print(f"{rank}. {feature}: {importance:.3f}")

# --- Insights --------------------------------------------------------------
print("\n💡 KEY INSIGHTS:")
print(f"• Model successfully predicts flight delays with {best_auc*100:.1f}% AUC")
print(f"• Can identify {recall*100:.1f}% of actual delays (recall)")
print(f"• {precision*100:.1f}% of predicted delays are correct (precision)")
print("• Class imbalance handled effectively with appropriate techniques")

# --- Next steps ------------------------------------------------------------
next_steps = [
    "1. Deploy model using Streamlit app (app/streamlit_app.py)",
    "2. Integrate real-time weather data for live predictions",
    "3. Monitor model performance and retrain periodically",
    "4. Consider ensemble methods for further improvement",
    "5. Collect feedback and iterate on features",
]
print("\n🚀 NEXT STEPS:")
for step in next_steps:
    print(step)

# --- Artifacts -------------------------------------------------------------
print("\n📁 SAVED FILES:")
print(f"• Model: ../models/{final_model_name}.joblib")
print(f"• Metadata: ../models/{final_model_name}_metadata.json")
print("• Processed data: ../data/processed/flight_features_engineered.csv")

print("\n" + rule)
print("🎉 MODEL TRAINING COMPLETE!")
print("Ready for deployment and real-world testing.")
print(rule)