# 🎯 Flight Delay Classification

Predict whether a flight will be delayed using machine learning models.

**Models:**
- Logistic Regression (baseline)
- Random Forest
- Gradient Boosting (scikit-learn)
- XGBoost (used when installed)
- LightGBM (used when installed)

**Techniques:**
- SMOTE for handling imbalanced data
- Cross-validation
- Feature importance analysis


In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, precision_recall_curve)
import warnings
import sys
import os

# Put the parent directory at the front of the import path.
# NOTE(review): presumably so project-local modules one level up can be
# imported from this notebook's folder — confirm against the repo layout.
sys.path.insert(0, os.path.abspath('..'))

# Silence every warning to keep cell output readable.
# NOTE(review): this also hides deprecation and convergence warnings raised
# by the libraries used below; narrow the filter if those matter.
warnings.filterwarnings('ignore')

# Shared colour palette (hex codes) used by the figures in this notebook.
COLORS = {'primary': '#2E86AB', 'secondary': '#A23B72', 'success': '#18A558',
          'warning': '#F18F01', 'danger': '#C73E1D'}

print("✓ Libraries imported")


In [None]:
# Read the pre-processed feature matrix and the first column of the target file
X = pd.read_csv('../data/processed/features.csv')
y = pd.read_csv('../data/processed/target.csv').iloc[:, 0]

# Per-label counts, reused for both the distribution and the ratio below
class_counts = y.value_counts()

print(f"Features shape: {X.shape}")
print(f"Target distribution:\n{class_counts}")
# Ratio is the count for label 0 divided by the count for label 1
print(f"\nClass imbalance ratio: {class_counts[0] / class_counts[1]:.2f}:1")


## 1. Data Preparation


In [None]:
# Hold out 20% of the rows for testing, stratified on y so both splits
# keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics on the training split only, then apply the
# same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape[0]:,} samples")
print(f"Test set: {X_test.shape[0]:,} samples")

# Oversample the training split with SMOTE when imbalanced-learn can be
# imported; otherwise train on the scaled data as-is. Only the training
# split is resampled — the test split is left untouched.
try:
    from imblearn.over_sampling import SMOTE

    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(
        X_train_scaled, y_train
    )
    print(f"\nAfter SMOTE: {len(X_train_resampled):,} samples")
    resampled_distribution = pd.Series(y_train_resampled).value_counts().to_dict()
    print(f"Class distribution: {resampled_distribution}")
except ImportError:
    print("SMOTE not available. Using original data with class weights.")
    X_train_resampled = X_train_scaled
    y_train_resampled = y_train


## 2. Train Multiple Models


In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots.
# class_weight='balanced' stays on the estimators that support it so they
# still compensate for imbalance when the SMOTE step fell back to the
# original training data.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
}

# Positive-class weight for XGBoost, derived from the labels actually used
# for fitting rather than a hard-coded 3. The XGBoost docs suggest
# sum(negative) / sum(positive); on SMOTE-balanced data that is 1.0, whereas
# a fixed 3 would over-weight the already oversampled positive class.
# Assumes binary labels with 0 = negative and 1 = positive — TODO confirm.
_train_labels = np.asarray(y_train_resampled)
_n_negative = int((_train_labels == 0).sum())
_n_positive = int((_train_labels == 1).sum())
# Guard against an empty positive class so the division cannot fail.
xgb_scale_pos_weight = _n_negative / _n_positive if _n_positive else 1.0

# XGBoost and LightGBM are optional: add each one only if it can be imported.
try:
    from xgboost import XGBClassifier
    models['XGBoost'] = XGBClassifier(n_estimators=100, max_depth=6, scale_pos_weight=xgb_scale_pos_weight,
                                      random_state=42, eval_metric='logloss')
except ImportError:
    print("XGBoost not available")

try:
    from lightgbm import LGBMClassifier
    models['LightGBM'] = LGBMClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=42, verbose=-1)
except ImportError:
    print("LightGBM not available")

print(f"Models to train: {list(models.keys())}")


In [None]:
# Fit every candidate on the training data and score it on the held-out
# test split. Each entry of `results` holds the scalar metrics first, then
# the raw predictions, probabilities and the fitted estimator.
results = {}

for name, model in models.items():
    banner = '=' * 50
    print(f"\n{banner}")
    print(f"Training: {name}")
    print(banner)

    model.fit(X_train_resampled, y_train_resampled)

    # Hard labels feed the threshold-based metrics; the probability of
    # class 1 (second predict_proba column) feeds ROC-AUC
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    results[name] = dict(
        accuracy=accuracy_score(y_test, y_pred),
        precision=precision_score(y_test, y_pred),
        recall=recall_score(y_test, y_pred),
        f1=f1_score(y_test, y_pred),
        roc_auc=roc_auc_score(y_test, y_prob),
        y_pred=y_pred,
        y_prob=y_prob,
        model=model,
    )

    # Labels carry their own padding so the printed values line up
    for label, key in (
        ("Accuracy:  ", 'accuracy'),
        ("Precision: ", 'precision'),
        ("Recall:    ", 'recall'),
        ("F1-Score:  ", 'f1'),
        ("ROC-AUC:   ", 'roc_auc'),
    ):
        print(f"{label}{results[name][key]:.4f}")


## 3. Model Comparison


In [None]:
# Build a models-by-metrics table: one row per model, keeping only the scalar
# metrics (predictions, probabilities and the fitted estimator are dropped).
comparison = pd.DataFrame({
    name: {k: v for k, v in metrics.items() if k not in ['y_pred', 'y_prob', 'model']}
    for name, metrics in results.items()
}).T

# Best F1 first; the row order also drives the bar order in the plot.
comparison = comparison.sort_values('f1', ascending=False)
print("\n📊 Model Comparison (sorted by F1-Score):")
display(comparison.round(4))

# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(12, 6))
comparison[['accuracy', 'precision', 'recall', 'f1', 'roc_auc']].plot(kind='bar', ax=ax,
    color=[COLORS['primary'], COLORS['secondary'], COLORS['success'], COLORS['warning'], COLORS['danger']])
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison')
ax.legend(loc='lower right')
ax.set_ylim([0, 1])
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/model_comparison.png', dpi=150)
plt.show()


## 4. Best Model Analysis


In [None]:
# The comparison table is sorted by F1 descending, so its first row is the
# best model by F1.
best_model_name = comparison.index[0]
best_results = results[best_model_name]

print(f"🏆 Best Model: {best_model_name}")
print("\nClassification Report:")
# target_names maps the first class to 'On-Time' and the second to 'Delayed'.
print(classification_report(y_test, best_results['y_pred'], target_names=['On-Time', 'Delayed']))

# Two panels side by side: confusion matrix (left) and ROC curve (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confusion matrix: rows are actual classes, columns are predicted classes.
ax1 = axes[0]
cm = confusion_matrix(y_test, best_results['y_pred'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=['On-Time', 'Delayed'], yticklabels=['On-Time', 'Delayed'])
ax1.set_xlabel('Predicted')
ax1.set_ylabel('Actual')
ax1.set_title(f'Confusion Matrix - {best_model_name}')

# ROC curve from the stored probabilities, with the diagonal as the
# chance-level reference line.
ax2 = axes[1]
fpr, tpr, _ = roc_curve(y_test, best_results['y_prob'])
ax2.plot(fpr, tpr, color=COLORS['primary'], linewidth=2,
         label=f'ROC (AUC = {best_results["roc_auc"]:.3f})')
ax2.plot([0, 1], [0, 1], 'k--', linewidth=1)
ax2.fill_between(fpr, tpr, alpha=0.2, color=COLORS['primary'])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('ROC Curve')
ax2.legend()

plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/best_model_analysis.png', dpi=150)
plt.show()


## 5. Feature Importance


In [None]:
# Plot the 15 largest feature importances of the fitted Random Forest.
# Skipped when no 'Random Forest' entry exists in `results`.
if 'Random Forest' in results:
    rf_model = results['Random Forest']['model']
    # Pair each importance with its column name from X; this relies on the
    # model having been fitted on the columns in X's original order.
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False).head(15)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Reversed Blues ramp: the most important feature gets the darkest bar.
    colors = plt.cm.Blues(np.linspace(0.3, 1, len(importance)))[::-1]
    ax.barh(importance['feature'], importance['importance'], color=colors)
    ax.set_xlabel('Importance')
    ax.set_title('Top 15 Feature Importance (Random Forest)')
    # barh draws the first row at the bottom; invert so the top feature is on top.
    ax.invert_yaxis()
    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs('../reports/figures', exist_ok=True)
    plt.savefig('../reports/figures/feature_importance.png', dpi=150)
    plt.show()

print("\n✓ Classification analysis complete!")
