# Flight Delay Prediction - Feature Engineering

This notebook focuses on creating advanced features for flight delay prediction, including weather integration, aircraft lag effects, and airport congestion metrics.

## Objectives
1. Create time-based cyclical features
2. Engineer airport congestion metrics
3. Build aircraft lag features (cascading delays)
4. Integrate weather data
5. Create airline and route-specific features
6. Prepare final dataset for modeling

In [None]:
# Import libraries
import json
import sys
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add src to path
sys.path.append('../src')

from features.feature_engineering import FlightFeatureEngineer
from data.weather_data import WeatherDataCollector
from visualization.plots import FlightDelayVisualizer

warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")

%matplotlib inline

## 1. Load Processed Data

In [None]:
# Load the flight data written by the exploration notebook. When the processed
# CSV is missing, fall back to a freshly loaded sample and rebuild the target.
data_path = '../data/processed/airline_exploration.csv'

if Path(data_path).exists():
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df):,} flights with {len(df.columns)} columns")
    print(f"Date range: {df['FL_DATE'].min()} to {df['FL_DATE'].max()}")
else:
    print(f"Data file not found at {data_path}")
    print("Please run the 01_data_exploration.ipynb notebook first.")
    # Load sample data as fallback (imported lazily: only needed on this path)
    from data.download_data import load_airline_data
    df = load_airline_data(year=2023, sample_size=50000)
    if df is None:
        # Fail here with a clear message instead of an AttributeError on
        # `df.shape` a few lines further down.
        raise RuntimeError(
            f"No flight data available: {data_path} is missing and the "
            "fallback sample could not be loaded."
        )
    # Keep only flights that actually operated and have a recorded arrival
    # delay, then label a flight as delayed when it arrived >15 minutes late.
    df = df[(df.get('CANCELLED', 0) != 1) & (df['ARR_DELAY'].notna())].copy()
    df['delayed'] = (df['ARR_DELAY'] > 15).astype(int)

print(f"\nDataset shape: {df.shape}")
print(f"Delay rate: {df['delayed'].mean()*100:.1f}%")
display(df.head())

## 2. Initialize Feature Engineer

In [None]:
# Build the two helper objects shared by every later section:
# the feature-engineering pipeline and the plotting helper.
engineer, viz = FlightFeatureEngineer(), FlightDelayVisualizer()

# Report the column count we start from so later "Added N" messages have context.
starting_column_count = len(df.columns)
print("Feature engineering pipeline initialized")
print(f"Starting with {starting_column_count} columns")

## 3. Time-Based Feature Engineering

In [None]:
# Create time-based features
print("Creating time-based features...")
df_time = engineer.create_time_features(df)

print(f"Added {len(df_time.columns) - len(df.columns)} time-based features")

# Display new time features
new_time_cols = [col for col in df_time.columns if col not in df.columns]
print(f"New time features: {new_time_cols}")

# Visualize cyclical features: each (sin, cos) pair should trace a circle.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (axis, base column, axis-label prefix) for every cyclical encoding we plot.
cyclical_panels = [
    (axes[0, 0], 'month', 'Month'),
    (axes[0, 1], 'day_of_week', 'Day of Week'),
    (axes[1, 0], 'dep_hour', 'Departure Hour'),
]

for ax, base_col, label in cyclical_panels:
    required_cols = [f'{base_col}_sin', f'{base_col}_cos', base_col]
    if any(name not in df_time.columns for name in required_cols):
        # Hide the panel rather than leave an empty, unlabeled axis.
        ax.set_visible(False)
        continue
    # 'hsv' is a built-in cyclic colormap; the previous names
    # ('tab12', 'tab7', 'tab24') do not exist in Matplotlib and raised ValueError.
    ax.scatter(df_time[f'{base_col}_sin'], df_time[f'{base_col}_cos'],
               c=df_time[base_col], cmap='hsv')
    ax.set_xlabel(f'{label} Sin')
    ax.set_ylabel(f'{label} Cos')
    ax.set_title(f'{label} Cyclical Encoding')

# Delay rate by time features
if 'is_weekend' in df_time.columns:
    weekend_delays = df_time.groupby('is_weekend')['delayed'].mean()
    # Derive labels/colors from the groups actually present so the chart still
    # renders when the data contains only weekdays or only weekends.
    day_types = ['Weekend' if flag else 'Weekday' for flag in weekend_delays.index]
    bar_colors = ['orange' if flag else 'skyblue' for flag in weekend_delays.index]
    axes[1, 1].bar(day_types, weekend_delays.values, color=bar_colors)
    axes[1, 1].set_ylabel('Delay Rate')
    axes[1, 1].set_title('Delay Rate: Weekday vs Weekend')
else:
    axes[1, 1].set_visible(False)

plt.tight_layout()
plt.show()

# Carry the enriched frame forward to the next section.
df = df_time

## 4. Airport and Route Features

In [None]:
# Create airport and route features
print("Creating airport and route features...")
df_airport = engineer.create_airport_features(df)

print(f"Added {len(df_airport.columns) - len(df.columns)} airport/route features")

# Analyze major route impact: flight volume and delay rate per route class.
if 'is_major_route' in df_airport.columns:
    major_route_analysis = df_airport.groupby('is_major_route')['delayed'].agg(['count', 'mean'])
    major_route_analysis.columns = ['flight_count', 'delay_rate']
    # Label each group from its own flag value. Assigning a fixed two-element
    # list raised a length-mismatch error when only one route class was present.
    route_labels = {0: 'Non-Major Route', 1: 'Major Route'}
    major_route_analysis.index = [
        route_labels.get(int(flag), str(flag)) for flag in major_route_analysis.index
    ]

    print("\nMajor Route Analysis:")
    display(major_route_analysis)

    # Visualize volume and delay rate side by side
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    major_route_analysis['flight_count'].plot(kind='bar', ax=ax1, color='lightblue')
    ax1.set_title('Flight Volume by Route Type')
    ax1.set_ylabel('Number of Flights')
    ax1.tick_params(axis='x', rotation=45)

    major_route_analysis['delay_rate'].plot(kind='bar', ax=ax2, color='coral')
    ax2.set_title('Delay Rate by Route Type')
    ax2.set_ylabel('Delay Rate')
    ax2.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

# Carry the enriched frame forward to the next section.
df = df_airport

## 5. Aircraft Lag Features (Cascading Delays)

In [None]:
# Create aircraft lag features
print("Creating aircraft lag features...")
print("This may take a few minutes for large datasets...")

df_lag = engineer.create_aircraft_lag_features(df)

print(f"Added {len(df_lag.columns) - len(df.columns)} aircraft lag features")

# Analyze aircraft lag impact: does a late inbound aircraft predict a late flight?
if 'prev_flight_delayed' in df_lag.columns:
    lag_analysis = df_lag.groupby('prev_flight_delayed')['delayed'].agg(['count', 'mean'])
    lag_analysis.columns = ['flight_count', 'delay_rate']
    # Label each group from its own flag value so the cell still works when
    # only one of the two groups is present in the data.
    lag_labels = {0: 'Previous Flight On-Time', 1: 'Previous Flight Delayed'}
    lag_analysis.index = [
        lag_labels.get(int(flag), str(flag)) for flag in lag_analysis.index
    ]

    print("\nAircraft Lag Effect Analysis:")
    display(lag_analysis)

    # Calculate correlation between previous and current delay
    if 'prev_flight_arr_delay' in df_lag.columns:
        valid_prev_delays = df_lag[df_lag['prev_flight_arr_delay'].notna()]
        correlation = valid_prev_delays['prev_flight_arr_delay'].corr(valid_prev_delays['ARR_DELAY'])
        print(f"\nCorrelation between previous and current flight delay: {correlation:.3f}")

        # Visualize lag effect
        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        # Previous delay impact
        lag_analysis['delay_rate'].plot(kind='bar', ax=axes[0], color='orange')
        axes[0].set_title('Current Delay Rate by Previous Flight Status')
        axes[0].set_ylabel('Current Flight Delay Rate')
        axes[0].tick_params(axis='x', rotation=45)

        # Scatter plot of previous vs current delay on a capped sample.
        # The sample is seeded so the figure is identical on every run.
        sample_size = min(5000, len(valid_prev_delays))
        sample_data = valid_prev_delays.sample(sample_size, random_state=42)

        axes[1].scatter(sample_data['prev_flight_arr_delay'], sample_data['ARR_DELAY'],
                        alpha=0.3, s=10, color='red')
        axes[1].set_xlabel('Previous Flight Delay (minutes)')
        axes[1].set_ylabel('Current Flight Delay (minutes)')
        axes[1].set_title('Previous vs Current Flight Delay')
        # Zoom on the bulk of the distribution; extreme delays fall outside the view.
        axes[1].set_xlim(-50, 100)
        axes[1].set_ylim(-50, 100)

        # Time since last flight distribution (gaps under 24 hours only)
        if 'hours_since_last_flight' in df_lag.columns:
            valid_hours = df_lag[df_lag['hours_since_last_flight'].notna() &
                                 (df_lag['hours_since_last_flight'] < 24)]

            axes[2].hist(valid_hours['hours_since_last_flight'], bins=30, alpha=0.7, color='green')
            axes[2].set_xlabel('Hours Since Last Flight')
            axes[2].set_ylabel('Frequency')
            axes[2].set_title('Distribution of Time Between Flights')
        else:
            # Hide the panel rather than leave an empty, unlabeled axis.
            axes[2].set_visible(False)

        plt.tight_layout()
        plt.show()

# Carry the enriched frame forward to the next section.
df = df_lag

## 6. Airport Congestion Features

In [None]:
# Create congestion features
print("Creating airport congestion features...")
df_congestion = engineer.create_congestion_features(df)

print(f"Added {len(df_congestion.columns) - len(df.columns)} congestion features")

# Analyze congestion impact
congestion_cols = ['origin_departures_per_hour', 'dest_arrivals_per_hour']
available_congestion = [col for col in congestion_cols if col in df_congestion.columns]

if available_congestion:
    # Bucket the hourly traffic counts into congestion levels. These
    # `<col>_category` columns are added to df_congestion and therefore travel
    # with the dataset into later sections.
    for col in available_congestion:
        df_congestion[f'{col}_category'] = pd.cut(
            df_congestion[col],
            bins=[0, 10, 20, 30, float('inf')],
            labels=['Low', 'Medium', 'High', 'Very High'],
            include_lowest=True,  # a count of exactly 0 is 'Low', not NaN
        )

    # Analyze congestion vs delays
    if 'origin_departures_per_hour_category' in df_congestion.columns:
        # observed=False keeps every congestion level in the table (even empty
        # ones) and makes the choice explicit instead of relying on the pandas
        # default, which is deprecated for categorical groupers.
        congestion_analysis = (
            df_congestion
            .groupby('origin_departures_per_hour_category', observed=False)['delayed']
            .agg(['count', 'mean'])
        )
        congestion_analysis.columns = ['flight_count', 'delay_rate']

        print("\nOrigin Airport Congestion Analysis:")
        display(congestion_analysis)

        # Visualize congestion impact
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        congestion_analysis['delay_rate'].plot(kind='bar', ax=axes[0], color='purple')
        axes[0].set_title('Delay Rate by Origin Airport Congestion')
        axes[0].set_ylabel('Delay Rate')
        axes[0].tick_params(axis='x', rotation=45)

        # Scatter plot of congestion vs delay on a capped sample.
        # The sample is seeded so the figure is identical on every run.
        sample_data = df_congestion.sample(min(5000, len(df_congestion)), random_state=42)
        axes[1].scatter(sample_data['origin_departures_per_hour'], sample_data['delayed'],
                        alpha=0.3, s=10, color='purple')
        axes[1].set_xlabel('Departures per Hour')
        axes[1].set_ylabel('Delayed (0/1)')
        axes[1].set_title('Airport Congestion vs Delay Probability')

        plt.tight_layout()
        plt.show()

# Carry the enriched frame forward to the next section.
df = df_congestion

## 7. Airline Features

In [None]:
# Derive per-airline features from the running dataset
print("Creating airline-specific features...")
df_airline = engineer.create_airline_features(df)

added_airline_count = len(df_airline.columns) - len(df.columns)
print(f"Added {added_airline_count} airline features")

# Keep only the airline aggregates the engineer actually produced
airline_feature_cols = ['airline_avg_delay', 'airline_delay_std', 'route_airline_avg_delay']
available_airline_features = list(
    filter(lambda name: name in df_airline.columns, airline_feature_cols)
)

if len(available_airline_features) > 0:
    print("\nAirline Feature Statistics:")
    display(df_airline[available_airline_features].describe())

    # One histogram per available feature, laid out in a single row.
    # squeeze=False always yields a 2-D array of axes, so a single feature
    # needs no special-casing.
    panel_count = len(available_airline_features)
    fig, axes = plt.subplots(1, panel_count, figsize=(5*panel_count, 5), squeeze=False)
    axes = axes.ravel()

    for ax, feature_name in zip(axes, available_airline_features):
        df_airline[feature_name].hist(bins=30, ax=ax, alpha=0.7)
        ax.set_title(f'Distribution of {feature_name}')
        ax.set_xlabel(feature_name)
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

# Hand the enriched frame to the next section
df = df_airline

## 8. Weather Data Integration

In [None]:
# Weather data integration
print("Integrating weather data...")
print("Note: This uses mock weather data for demonstration. Set WEATHER_API_KEY environment variable for real data.")

# NOTE(review): the collector is constructed but not used below — the demo
# relies entirely on the synthetic values generated in this cell.
weather_collector = WeatherDataCollector()

# For demo purposes, we'll add mock weather data to a sample
sample_size = min(1000, len(df))  # Limit for demo
df_sample = df.sample(sample_size, random_state=42)

# Add mock weather features. The seed makes the synthetic values reproducible;
# the draw order below must stay fixed for the same values to come out.
np.random.seed(42)
weather_features = {
    'origin_temperature': np.random.normal(60, 20, sample_size),
    'origin_humidity': np.random.normal(50, 20, sample_size),
    'origin_wind_speed': np.random.gamma(2, 5, sample_size),
    'origin_visibility': np.random.gamma(5, 2, sample_size),
    'origin_precipitation': np.random.exponential(0.1, sample_size),
    'dest_temperature': np.random.normal(65, 18, sample_size),
    'dest_humidity': np.random.normal(55, 18, sample_size),
    'dest_wind_speed': np.random.gamma(2, 5, sample_size),
    'dest_visibility': np.random.gamma(5, 2, sample_size),
    'dest_precipitation': np.random.exponential(0.1, sample_size)
}

for feature, values in weather_features.items():
    df_sample[feature] = values

print(f"Added weather features to {sample_size} flights")
print(f"Weather features: {list(weather_features.keys())}")

# Snapshot the input columns BEFORE calling the engineer: if it adds columns to
# its argument in place, comparing against df_sample afterwards would find
# nothing new and silently skip the whole analysis.
columns_before_weather = set(df_sample.columns)

# Create weather-derived features
df_weather = engineer.create_weather_features(df_sample)

weather_derived = [col for col in df_weather.columns if col not in columns_before_weather]
print(f"\nDerived weather features: {weather_derived}")

# Analyze weather impact (using sample with weather data)
if weather_derived:
    print("\nWeather Impact Analysis:")

    # Analyze adverse weather impact
    weather_impact_cols = [col for col in weather_derived if 'adverse_weather' in col]
    weather_labels = {0: 'Normal Weather', 1: 'Adverse Weather'}

    for col in weather_impact_cols:
        impact = df_weather.groupby(col)['delayed'].agg(['count', 'mean'])
        impact.columns = ['flight_count', 'delay_rate']
        # Label each group from its own flag value; a fixed two-element list
        # fails when the sample contains only one weather class.
        impact.index = [weather_labels.get(int(flag), str(flag)) for flag in impact.index]
        print(f"\n{col}:")
        display(impact)

    # Visualize weather impact
    viz.plot_weather_impact(df_weather)

# For the main dataset, we'll continue without weather features
# In a real scenario, you would integrate weather data for the full dataset
print("\nContinuing with main dataset (without weather features for this demo)")

## 9. Categorical Encoding

In [None]:
# Turn categorical columns into model-ready encodings
print("Encoding categorical features...")
df_encoded = engineer.encode_categorical_features(df)

encoded_added_count = len(df_encoded.columns) - len(df.columns)
print(f"Added {encoded_added_count} encoded features")

# Collect every column whose name ends in '_encoded'
encoded_cols = list(filter(lambda name: name.endswith('_encoded'), df_encoded.columns))
print(f"Encoded features: {encoded_cols}")

if len(encoded_cols) > 0:
    print("\nSample of encoded values:")
    display(df_encoded[encoded_cols].head(10))

# Hand the encoded frame to the final section
df = df_encoded

## 10. Final Feature Selection and Dataset Preparation

In [None]:
# Final feature summary
print("=== FEATURE ENGINEERING SUMMARY ===")
# `df` is the fully engineered frame at this point, so this is the final column
# count (the previous "Original columns" label was misleading).
print(f"Total columns: {len(df.columns)}")
print(f"Final dataset shape: {df.shape}")

# Categorize features by substring match on the column name. The buckets are a
# rough guide only: a column can land in several categories (for example a name
# containing both 'origin' and 'hour').
feature_categories = {
    'Time Features': [col for col in df.columns if any(x in col for x in ['hour', 'day', 'month', 'weekend', 'holiday', 'sin', 'cos'])],
    'Airport Features': [col for col in df.columns if any(x in col for x in ['origin', 'dest', 'major', 'route', 'departures', 'arrivals'])],
    'Aircraft Features': [col for col in df.columns if any(x in col for x in ['prev_flight', 'aircraft', 'hours_since', 'tail'])],
    'Airline Features': [col for col in df.columns if any(x in col for x in ['airline', 'carrier'])],
    'Weather Features': [col for col in df.columns if any(x in col for x in ['weather', 'temperature', 'wind', 'visibility', 'precipitation'])],
    'Encoded Features': [col for col in df.columns if col.endswith('_encoded')],
    'Target': ['delayed']
}

for category, features in feature_categories.items():
    available_features = [f for f in features if f in df.columns]
    print(f"\n{category} ({len(available_features)}): {available_features[:5]}{'...' if len(available_features) > 5 else ''}")

# Check for missing values in engineered features
missing_summary = df.isnull().sum()
missing_features = missing_summary[missing_summary > 0]

if len(missing_features) > 0:
    print("\n=== MISSING VALUES IN ENGINEERED FEATURES ===")
    print(missing_features.head(10))
else:
    print("\n✅ No missing values in engineered features")

# Feature correlation with target. corrwith computes only the column we need
# instead of the full pairwise matrix. `target_correlations` still contains the
# target itself so later cells see the same series as before.
numerical_features = df.select_dtypes(include=[np.number]).columns
target_correlations = (
    df[numerical_features]
    .corrwith(df['delayed'])
    .abs()
    .sort_values(ascending=False)
)

print("\n=== TOP 15 FEATURES BY CORRELATION WITH TARGET ===")
# Leave the target out of its own ranking: its self-correlation of 1.0 carries
# no information and would take the first slot.
display(target_correlations.drop(labels='delayed', errors='ignore').head(15))

In [None]:
# Heatmap of pairwise correlations among the columns most correlated with the target
top_features = target_correlations.index[:20].tolist()

if len(top_features) > 1:
    fig, ax = plt.subplots(figsize=(12, 8))

    # Pairwise correlations restricted to the selected columns
    correlation_matrix = df[top_features].corr()

    # Mask the upper triangle (diagonal included) so each pair is drawn once
    mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
    sns.heatmap(
        correlation_matrix,
        mask=mask,
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=0.5,
        cbar_kws={"shrink": .8},
        ax=ax,
    )
    ax.set_title('Correlation Matrix - Top 20 Features')
    fig.tight_layout()
    plt.show()

In [None]:
# Save engineered dataset
output_path = '../data/processed/flight_features_engineered.csv'
# Create the output directory if needed: it may not exist yet when the notebook
# ran from the fallback sample instead of the processed CSV.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print(f"\n✅ Engineered dataset saved to {output_path}")
print(f"Final shape: {df.shape}")
print(f"Target distribution: {df['delayed'].mean()*100:.1f}% delayed")

# Save feature metadata
feature_metadata = {
    'total_features': len(df.columns),
    'feature_categories': {k: len([f for f in v if f in df.columns]) for k, v in feature_categories.items()},
    # Exclude the target (self-correlation 1.0 is not a feature) and undefined
    # correlations (NaN is not valid strict JSON).
    'top_features': (
        target_correlations
        .drop(labels='delayed', errors='ignore')
        .dropna()
        .head(15)
        .to_dict()
    ),
    'dataset_shape': list(df.shape),
    'delay_rate': float(df['delayed'].mean())
}

with open('../data/processed/feature_metadata.json', 'w') as f:
    json.dump(feature_metadata, f, indent=2)

print("\n✅ Feature metadata saved")
print("\n=== READY FOR MODELING ===")
print("Proceed to 03_model_training.ipynb for model development")