This notebook is designed to make real-time flight delay predictions by integrating live data from the APIs. It replicates the preprocessing steps applied during model training and uses the trained ensemble model to predict delays. The notebook includes functions for fetching real-time data, matching flight times with weather conditions, and generating delay predictions in a practical, live environment.

## Data Fetching

In [1]:
import os
import requests
import json
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get API keys from environment variables
AVIATION_EDGE_API_KEY = os.getenv('AVIATION_EDGE_API_KEY')
OPENWEATHER_API_KEY = os.getenv('OPENWEATHER_API_KEY')

# Coordinates for JFK Airport
LAT = 40.6413
LON = -73.7781

def fetch_flight_data(API_KEY, code, flight_type, status):
    params = {
        'key': API_KEY,
        'iataCode': code,
        'type': flight_type,
        'status': status
    }
    response = requests.get(f"https://aviation-edge.com/v2/public/timetable", params=params)
    
    # Debugging: Print the request URL to check for correctness
    print(response.url)
    
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}, Response: {response.text}")
        return None

def fetch_weather_forecast(api_key, lat, lon):
    """
    Fetch the hourly weather forecast for the next 48 hours for a specific location.

    Parameters:
    - api_key: OpenWeather API key (string)
    - lat: Latitude of the location (float)
    - lon: Longitude of the location (float)

    Returns:
    - JSON response from the OpenWeather API, or None if an error occurs.
    """
    url = f"https://pro.openweathermap.org/data/2.5/forecast/hourly?lat={lat}&lon={lon}&appid={api_key}&units=imperial"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        print("Weather Forecast Data (Next 48 Hours):")
        return data
    else:
        print(f"Error fetching weather data: {response.status_code}")
        print(response.text)
        return None

def round_time_to_nearest_hour(dt):
    # Round the datetime object to the nearest hour
    return (dt.replace(second=0, microsecond=0, minute=0, hour=dt.hour)
            + timedelta(hours=dt.minute // 30))

def match_flight_with_weather(df_flights, forecast_data):
    # Create a copy of the DataFrame to avoid SettingWithCopyWarning
    df_flights = df_flights.copy()

    # Round flight times to the nearest hour
    df_flights['rounded_scheduledTime'] = df_flights['departure.scheduledTime'].apply(
        lambda x: round_time_to_nearest_hour(datetime.strptime(x, "%Y-%m-%dT%H:%M:%S.%f"))
    )

    # Convert forecast times to datetime for easier matching
    forecast_list = []
    for entry in forecast_data['list']:
        forecast_time = datetime.strptime(entry['dt_txt'], "%Y-%m-%d %H:%M:%S")
        forecast_list.append({
            'forecast_time': forecast_time,
            'temp': entry['main']['temp'],
            'pressure': entry['main']['pressure'],
            'humidity': entry['main']['humidity'],
            'visibility': entry.get('visibility', 0),
            'wind_speed': entry['wind']['speed'],
            'wind_deg': entry['wind']['deg'],
            'clouds_all': entry['clouds']['all'],
            'weather_main': entry['weather'][0]['main'],
            'weather_description': entry['weather'][0]['description']
        })

    df_weather = pd.DataFrame(forecast_list)

    # Merge on the rounded_scheduledTime with forecast_time
    df_merged = pd.merge(df_flights, df_weather, left_on="rounded_scheduledTime", right_on="forecast_time", how="left")

    # Drop unnecessary columns
    df_merged = df_merged.drop(columns=["rounded_scheduledTime", "forecast_time"])
    
    return df_merged


def main():
    code = 'JFK'  # JFK airport code
    flight_type = 'departure'  # 'arrival' or 'departure'
    status = 'scheduled'
    
    # Fetch flight data
    flight_data = fetch_flight_data(AVIATION_EDGE_API_KEY, code, flight_type, status)
    
    if flight_data:
        # Normalize the JSON response to a flat table
        df_flights = pd.json_normalize(
            flight_data,
            sep='.',  # This will join nested keys with a dot
            errors='ignore'  # Ignore errors in normalization if there are any
        )
        
        print("Flight DataFrame after normalization:")
        print(df_flights.head())  # Debugging: Inspect the normalized flight data
        
      
        # List of columns to keep
        columns_to_keep = [
            'departure.terminal',
            'departure.gate',
            'departure.delay',
            'departure.scheduledTime',
            'arrival.iataCode',     
            'arrival.scheduledTime',         
            'airline.iataCode',
            'flight.iataNumber',        
        ]

        
        # Ensure all necessary columns are present in the DataFrame
        for col in columns_to_keep:
            if col not in df_flights.columns:
                df_flights[col] = None  # Adding missing columns with None values
        
        # Filter the DataFrame to keep only the required columns
        df_flights_filtered = df_flights[columns_to_keep]
        
        print("Flight DataFrame after filtering:")
        print(df_flights_filtered.head())  # Debugging: Inspect the filtered flight data
        
        # Fetch the weather forecast data
        forecast_data = fetch_weather_forecast(OPENWEATHER_API_KEY, LAT, LON)
        
        if forecast_data:
            print("Weather forecast data fetched successfully")
            
            # Match flights with weather data
            df_merged = match_flight_with_weather(df_flights_filtered, forecast_data)
            
            print("Merged DataFrame:")
            print(df_merged.head())  # Debugging: Inspect the merged DataFrame
            
            # Optionally save the DataFrame to a CSV file
            df_merged.to_csv("jfk_flight_weather_data.csv", index=False)
            
            # Return the merged DataFrame
            return df_merged
        
        else:
            print("Failed to fetch weather forecast data")
            return None
    else:
        print("Failed to fetch flight data")
        return None

if __name__ == "__main__":
    df_merged = main()


https://aviation-edge.com/v2/public/timetable?key=a17617-30a161&iataCode=JFK&type=departure&status=scheduled
Flight DataFrame after normalization:
  codeshared     status       type airline.iataCode airline.icaoCode  \
0       None  scheduled  departure               BA              BAW   
1       None  scheduled  departure               DL              DAL   
2       None  scheduled  departure               DL              DAL   
3       None  scheduled  departure               DL              DAL   
4       None  scheduled  departure               AA              AAL   

        airline.name arrival.actualRunway arrival.actualTime arrival.baggage  \
0    British Airways                 None               None            None   
1    Delta Air Lines                 None               None              12   
2    Delta Air Lines                 None               None              11   
3    Delta Air Lines                 None               None              31   
4  American Airlines

## Preprocessing and Prediction 

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# Handling missing values (if any)
# df_merged = df_merged.dropna()

# 1. Lag Features
# Convert the departure.scheduledTime to datetime if it's not already
df_merged['departure.scheduledTime'] = pd.to_datetime(df_merged['departure.scheduledTime'])
df_merged['arrival.scheduledTime'] = pd.to_datetime(df_merged['arrival.scheduledTime'])


# Ensure the data is sorted by the scheduled time
df_merged = df_merged.sort_values('departure.scheduledTime')

# Set the departure.scheduledTime as the index to facilitate time-based rolling calculations
df_merged = df_merged.set_index('departure.scheduledTime')

# Calculate the rolling average of departure delay based on the last 2 hours
df_merged['rolling_avg_delay_2hr'] = df_merged['departure.delay'].rolling('2h').mean()

# Reset the index to return to the original structure
df_merged = df_merged.reset_index()

# 2. Cyclical Encoding of Time Features
# Extract relevant time features from departure.scheduledTime
df_merged['hour'] = df_merged['departure.scheduledTime'].dt.hour
df_merged['day_of_week'] = df_merged['departure.scheduledTime'].dt.dayofweek
df_merged['day_of_month'] = df_merged['departure.scheduledTime'].dt.day
df_merged['month'] = df_merged['departure.scheduledTime'].dt.month
df_merged['is_weekend'] = df_merged['departure.scheduledTime'].dt.dayofweek >= 5

# Define seasons based on months (Winter: 1, Spring: 2, Summer: 3, Fall: 4)
def get_season(month):
    if month in [12, 1, 2]:
        return 1  # Winter
    elif month in [3, 4, 5]:
        return 2  # Spring
    elif month in [6, 7, 8]:
        return 3  # Summer
    elif month in [9, 10, 11]:
        return 4  # Fall

df_merged['season'] = df_merged['month'].apply(get_season)

# Function to create cyclical features
def create_cyclical_features(df, column_name, max_value):
    df[f'{column_name}_sin'] = np.sin(2 * np.pi * df[column_name] / max_value)
    df[f'{column_name}_cos'] = np.cos(2 * np.pi * df[column_name] / max_value)
    return df

# Apply cyclical encoding
df_merged = create_cyclical_features(df_merged, 'hour', 24)
df_merged = create_cyclical_features(df_merged, 'day_of_week', 7)
df_merged = create_cyclical_features(df_merged, 'day_of_month', 31)
df_merged = create_cyclical_features(df_merged, 'month', 12)
df_merged = create_cyclical_features(df_merged, 'season', 4)

In [3]:
# 3. Handle Categorical Data using the loaded OneHotEncoder and LabelEncoders
# Load the existing OneHotEncoder and set handle_unknown to ignore
onehot_encoder = joblib.load('onehot_encoder.pkl')
one_hot_columns = ['weather_main', 'arrival.iataCode']
onehot_encoder.handle_unknown = 'ignore'

# Apply the encoder to the new data
df_merged[one_hot_columns] = df_merged[one_hot_columns].fillna('missing')
encoded_data = onehot_encoder.transform(df_merged[one_hot_columns])

# Create a DataFrame with the encoded features and proper column names
encoded_df = pd.DataFrame(encoded_data, columns=onehot_encoder.get_feature_names_out(one_hot_columns))

# Drop the original columns that have been one-hot encoded
df_merged = df_merged.drop(columns=one_hot_columns)

# Concatenate the encoded columns back to the original DataFrame
df_merged = pd.concat([df_merged.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)



In [4]:
label_encoders = {}
label_encode_columns = [
    'airline.iataCode', 
    'flight.iataNumber', 
    'weather_description', 
    'departure.gate', 
  
]

for column in label_encode_columns:
    # Load the specific label encoder for each column
    label_encoders[column] = joblib.load(f'label_encoder_{column}.pkl')
    
    # Identify any unseen categories
    unseen_categories = set(df_merged[column].unique()) - set(label_encoders[column].classes_)
    
    # If there are unseen categories, add them to the encoder's classes
    if unseen_categories:
        label_encoders[column].classes_ = np.append(label_encoders[column].classes_, list(unseen_categories))
    
    # Apply the transformation using the correct encoder
    df_merged[column] = label_encoders[column].transform(df_merged[column])

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# Load the saved scaler and model
scaler = joblib.load('scaler.pkl')
model = joblib.load('ensemble_model.pkl')

# 4. Prepare the Data for Scaling and Prediction

# Drop unnecessary columns
df_merged = df_merged.drop(columns=['departure.scheduledTime', 'arrival.scheduledTime'])

# Separate features (X) and target (y)
X = df_merged.drop(columns=['departure.delay'])  # Features

# Apply the scaler to the features
data_scaled = scaler.transform(X)

# Convert the scaled data back into a DataFrame
df_scaled = pd.DataFrame(data_scaled, columns=X.columns)

# 5. Ensure Proper Data Types

# Convert object types to categories where applicable
for col in df_scaled.select_dtypes(include=['object']).columns:
    df_scaled[col] = df_scaled[col].astype('category')

# Ensure other non-categorical columns are numeric
df_scaled = df_scaled.apply(pd.to_numeric)

# 6. Predicting the Delay

# Predicting the delay using the pretrained model
predicted_delay = model.predict(df_scaled)



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [8]:
# Add the predicted delay directly to the DataFrame
X['predicted_delay'] = predicted_delay


print("Flights with Predicted Delay over 10 minutes:")

X[X['predicted_delay'] > 10]

Flights with Predicted Delay over 10 minutes:


Unnamed: 0,departure.terminal,departure.gate,airline.iataCode,flight.iataNumber,temp,pressure,humidity,visibility,wind_speed,wind_deg,...,arrival.iataCode_yul,arrival.iataCode_yum,arrival.iataCode_yvr,arrival.iataCode_yyc,arrival.iataCode_yyr,arrival.iataCode_yyt,arrival.iataCode_yyz,arrival.iataCode_zaz,arrival.iataCode_zrh,predicted_delay
0,8,119,217,11932,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.937213
1,4,120,224,12024,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.549226
2,4,140,224,12000,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.497165
3,4,143,224,11926,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.497165
4,8,31,204,11971,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.806670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,7,119,219,12005,78.35,1014.0,52.0,10000.0,5.99,175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.831150
146,7,119,219,11978,78.35,1014.0,52.0,10000.0,5.99,175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.831150
147,1,119,228,12004,78.35,1014.0,52.0,10000.0,5.99,175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.841800
148,1,119,199,12008,78.35,1014.0,52.0,10000.0,5.99,175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.153830
