# Flight Delay Prediction Model

This notebook trains a machine learning model that predicts whether a flight's departure will be delayed by more than 15 minutes, based on the day of the week and the origin airport.
We will:
1. Load and explore the flight data
2. Clean the data by handling missing values
3. Create a model to predict delays > 15 minutes
4. Save the model for external use
5. Export airport data to a separate file

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle
import os

In [None]:
# Read the raw flight records into a DataFrame.
# (Per the original note, the file describes US flights in 2013.)
flights_path = 'data/flights.csv'
df = pd.read_csv(flights_path)

# Report the (rows, columns) tuple, then preview the top of the table.
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
df.head()

In [None]:
# Explore the dataset structure
print("Dataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Short legend for the columns this notebook relies on.
print("\nColumn descriptions:")
print("- DayOfWeek: Day of the week (1=Monday, 7=Sunday)")
print("- OriginAirportID: Unique identifier for origin airport")
print("- DepDel15: 1 if departure delayed >15 minutes, 0 otherwise")
print("- ArrDel15: 1 if arrival delayed >15 minutes, 0 otherwise")

In [None]:
# Count nulls per column and show only the columns that actually have some.
print("Missing values per column:")
missing_values = df.isna().sum()
print(missing_values.loc[missing_values.gt(0)])

# Quick look at the value ranges of the columns used for modelling.
print("\nUnique values in key columns:")
day_values = sorted(df['DayOfWeek'].unique())
airport_count = df['OriginAirportID'].nunique()
delay_counts = df['DepDel15'].value_counts().to_dict()
print(f"Days of week: {day_values}")
print(f"Number of unique airports: {airport_count}")
print(f"Delay distribution (DepDel15): {delay_counts}")

In [None]:
# Data cleaning: every null in the frame becomes 0 (as specified in the
# requirements), and the DepDel15 label is cast to an integer column.
print("Cleaning data: replacing null values with 0...")

# Build the cleaned frame in one expression; the raw `df` is left untouched.
df_clean = df.fillna(0).assign(
    DepDel15=lambda frame: frame['DepDel15'].astype(int)
)

# Total remaining nulls across all columns.
remaining_nulls = df_clean.isnull().sum().sum()
print("After cleaning:")
print(f"Missing values: {remaining_nulls}")

cleaned_delay_counts = df_clean['DepDel15'].value_counts().to_dict()
print(f"\nDelay distribution after cleaning: {cleaned_delay_counts}")

In [None]:
# Assemble the modelling inputs from the cleaned frame.
#   inputs : DayOfWeek, OriginAirportID
#   label  : DepDel15 (1 if delayed >15 minutes, 0 otherwise)
features = ['DayOfWeek', 'OriginAirportID']

X, y = df_clean[features], df_clean['DepDel15']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Summary statistics of the two input columns.
print("\nFeature statistics:")
print(X.describe())

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so that it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

# Relative frequency of each class in the training labels.
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))

In [None]:
# Fit a Random Forest classifier on the training split.
print("Training Random Forest model...")

# Hyperparameters for the forest.
rf_params = {
    'n_estimators': 100,      # number of trees
    'max_depth': 10,          # cap tree depth to limit overfitting
    'min_samples_split': 5,
    'random_state': 42,       # reproducible training
}

# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

print("Model training completed!")

In [None]:
# Score the fitted model on the held-out test split.
print("Evaluating model performance...")

# Hard class predictions plus the predicted probability of class column 1.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Pair each feature name with its importance, most important first.
importance_table = pd.DataFrame(
    {'feature': features, 'importance': model.feature_importances_}
)
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

In [None]:
# Test the model with example predictions
print("Testing model with examples:")

# Scenario label -> (DayOfWeek, OriginAirportID), in the same order as
# `features`. Airport IDs: 13930 = Chicago O'Hare, 12478 = JFK, 12892 = LAX.
example_flights = {
    "Monday at Chicago O'Hare": (1, 13930),
    "Friday at JFK": (5, 12478),
    "Sunday at LAX": (7, 12892),
}

# The model was fitted on a DataFrame, so it remembers its column names.
# Scoring a DataFrame with the same columns (instead of bare nested lists)
# avoids scikit-learn's "X does not have valid feature names" warning and
# makes the column order explicit.
example_frame = pd.DataFrame(list(example_flights.values()), columns=features)

# Column 1 of predict_proba is taken as the "delayed" class, as elsewhere in
# this notebook; all scenarios are scored in a single call.
example_probs = model.predict_proba(example_frame)[:, 1]

for label, prob in zip(example_flights, example_probs):
    print(f"Probability of delay on {label}: {prob:.4f} ({prob*100:.2f}%)")

In [None]:
# Persist the fitted classifier and a small metadata record with pickle.
print("Saving the model...")

model_filename = 'model/flight_delay_model.pkl'

# Make sure the output folder exists before writing into it.
os.makedirs('model', exist_ok=True)

with open(model_filename, 'wb') as model_out:
    pickle.dump(model, model_out)

print(f"Model saved to: {model_filename}")

# Companion record: the feature list, model type, test accuracy and a
# description, kept for reference alongside the model file.
feature_info = dict(
    features=features,
    model_type='RandomForestClassifier',
    accuracy=accuracy,
    description='Predicts probability of flight delay >15 minutes based on day of week and airport',
)

with open('model/model_info.pkl', 'wb') as info_out:
    pickle.dump(feature_info, info_out)

print("Model information saved to: model/model_info.pkl")

In [None]:
# Build a lookup table of every airport seen as an origin or a destination
# and write it out as a CSV file.
print("Creating airports CSV file...")

# Shared output schema and the matching source columns on each side.
airport_fields = ['AirportID', 'AirportName', 'City', 'State']
origin_fields = ['OriginAirportID', 'OriginAirportName', 'OriginCity', 'OriginState']
dest_fields = ['DestAirportID', 'DestAirportName', 'DestCity', 'DestState']

# rename() returns a new frame, so df_clean itself is not modified.
origin_airports = df_clean[origin_fields].rename(
    columns=dict(zip(origin_fields, airport_fields))
)
dest_airports = df_clean[dest_fields].rename(
    columns=dict(zip(dest_fields, airport_fields))
)

# Stack both sides, keep the first row seen for each AirportID, order by ID.
all_airports = (
    pd.concat([origin_airports, dest_airports])
    .drop_duplicates(subset=['AirportID'])
    .sort_values('AirportID')
    .reset_index(drop=True)
)

print(f"Total unique airports: {len(all_airports)}")
print("\nFirst 5 airports:")
print(all_airports.head())

# Write the table without the positional index column.
airports_filename = 'data/airports.csv'
all_airports.to_csv(airports_filename, index=False)
print(f"\nAirports data saved to: {airports_filename}")

In [None]:
# Summary of what we accomplished.
# The status icons were stored as mojibake (UTF-8 emoji bytes decoded with the
# wrong codec); they are restored to the intended characters here.
print("=== SUMMARY ===")
print(f"✅ Loaded and cleaned {df.shape[0]:,} flight records")
print(f"✅ Created Random Forest model with {accuracy:.4f} accuracy")
print(f"✅ Saved model to: {model_filename}")
print(f"✅ Created airports file with {len(all_airports)} unique airports")
print(f"✅ Saved airports to: {airports_filename}")
print("\n🎯 The model can now predict flight delay probabilities based on day of week and airport!")

# List every artifact written by the earlier cells.
print("\n📁 Files created:")
print(f"   - {model_filename}")
print("   - model/model_info.pkl")  # plain string: nothing to interpolate
print(f"   - {airports_filename}")