# ✈️ SkyLine-Delay Predictor

This notebook reproduces the functionality of `predictor.py`.  Each function is defined in its own cell, and at the end you can load the data, train the model, and run a prediction.

In [2]:
# 1. Imports

import datetime as dt
import glob
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [3]:
# 2. Constants

# Recursive glob, resolved relative to the current working directory, for
# files named like `AA-EWR2024.csv`.
CSV_GLOB_PATTERN = "**/*-*2024.csv"

# Column names applied, in this order, to every CSV that gets loaded.
COLS = [
    # flight identification
    "carrier",
    "date",
    "flight_num",
    "tail_num",
    "dest",
    # departure times
    "crs_dep_time",
    "dep_time",
    # total departure delay followed by its five delay components
    "dep_delay",
    "carrier_delay",
    "weather_delay",
    "nas_delay",
    "security_delay",
    "late_aircraft_delay",
]

In [4]:
# 3. Data Loading Function

@pd.api.extensions.register_dataframe_accessor("skyline")
class _SkylineLoader:
    """Placeholder accessor registered on DataFrames under the name ``skyline``.

    It only keeps a reference to the wrapped DataFrame and defines no helper
    methods; none of the cells visible in this notebook use it.

    NOTE(review): the decorator runs every time this cell is executed, so
    re-running the cell registers the accessor again — confirm whether the
    accessor is still wanted at all.
    """
    def __init__(self, pandas_obj):
        # The DataFrame the accessor is attached to.
        self._df = pandas_obj

def load_and_merge() -> pd.DataFrame:
    """
    Load every CSV matching ``CSV_GLOB_PATTERN`` and return one cleaned frame.

    Each file is read without a header row, with ``COLS`` as the column names
    and every value as a string.  An ``airport`` column is added from the file
    name: the first three characters of the second ``-``-separated piece of
    the stem (``AA-EWR2024.csv`` -> ``EWR``).

    Cleaning steps, in order:

    1. ``date`` is parsed as ``MM/DD/YYYY``; rows whose date does not parse
       are dropped (this also removes any header line, because files are
       read with ``header=None``).
    2. ``dep_delay`` and the five delay-component columns are converted to
       numbers; unparseable values become NaN.
    3. Rows without a numeric ``dep_delay`` are dropped.  Component columns
       may still contain NaN.
    4. ``day_of_year`` is derived from ``date``.

    Returns
    -------
    pd.DataFrame
        The concatenated, cleaned rows.  The index keeps the labels assigned
        at concatenation time, so it has gaps where rows were dropped.

    Raises
    ------
    FileNotFoundError
        If no file matches ``CSV_GLOB_PATTERN``.
    """
    num_cols = [
        "dep_delay", "carrier_delay", "weather_delay", "nas_delay",
        "security_delay", "late_aircraft_delay"
    ]

    frames = []
    # glob returns matches in filesystem order; sorting keeps the merged row
    # order identical from run to run and machine to machine.
    for fp in sorted(glob.glob(CSV_GLOB_PATTERN, recursive=True)):
        df = pd.read_csv(fp, header=None, names=COLS, dtype=str)
        df["airport"] = Path(fp).stem.split("-")[1][:3]
        frames.append(df)

    if not frames:
        raise FileNotFoundError("No CSV files like `AA-EWR2024.csv` found.")

    data = pd.concat(frames, ignore_index=True)

    # Clean & parse.  `.assign` builds a new frame at each step, so nothing is
    # written into the filtered result of a previous `dropna`.
    data = data.assign(
        date=pd.to_datetime(data["date"], format="%m/%d/%Y", errors="coerce")
    ).dropna(subset=["date"])

    data = data.assign(
        **{col: pd.to_numeric(data[col], errors="coerce") for col in num_cols}
    ).dropna(subset=["dep_delay"])

    return data.assign(day_of_year=data["date"].dt.dayofyear)

In [5]:
# 4. Model-Training Function

def train_model(df: pd.DataFrame):
    """
    Fit a LinearRegression that predicts ``dep_delay`` from the cleaned data.

    Features are the one-hot encoded ``carrier`` (columns prefixed
    ``airline_``) and ``airport`` (prefixed ``airport_``), followed by
    ``day_of_year`` and the five delay-component columns.

    Rows with a missing value in any numeric feature or in ``dep_delay`` are
    left out of training, because ``LinearRegression.fit`` rejects NaN input.
    A ``UserWarning`` reports how many rows were skipped.  When nothing is
    missing the result is the same as fitting on ``df`` directly.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``carrier``, ``airport``, ``day_of_year``, ``dep_delay``
        and the five ``*_delay`` component columns.

    Returns
    -------
    tuple
        ``(model, feature_order)``: the fitted estimator and the list of
        feature column names in the order used for fitting.
    """
    numeric_features = [
        "day_of_year", "carrier_delay", "weather_delay", "nas_delay",
        "security_delay", "late_aircraft_delay"
    ]

    # Keep only rows the estimator can actually consume.
    complete = df.dropna(subset=numeric_features + ["dep_delay"])
    n_skipped = len(df) - len(complete)
    if n_skipped:
        warnings.warn(
            f"train_model: skipping {n_skipped:,} of {len(df):,} rows with "
            "missing feature or target values.",
            UserWarning,
            stacklevel=2,
        )

    X_cat = pd.get_dummies(
        complete[["carrier", "airport"]],
        prefix=["airline", "airport"],
        drop_first=False
    )
    X = pd.concat([X_cat, complete[numeric_features]], axis=1)
    y = complete["dep_delay"].astype(float)

    model = LinearRegression().fit(X, y)
    return model, X.columns.tolist()

In [6]:
# 5. Helper for Typical Components

def typical_components(
    df: pd.DataFrame,
    carrier: str,
    airport: str
) -> pd.Series:
    """
    Average the five delay-component columns over the most specific
    historical slice that has any rows.

    Slices are tried in this order: rows matching both ``carrier`` and
    ``airport``, then rows matching ``carrier`` only, then the whole frame.
    Returns a Series indexed by the component column names.
    """
    component_cols = [
        "carrier_delay", "weather_delay", "nas_delay",
        "security_delay", "late_aircraft_delay"
    ]

    same_carrier = df["carrier"] == carrier
    same_carrier_and_airport = same_carrier & (df["airport"] == airport)

    # Most specific slice first; take the first one that selects any row.
    for selector in (same_carrier_and_airport, same_carrier):
        if selector.any():
            return df.loc[selector, component_cols].mean()

    # Nothing matched the carrier at all: use every row.
    return df[component_cols].mean()

In [7]:
# 6. Prediction Function

def predict_delay(
    model,
    feature_order: list[str],
    date_choice: dt.date,
    airline: str,
    airport: str,
    carrier_d: float,
    weather_d: float,
    nas_d: float,
    security_d: float,
    late_d: float
) -> float:
    """
    Build a one-row feature vector from the inputs and return the model's
    predicted departure delay (in minutes).

    Parameters
    ----------
    model
        Fitted estimator with a ``predict`` method; it is called with a
        one-row DataFrame whose columns are exactly ``feature_order``.
    feature_order : list[str]
        Feature column names in the order used for training.
    date_choice : dt.date
        Only its day of year is used as a feature.
    airline, airport : str
        Select the ``airline_<airline>`` and ``airport_<airport>`` one-hot
        columns.
    carrier_d, weather_d, nas_d, security_d, late_d : float
        Values for the five delay-component features.

    Returns
    -------
    float
        The first (only) value returned by ``model.predict``.

    Warns
    -----
    UserWarning
        If ``airline`` or ``airport`` has no matching column in
        ``feature_order``.  The prediction is still returned, with every
        one-hot column of that category left at 0.
    """
    feat = dict.fromkeys(feature_order, 0)

    for prefix, value in (("airline", airline), ("airport", airport)):
        column = f"{prefix}_{value}"
        if column in feat:
            feat[column] = 1
        else:
            # Previously this case was dropped silently by the column
            # selection below; make it visible without changing the result.
            warnings.warn(
                f"Unknown {prefix} {value!r}: the model has no '{column}' "
                f"feature, so all {prefix} indicator columns stay at 0.",
                UserWarning,
                stacklevel=2,
            )

    feat.update({
        "day_of_year": pd.Timestamp(date_choice).dayofyear,
        "carrier_delay": carrier_d,
        "weather_delay": weather_d,
        "nas_delay": nas_d,
        "security_delay": security_d,
        "late_aircraft_delay": late_d,
    })

    # Selecting by feature_order fixes the column order and discards any key
    # the model was not trained on.
    X_pred = pd.DataFrame([feat])[feature_order]
    return float(model.predict(X_pred)[0])

In [8]:
# 7. Load data and train
# Reads every CSV matching CSV_GLOB_PATTERN (FileNotFoundError if there are
# none) and fits the regression once.  `data`, `model` and `FEATURE_ORDER`
# are reused by the example cell that follows.

data = load_and_merge()
model, FEATURE_ORDER = train_model(data)
print(f"Loaded {len(data):,} rows; model ready.")

Loaded 153,702 rows; model ready.


In [9]:
# 8. Example Usage
# End-to-end demo: pick an airline, airport and date, look up typical delay
# components for that pair, and feed them to the trained model.

# Choose an airline & airport:
airline = "AA"     # carrier code as it appears in the `carrier` column, e.g. "AA", "B6", "DL", ...
airport = "JFK"    # three-letter code taken from the CSV file names

# Choose a date (can be outside 2024): only its day of year is used as a
# feature.  Using today's date means the printed output depends on when the
# cell is run.
date_choice = dt.date.today()

# Get defaults: historical mean of each delay component for this
# airline/airport, falling back to the carrier-wide and then global mean.
defaults = typical_components(data, airline, airport)
print("Typical component delays (mins):")
print(defaults.to_dict())

# Predict, passing the typical components as the five delay inputs:
pred = predict_delay(
    model, FEATURE_ORDER, date_choice, airline, airport,
    defaults["carrier_delay"],
    defaults["weather_delay"],
    defaults["nas_delay"],
    defaults["security_delay"],
    defaults["late_aircraft_delay"]
)
print(f"\nPredicted departure delay for {airline} at {airport} on {date_choice}: {pred:.1f} minutes")

Typical component delays (mins):
{'carrier_delay': 4.636238279095422, 'weather_delay': 0.8277027027027027, 'nas_delay': 3.478350799779371, 'security_delay': 0.02840595697738555, 'late_aircraft_delay': 5.967388306674021}

Predicted departure delay for AA at JFK on 2025-05-06: 12.5 minutes
