In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
import joblib

# Data cleansing: fill remaining nulls with zero (in-place copy to preserve original if needed)
df_clean = df.copy()
if 'DepDel15' in df_clean.columns:
    df_clean['DepDel15'] = df_clean['DepDel15'].fillna(0)

# Feature engineering
df_clean['DepHour'] = df_clean['CRSDepTime'] // 100  # convert HHMM to hour bucket

# Define target (arrival delay > 15 mins already encoded in ArrDel15)
target_col = 'ArrDel15'
feature_cols = ['Month', 'DayOfWeek', 'Carrier', 'OriginAirportID', 'DestAirportID', 'DepHour']

X = df_clean[feature_cols]
y = df_clean[target_col]


categorical = ['Carrier']
numeric = ['Month', 'DayOfWeek', 'OriginAirportID', 'DestAirportID', 'DepHour']

preproc = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric),
    ]
)

delay_model = Pipeline(steps=[
    ("preproc", preproc),
    ("clf", LogisticRegression(max_iter=500, n_jobs=None if hasattr(LogisticRegression, "n_jobs") else None, class_weight='balanced'))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

delay_model.fit(X_train, y_train)
y_proba = delay_model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_proba)
print(f"Validation AUC: {auc:.4f}")

# Persistence
model_path = data_dir / "delay_model.joblib"
joblib.dump(delay_model, model_path)
print(f"Model saved to {model_path}")

def predict_delay_probability(model, month, day_of_week, carrier, origin_airport_id, dest_airport_id, crs_dep_time):
    dep_hour = crs_dep_time // 100
    sample = pd.DataFrame([{
        'Month': month,
        'DayOfWeek': day_of_week,
        'Carrier': carrier,
        'OriginAirportID': origin_airport_id,
        'DestAirportID': dest_airport_id,
        'DepHour': dep_hour
    }])
    return float(model.predict_proba(sample)[0, 1])

# Example prediction using first row
example_row = df.iloc[0]
example_prob = predict_delay_probability(
    delay_model,
    example_row['Month'],
    example_row['DayOfWeek'],
    example_row['Carrier'],
    example_row['OriginAirportID'],
    example_row['DestAirportID'],
    example_row['CRSDepTime']
)
print(f"Example delay >15 min probability: {example_prob:.3f}")

# Airport lookup file (ID -> Name)
origin_lookup = df[['OriginAirportID', 'OriginAirportName']].rename(
    columns={'OriginAirportID': 'AirportID', 'OriginAirportName': 'AirportName'}
)
dest_lookup = df[['DestAirportID', 'DestAirportName']].rename(
    columns={'DestAirportID': 'AirportID', 'DestAirportName': 'AirportName'}
)
airports = (
    pd.concat([origin_lookup, dest_lookup], axis=0)
      .drop_duplicates()
      .sort_values('AirportID')
      .reset_index(drop=True)
)

airports_path = data_dir / "airports.csv"
airports.to_csv(airports_path, index=False)
print(f"Airport lookup saved to {airports_path} (rows={len(airports)})")

Validation AUC: 0.6310
Model saved to data/delay_model.joblib
Example delay >15 min probability: 0.500
Airport lookup saved to data/airports.csv (rows=70)


In [3]:
# Data quality & null analysis (original df still preserved; df_clean already used for modeling)
null_counts = df.isna().sum()
null_pct = (null_counts / len(df)).round(4)
null_report = (
    pd.DataFrame({"null_count": null_counts, "null_pct": null_pct})
      .sort_values("null_count", ascending=False)
)
print("Null value summary (original df):")
print(null_report.head(10))

# Ensure any remaining numeric nulls (if future columns added) are zero-filled in a fresh working copy
numeric_with_nulls = [c for c in df.select_dtypes(include=["number"]).columns if df[c].isna().any()]
if numeric_with_nulls:
    df_zero_filled = df.copy()
    df_zero_filled[numeric_with_nulls] = df_zero_filled[numeric_with_nulls].fillna(0)
    print(f"Filled numeric nulls with 0 for columns: {numeric_with_nulls}")
else:
    df_zero_filled = df  # no change needed
    print("No numeric nulls requiring fill (df_zero_filled = df).")

# Baseline historical probability model for (OriginAirportID, DestAirportID, DayOfWeek)
# Laplace smoothing to avoid zero/one extremes
alpha = 1.0
grp = (
    df_clean
      .groupby(["OriginAirportID", "DestAirportID", "DayOfWeek"])["ArrDel15"]
      .agg(["sum", "count"])
      .reset_index()
)
grp["smoothed_delay_prob"] = (grp["sum"] + alpha) / (grp["count"] + 2 * alpha)

global_delay_prob = float((df_clean["ArrDel15"].sum() + alpha) / (len(df_clean) + 2 * alpha))

pair_delay_rates = grp.rename(columns={"sum": "delay_events", "count": "n"})[
    ["OriginAirportID", "DestAirportID", "DayOfWeek", "delay_events", "n", "smoothed_delay_prob"]
]

pair_rates_path = data_dir / "pair_delay_rates.csv"
pair_delay_rates.to_csv(pair_rates_path, index=False)
print(f"Pair delay rate table saved to {pair_rates_path} (rows={len(pair_delay_rates)})")
print(f"Global (smoothed) delay probability: {global_delay_prob:.4f}")

def pair_delay_probability(origin_airport_id, dest_airport_id, day_of_week):
    """Return historical (smoothed) probability for a route + day-of-week, fallback to global."""
    match = pair_delay_rates[
        (pair_delay_rates.OriginAirportID == origin_airport_id) &
        (pair_delay_rates.DestAirportID == dest_airport_id) &
        (pair_delay_rates.DayOfWeek == day_of_week)
    ]
    if not match.empty:
        return float(match.iloc[0].smoothed_delay_prob)
    return global_delay_prob

def predict_combined_probability(month, day_of_week, carrier, origin_airport_id, dest_airport_id, crs_dep_time):
    """Blend logistic model probability with historical pair/day probability (simple average)."""
    dep_hour = crs_dep_time // 100
    features = pd.DataFrame([{
        "Month": month,
        "DayOfWeek": day_of_week,
        "Carrier": carrier,
        "OriginAirportID": origin_airport_id,
        "DestAirportID": dest_airport_id,
        "DepHour": dep_hour
    }])
    logit_prob = float(delay_model.predict_proba(features)[0, 1])
    hist_prob = pair_delay_probability(origin_airport_id, dest_airport_id, day_of_week)
    combined = 0.5 * logit_prob + 0.5 * hist_prob
    return {
        "logistic_prob": logit_prob,
        "historical_pair_prob": hist_prob,
        "combined_prob": combined
    }

# Example combined prediction using example_row + its destination/day combo
combined_example = predict_combined_probability(
    example_row["Month"],
    example_row["DayOfWeek"],
    example_row["Carrier"],
    example_row["OriginAirportID"],
    example_row["DestAirportID"],
    example_row["CRSDepTime"]
)
print("Example combined probability components:", combined_example)

# Persist a combined asset bundle for external use
combined_assets = {
    "logistic_model": delay_model,
    "pair_delay_rates": pair_delay_rates,
    "global_delay_prob": global_delay_prob,
    "feature_columns": feature_cols
}
combined_model_path = data_dir / "delay_model_with_pair_rates.joblib"
joblib.dump(combined_assets, combined_model_path)
print(f"Combined model assets saved to {combined_model_path}")

Null value summary (original df):
                   null_count  null_pct
DepDel15                 2761    0.0102
Year                        0    0.0000
DayofMonth                  0    0.0000
Month                       0    0.0000
DayOfWeek                   0    0.0000
Carrier                     0    0.0000
OriginAirportName           0    0.0000
OriginAirportID             0    0.0000
OriginState                 0    0.0000
DestAirportID               0    0.0000
Filled numeric nulls with 0 for columns: ['DepDel15']
Pair delay rate table saved to data/pair_delay_rates.csv (rows=16773)
Global (smoothed) delay probability: 0.2159
Example combined probability components: {'logistic_prob': 0.5000900077607221, 'historical_pair_prob': 0.2631578947368421, 'combined_prob': 0.3816239512487821}
Combined model assets saved to data/delay_model_with_pair_rates.joblib
