In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, roc_auc_score

In [2]:
# Read the CSV that carries the `cluster` column used by the per-cluster models,
# then preview the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the file
# was written with its index included; confirm, and consider `index_col=0`.
df = pd.read_csv(filepath_or_buffer="data_with_clusters.csv")
df.head(n=5)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,prev_real_delay,FL_DATE,origin_flights_day,origin_bucket,dest_flights_day,destination_bucket,distance_bucket,airline_bucket,HOUR,cluster
0,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,2024-01-01,67,1,241,2,2,1,7,0
1,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,2024-01-01,67,1,241,2,2,1,18,0
2,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10800,...,0.0,2024-01-01,67,1,90,1,3,1,14,1
3,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10821,...,0.0,2024-01-01,67,1,265,2,4,1,15,0
4,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",11259,...,0.0,2024-01-01,67,1,214,2,2,1,5,0


In [3]:
# Column names shared by every model in this notebook.
target_reg = "DEP_DELAY_NEW"  # continuous delay target (regression)
target_clf = "DEP_DEL15"      # binary delay target (classification)
cluster_col = "cluster"       # cluster label used to split the data per model

# Predictors fed to both the regression and the classification models.
feature_cols = [
    "airline_bucket",
    "origin_bucket",
    "destination_bucket",
    "lagged_delay_flag",
    "prev_real_delay",
]

# Design matrix and the two target vectors, all row-aligned with `df`.
X = df.loc[:, feature_cols]
y_reg = df.loc[:, target_reg]
y_clf = df.loc[:, target_clf]

In [4]:
# Global baseline (all clusters pooled): hold out 20% of the rows, fit a linear
# regression on the remaining 80%, and score it on the held-out rows.
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X,
    y_reg,
    random_state=42,  # fixed seed so the split is repeatable
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
lin_global = LinearRegression().fit(X_train_r, y_train_r)
y_pred_r = lin_global.predict(X_test_r)

print("Global Linear Regression")
print(" MSE:", mean_squared_error(y_true=y_test_r, y_pred=y_pred_r))
print(" R squared :", r2_score(y_true=y_test_r, y_pred=y_pred_r))

Global Linear Regression
 MSE: 3082.098596517904
 R squared : 0.004216522171108683


In [5]:
# Global baseline classifier (all clusters pooled): stratified 80/20 split so the
# class proportions of the binary target are kept in both parts, then a logistic
# regression fit on the training part.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X,
    y_clf,
    stratify=y_clf,
    random_state=42,  # fixed seed so the split is repeatable
    test_size=0.2,
)

# max_iter is raised to 1000 to give the solver more iterations to converge.
log_global = LogisticRegression(max_iter=1000).fit(X_train_c, y_train_c)

# Column 1 of predict_proba is the probability of the second entry in
# `log_global.classes_` (the positive label when the target is coded 0/1).
y_proba_c = log_global.predict_proba(X_test_c)[:, 1]
y_pred_c = log_global.predict(X_test_c)

print("Global Logistic Regression")
print(" Accuracy:", accuracy_score(y_true=y_test_c, y_pred=y_pred_c))
print(" ROC AUC:", roc_auc_score(y_true=y_test_c, y_score=y_proba_c))


Global Logistic Regression
 Accuracy: 0.7959405326111487
 ROC AUC: 0.5679276695281481


In [None]:
# Per-cluster models: for every value of `cluster_col`, fit one linear regression
# (continuous target) and one logistic regression (binary target) on an 80/20
# split of that cluster's rows, print held-out metrics, and keep the fitted
# estimators in `cluster_models[<kind>][<cluster id>]`.

# A cluster is modelled only when it has MORE rows than this.
# NOTE(review): 6 looks chosen so a 20% hold-out has at least two rows; confirm.
MIN_CLUSTER_ROWS = 6
# A stratified split raises ValueError when any class has fewer than two rows,
# so "at least two distinct classes" alone is not a sufficient guard.
MIN_ROWS_PER_CLASS = 2

cluster_models = {"linear": {}, "logistic": {}}

for c_id, df_c in df.groupby(cluster_col):
    X_c = df_c[feature_cols]
    y_reg_c = df_c[target_reg]
    y_clf_c = df_c[target_clf]
    enough_rows = len(df_c) > MIN_CLUSTER_ROWS

    print(f"\n=== Cluster {c_id} (n={len(df_c)}) ===")

    # --- linear regression per cluster ---
    if enough_rows:
        Xtr_r, Xte_r, ytr_r, yte_r = train_test_split(
            X_c, y_reg_c, test_size=0.2, random_state=42
        )
        lin = LinearRegression().fit(Xtr_r, ytr_r)
        yhat_r = lin.predict(Xte_r)
        print("  Linear: MSE =", mean_squared_error(yte_r, yhat_r),
              "R² =", r2_score(yte_r, yhat_r))
        cluster_models["linear"][c_id] = lin
    else:
        print("  Linear: not enough data")

    # --- logistic regression per cluster ---
    # value_counts() ignores missing labels, like the nunique() check it replaces.
    class_counts = y_clf_c.value_counts()
    can_stratify = (
        len(class_counts) >= 2 and class_counts.min() >= MIN_ROWS_PER_CLASS
    )
    if enough_rows and can_stratify:
        Xtr_c, Xte_c, ytr_c, yte_c = train_test_split(
            X_c, y_clf_c, test_size=0.2, random_state=42, stratify=y_clf_c
        )
        log = LogisticRegression(max_iter=1000).fit(Xtr_c, ytr_c)
        yhat_c = log.predict(Xte_c)
        yprob_c = log.predict_proba(Xte_c)[:, 1]
        # With a very rare class the held-out fold can end up with a single
        # class, where ROC AUC is undefined; report NaN instead of failing.
        if yte_c.nunique() >= 2:
            auc_c = roc_auc_score(yte_c, yprob_c)
        else:
            auc_c = float("nan")
        print("  Logistic: Acc =", accuracy_score(yte_c, yhat_c),
              "AUC =", auc_c)
        cluster_models["logistic"][c_id] = log
    else:
        print("  Logistic: not enough class variation or data")



=== Cluster 0 (n=3087770) ===
  Linear: MSE = 3774.9554625507126 R² = 0.002575644063299909
  Logistic: Acc = 0.8140745586620766 AUC = 0.561739226505296

=== Cluster 1 (n=3032718) ===
  Linear: MSE = 2347.785622170519 R² = 0.006450296921100551
  Logistic: Acc = 0.7882132211348228 AUC = 0.5663139947771257

=== Cluster 2 (n=1270339) ===
  Linear: MSE = 2815.1981793003256 R² = 0.006082830195005884
  Logistic: Acc = 0.7749263976573201 AUC = 0.5728645383093552

=== Cluster 3 (n=53253) ===
  Linear: MSE = 3670.0751314487993 R² = 0.005974246240059489
  Logistic: Acc = 0.6872594122617595 AUC = 0.5500081611157108


In [None]:
def fit_global_models(df, feature_cols, target_reg, target_clf,
                      test_size=0.2, random_state=42):
    """Fit the pooled (all-cluster) linear and logistic regression models.

    The models are fit on the training part of a train/test split of ``df``
    rather than read from notebook globals, so the function works on a fresh
    kernel and on any frame passed in. The classification split is stratified
    on the binary target.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_cols``, ``target_reg`` and ``target_clf``.
    feature_cols : list of str
        Names of the predictor columns.
    target_reg : str
        Name of the continuous target column.
    target_clf : str
        Name of the binary target column.
    test_size : float, default 0.2
        Fraction of rows held out (not used for fitting) in each split.
    random_state : int, default 42
        Seed for the splits, so repeated calls give the same models.

    Returns
    -------
    dict
        ``{"linear": LinearRegression, "logistic": LogisticRegression}``,
        both already fitted.
    """
    X = df[feature_cols]
    y_reg = df[target_reg]
    y_clf = df[target_clf]

    # Only the training parts are needed to fit; the held-out parts are dropped.
    X_train_r, _, y_train_r, _ = train_test_split(
        X, y_reg, test_size=test_size, random_state=random_state
    )
    lin_global = LinearRegression().fit(X_train_r, y_train_r)

    X_train_c, _, y_train_c, _ = train_test_split(
        X, y_clf, test_size=test_size, random_state=random_state, stratify=y_clf
    )
    log_global = LogisticRegression(max_iter=1000).fit(X_train_c, y_train_c)

    return {"linear": lin_global, "logistic": log_global}

def fit_cluster_models(df, feature_cols, target_reg, target_clf, cluster_col="cluster",
                       test_size=0.2, random_state=42, min_rows=6):
    """Fit one linear and one logistic regression model per cluster.

    Each cluster's rows are split train/test and the models are fit on the
    training part. The result is built locally from ``df`` rather than read
    from a notebook global, so the function works on a fresh kernel.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_cols``, both targets and ``cluster_col``.
    feature_cols : list of str
        Names of the predictor columns.
    target_reg : str
        Name of the continuous target column.
    target_clf : str
        Name of the binary target column.
    cluster_col : str, default "cluster"
        Column whose distinct values define the clusters.
    test_size : float, default 0.2
        Fraction of each cluster's rows held out (not used for fitting).
    random_state : int, default 42
        Seed for the splits, so repeated calls give the same models.
    min_rows : int, default 6
        Clusters with this many rows or fewer are skipped entirely.

    Returns
    -------
    dict
        ``{"linear": {cluster_id: model}, "logistic": {cluster_id: model}}``.
        A cluster is absent from ``"logistic"`` when its binary target has
        fewer than two classes or a class with fewer than two rows, because a
        stratified split is impossible in that case.
    """
    cluster_models = {"linear": {}, "logistic": {}}

    for c_id, df_c in df.groupby(cluster_col):
        if len(df_c) <= min_rows:
            continue  # too few rows to split and fit anything

        X_c = df_c[feature_cols]

        # --- linear regression for this cluster ---
        X_train_r, _, y_train_r, _ = train_test_split(
            X_c, df_c[target_reg], test_size=test_size, random_state=random_state
        )
        cluster_models["linear"][c_id] = LinearRegression().fit(X_train_r, y_train_r)

        # --- logistic regression for this cluster ---
        y_clf_c = df_c[target_clf]
        class_counts = y_clf_c.value_counts()
        # Stratified splitting needs at least two rows of every class.
        if len(class_counts) < 2 or class_counts.min() < 2:
            continue
        X_train_c, _, y_train_c, _ = train_test_split(
            X_c, y_clf_c, test_size=test_size, random_state=random_state,
            stratify=y_clf_c,
        )
        cluster_models["logistic"][c_id] = LogisticRegression(max_iter=1000).fit(
            X_train_c, y_train_c
        )

    return cluster_models
