In [10]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LassoCV, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, r2_score, mean_squared_error
from sklearn.cluster import KMeans

In [3]:
DATA_PATH = "data_with_clusters.csv"

FEATURE_COLS = [
    "airline_bucket",
    "origin_bucket",
    "destination_bucket",
    "lagged_delay_flag",
    "prev_real_delay",
]

TARGET_CLF = "DEP_DEL15" #binary departure delay indicator
TARGET_REG = "DEP_DELAY_NEW" #continuous departure delay (min)
CLUSTER_COL = "cluster"

df = pd.read_csv(DATA_PATH)

# keep only needed columns, drop rows with missing in features/targets
df = df.dropna(subset=FEATURE_COLS + [TARGET_CLF, TARGET_REG])

X_clf = df[FEATURE_COLS]
y_clf = df[TARGET_CLF].astype(int)

X_reg = df[FEATURE_COLS]
y_reg = df[TARGET_REG].astype(float)

In [8]:
def make_models():
    return {
        "LR_L2": LogisticRegression(
            random_state=0, solver="liblinear", max_iter=200
        ),
        "LR_L1": LogisticRegression(
            random_state=0,
            penalty="l1",
            solver="liblinear",
            class_weight="balanced",
            max_iter=500,
        ),
        "CART": DecisionTreeClassifier(
            random_state=0, class_weight="balanced"
        ),
        "RF": RandomForestClassifier(
            random_state=0, class_weight="balanced"
        ),
    }


def init_all_models():
    model_names = ("LR_L2", "LR_L1", "CART", "RF")
    techniques = ("Baseline", "Scaling")  # subset of full Lab 6 list

    idx = pd.MultiIndex.from_product(
        [model_names, techniques],
        names=("model", "technique"),
    )
    all_models = pd.DataFrame(
        index=idx,
        columns=["Precision", "Recall", "Score", "Model"],
    )
    all_models[["Precision", "Recall", "Score"]] = all_models[
        ["Precision", "Recall", "Score"]
    ].astype(float)
    return all_models


def standardize_data(X_train, X_out):
    scaler = StandardScaler()
    scaler.fit(X_train)

    Xtr = pd.DataFrame(
        scaler.transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    Xout = pd.DataFrame(
        scaler.transform(X_out),
        index=X_out.index,
        columns=X_out.columns,
    )
    return Xtr, Xout, scaler


def fit_and_score_model(all_models, stage_name,
                        X_train, X_out, y_train, y_out):
    models_dict = make_models()

    for model_name, model in models_dict.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_out)

        p = precision_score(y_out, y_pred)
        r = recall_score(y_out, y_pred)
        s = 0.5 * (p + r)

        idx = (model_name, stage_name)
        
        all_models.at[idx, "Precision"] = p
        all_models.at[idx, "Recall"] = r
        all_models.at[idx, "Score"] = s
        all_models.at[idx, "Model"] = model

    return all_models


def compare_models(all_models, technique_name="Scaling"):
    diffs = (
        all_models.xs(technique_name, level="technique").Score.values
        - all_models.xs("Baseline", level="technique").Score.values
    )
    print(
        f"{technique_name}: mean ΔScore={diffs.mean():.3f}, "
        f"max ΔScore={diffs.max():.3f}"
    )


# Model Training Functions

In [9]:
def train_global_classification_models(X, y, test_size=0.2, random_state=0):
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    all_models = init_all_models()

    # Baseline
    all_models = fit_and_score_model(
        all_models, "Baseline", Xtr, Xte, ytr, yte
    )

    # Scaling
    Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
    all_models = fit_and_score_model(
        all_models, "Scaling", Xtr_s, Xte_s, ytr, yte
    )

    compare_models(all_models, "Scaling")

    best_row = all_models.sort_values("Score").iloc[-1]
    best_model = best_row["Model"]

    return {
        "all_models": all_models,
        "best_model": best_model,
        "scaler": scaler,
        "train_split": (Xtr, Xte, ytr, yte),
    }


global_clf_results = train_global_classification_models(X_clf, y_clf)
global_clf_results["all_models"]


Scaling: mean ΔScore=-0.000, max ΔScore=0.000


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,Score,Model
model,technique,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR_L2,Baseline,0.311712,0.002857,0.157284,"LogisticRegression(max_iter=200, random_state=..."
LR_L2,Scaling,0.311847,0.002861,0.157354,"LogisticRegression(max_iter=200, random_state=..."
LR_L1,Baseline,0.239996,0.513178,0.376587,"LogisticRegression(class_weight='balanced', ma..."
LR_L1,Scaling,0.239987,0.512857,0.376422,"LogisticRegression(class_weight='balanced', ma..."
CART,Baseline,0.259102,0.444737,0.351919,DecisionTreeClassifier(class_weight='balanced'...
CART,Scaling,0.259099,0.444734,0.351916,DecisionTreeClassifier(class_weight='balanced'...
RF,Baseline,0.258949,0.442471,0.35071,"(DecisionTreeClassifier(max_features='sqrt', r..."
RF,Scaling,0.258941,0.442458,0.350699,"(DecisionTreeClassifier(max_features='sqrt', r..."


In [11]:
def train_cluster_classification_models(df,
                                        feature_cols,
                                        target_col,
                                        cluster_col="cluster",
                                        test_size=0.2,
                                        random_state=0):
    cluster_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        y_c = df_c[target_col].astype(int)
        if y_c.nunique() < 2 or len(df_c) < 40:
            continue

        X_c = df_c[feature_cols]

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
            stratify=y_c,
        )

        all_models = init_all_models()

        all_models = fit_and_score_model(
            all_models, "Baseline", Xtr, Xte, ytr, yte
        )

        Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
        all_models = fit_and_score_model(
            all_models, "Scaling", Xtr_s, Xte_s, ytr, yte
        )

        best_row = all_models.sort_values("Score").iloc[-1]
        best_model = best_row["Model"]

        cluster_results[clust_id] = {
            "all_models": all_models,
            "best_model": best_model,
            "scaler": scaler,
        }

    return cluster_results


cluster_clf_results = train_cluster_classification_models(
    df,
    FEATURE_COLS,
    TARGET_CLF,
    cluster_col=CLUSTER_COL,
)


In [12]:
def train_global_regression_models(X, y, test_size=0.2, random_state=0):
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Plain linear regression (unscaled)
    lin = LinearRegression()
    lin.fit(Xtr, ytr)
    yhat_lin = lin.predict(Xte)
    lin_r2 = r2_score(yte, yhat_lin)
    lin_mse = mean_squared_error(yte, yhat_lin)

    # LassoCV (scaled)
    Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
    lasso = LassoCV(cv=5, random_state=random_state)
    lasso.fit(Xtr_s, ytr)
    yhat_lasso = lasso.predict(Xte_s)
    lasso_r2 = r2_score(yte, yhat_lasso)
    lasso_mse = mean_squared_error(yte, yhat_lasso)

    return {
        "linear": {
            "model": lin,
            "r2": lin_r2,
            "mse": lin_mse,
        },
        "lasso": {
            "model": lasso,
            "scaler": scaler,
            "r2": lasso_r2,
            "mse": lasso_mse,
        },
        "train_split": (Xtr, Xte, ytr, yte),
    }


global_reg_results = train_global_regression_models(X_reg, y_reg)
global_reg_results


{'linear': {'model': LinearRegression(),
  'r2': 0.004382212091354809,
  'mse': 3031.940064178128},
 'lasso': {'model': LassoCV(cv=5, random_state=0),
  'scaler': StandardScaler(),
  'r2': 0.004382149319585027,
  'mse': 3031.9402553360665},
 'train_split': (         airline_bucket  origin_bucket  destination_bucket  lagged_delay_flag  \
  2840684               1              2                   3                  0   
  2506158               1              2                   4                  0   
  1795334               1              4                   2                  0   
  930487                1              3                   1                  0   
  749653                1              4                   1                  0   
  ...                 ...            ...                 ...                ...   
  2249467               1              1                   2                  0   
  5157699               1              2                   4                  0 

In [13]:
def train_cluster_regression_models(df,
                                    feature_cols,
                                    target_col,
                                    cluster_col="cluster",
                                    test_size=0.2,
                                    random_state=0,
                                    min_rows=40):
    cluster_reg_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        if len(df_c) < min_rows:
            continue

        X_c = df_c[feature_cols]
        y_c = df_c[target_col].astype(float)

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
        )

        # linear
        lin = LinearRegression()
        lin.fit(Xtr, ytr)
        yhat_lin = lin.predict(Xte)
        lin_r2 = r2_score(yte, yhat_lin)
        lin_mse = mean_squared_error(yte, yhat_lin)

        # lasso (scaled)
        Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
        lasso = LassoCV(cv=5, random_state=random_state)
        lasso.fit(Xtr_s, ytr)
        yhat_lasso = lasso.predict(Xte_s)
        lasso_r2 = r2_score(yte, yhat_lasso)
        lasso_mse = mean_squared_error(yte, yhat_lasso)

        cluster_reg_results[clust_id] = {
            "linear": {
                "model": lin,
                "r2": lin_r2,
                "mse": lin_mse,
            },
            "lasso": {
                "model": lasso,
                "scaler": scaler,
                "r2": lasso_r2,
                "mse": lasso_mse,
            },
        }

    return cluster_reg_results


cluster_reg_results = train_cluster_regression_models(
    df,
    FEATURE_COLS,
    TARGET_REG,
    cluster_col=CLUSTER_COL,
)


# Cluster Experiments

In [None]:
def assign_clusters(df, k, cluster_features, cluster_col_name):
    """
    df: DataFrame with all your data
    k: number of clusters
    cluster_features: list of columns to cluster on
    cluster_col_name: name of the new cluster column

    returns: df with new column cluster_col_name
    """
    X_cluster = df[cluster_features].copy()

    kmeans = KMeans(n_clusters=k, random_state=0)
    labels = kmeans.fit_predict(X_cluster)

    df = df.copy()
    df[cluster_col_name] = labels
    return df

In [None]:
CLUSTER_EXPERIMENTS = [
    {
        "name": "k3_all_feats",
        "k": 3,
        "cluster_features": FEATURE_COLS,      # use all current features
    },
    {
        "name": "k5_all_feats",
        "k": 5,
        "cluster_features": FEATURE_COLS,
    },
    {
        "name": "k5_subset_origin_dest",
        "k": 5,
        "cluster_features": ["origin_bucket", "destination_bucket"],
    },
    # add more as needed
]

In [None]:
all_cluster_clf_results = {}
all_cluster_reg_results = {}

for cfg in CLUSTER_EXPERIMENTS:
    exp_name = cfg["name"]
    k = cfg["k"]
    cluster_features = cfg["cluster_features"]

    # 1) assign cluster labels for this experiment
    cluster_col = f"cluster_{exp_name}"
    df_with_clusters = assign_clusters(df, k, cluster_features, cluster_col)

    # 2) train cluster-specific classification models
    cluster_clf_results = train_cluster_classification_models(
        df_with_clusters,
        FEATURE_COLS,       # you can choose to change this too, if needed
        TARGET_CLF,
        cluster_col=cluster_col,
    )

    # 3) train cluster-specific regression models
    cluster_reg_results = train_cluster_regression_models(
        df_with_clusters,
        FEATURE_COLS,
        TARGET_REG,
        cluster_col=cluster_col,
    )

    # 4) store results keyed by experiment name
    all_cluster_clf_results[exp_name] = cluster_clf_results
    all_cluster_reg_results[exp_name] = cluster_reg_results


# Gathering Summary Results

In [None]:
summary_rows = []
#global classification
global_df = global_clf_results["all_models"].copy()
global_df = global_df.reset_index()
for _, row in global_df.iterrows():
    summary_rows.append({
        "Level": "Global",
        "Cluster": "-",
        "ClusterExp": "-",  
        "Task": "Classification",
        "Model": row["model"],
        "Technique": row["technique"],
        "Precision": row["Precision"],
        "Recall": row["Recall"],
        "Score": row["Score"],
        "R2": None,
        "MSE": None
    })


# ==========================================================
# CLUSTER CLASSIFICATION (MULTIPLE EXPERIMENTS)
# ==========================================================
for exp_name, cluster_clf_results in all_cluster_clf_results.items():
    for clust_id, res in cluster_clf_results.items():

        best_row = res["all_models"].sort_values("Score").iloc[-1]

        summary_rows.append({
            "Level": "Cluster",
            "Cluster": clust_id,
            "ClusterExp": exp_name,     # <--- TRACK EXPERIMENT
            "Task": "Classification",
            "Model": best_row.name[0],  # model name
            "Technique": best_row.name[1],
            "Precision": best_row["Precision"],
            "Recall": best_row["Recall"],
            "Score": best_row["Score"],
            "R2": None,
            "MSE": None
        })

# --------------------------
# GLOBAL REGRESSION
# --------------------------
# linear
summary_rows.append({
    "Level": "Global",
    "Cluster": "-",
    "Task": "Regression",
    "Model": "LinearRegression",
    "Technique": "Baseline",
    "Precision": None,
    "Recall": None,
    "Score": None,
    "R2": global_reg_results["linear"]["r2"],
    "MSE": global_reg_results["linear"]["mse"]
})

# lasso
summary_rows.append({
    "Level": "Global",
    "Cluster": "-",
    "Task": "Regression",
    "Model": "LassoCV",
    "Technique": "Scaled",
    "Precision": None,
    "Recall": None,
    "Score": None,
    "R2": global_reg_results["lasso"]["r2"],
    "MSE": global_reg_results["lasso"]["mse"]
})


# ==========================================================
# CLUSTER REGRESSION (MULTIPLE EXPERIMENTS)
# ==========================================================
for exp_name, cluster_reg_results in all_cluster_reg_results.items():
    for clust_id, res in cluster_reg_results.items():

        # linear
        summary_rows.append({
            "Level": "Cluster",
            "Cluster": clust_id,
            "ClusterExp": exp_name, 
            "Task": "Regression",
            "Model": "LinearRegression",
            "Technique": "Baseline",
            "Precision": None,
            "Recall": None,
            "Score": None,
            "R2": res["linear"]["r2"],
            "MSE": res["linear"]["mse"]
        })

        # lasso
        summary_rows.append({
            "Level": "Cluster",
            "Cluster": clust_id,
            "ClusterExp": exp_name,
            "Task": "Regression",
            "Model": "LassoCV",
            "Technique": "Scaled",
            "Precision": None,
            "Recall": None,
            "Score": None,
            "R2": res["lasso"]["r2"],
            "MSE": res["lasso"]["mse"]
        })


# --------------------------
# BUILD FINAL SUMMARY TABLE
# --------------------------
summary_table = pd.DataFrame(summary_rows)

# sorting for readability
summary_table = summary_table.sort_values(
    by=["Task", "ClusterExp", "Level", "Cluster", "Model"]
).reset_index(drop=True)

summary_table