In [7]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LassoCV, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, r2_score, mean_squared_error
from sklearn.cluster import KMeans

from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split


In [8]:
# Input CSV read below with pd.read_csv.
DATA_PATH = "data_with_clusters.csv"

# Columns used as model inputs by every classifier and regressor in this notebook.
FEATURE_COLS = [
    "MONTH",
    "HOUR",
    "origin_flights_day",
    "airline_bucket",
    "origin_bucket",
    "destination_bucket",
    "lagged_delay_flag",
    "prev_real_delay",
]

TARGET_CLF = "DEP_DEL15" #binary departure delay indicator
TARGET_REG = "DEP_DELAY_NEW" #continuous departure delay (min)
# Column whose values define the groups for the per-cluster training functions.
CLUSTER_COL = "cluster"

df = pd.read_csv(DATA_PATH)

# Drop rows with a missing value in any feature or target column.
# NOTE: no column selection happens here -- every other column is kept -- and
# `df` is rebound, so the unfiltered frame is not retained.
df = df.dropna(subset=FEATURE_COLS + [TARGET_CLF, TARGET_REG])

# Classification and regression share the same feature matrix; only the target differs.
X_clf = df[FEATURE_COLS]
y_clf = df[TARGET_CLF].astype(int)

X_reg = df[FEATURE_COLS]
y_reg = df[TARGET_REG].astype(float)

In [9]:
df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,prev_real_delay,FL_DATE,origin_flights_day,origin_bucket,dest_flights_day,destination_bucket,distance_bucket,airline_bucket,HOUR,cluster
0,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,2024-01-01,67,1,241,2,2,1,7,0
1,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,2024-01-01,67,1,241,2,2,1,18,0
2,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10800,...,0.0,2024-01-01,67,1,90,1,3,1,14,1
3,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10821,...,0.0,2024-01-01,67,1,265,2,4,1,15,0
4,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",11259,...,0.0,2024-01-01,67,1,214,2,2,1,5,0


In [10]:
def make_models():
    """Build a fresh, unfitted set of the four baseline classifiers.

    Returns:
        dict: short model name -> new scikit-learn estimator, in the order
        "LR_L2", "LR_L1", "CART", "RF". All estimators use random_state=0.
        Every model except "LR_L2" uses class_weight="balanced".
    """
    seed = 0
    weighting = "balanced"

    models = {}
    # L2-penalised logistic regression (the default penalty); no class weighting.
    models["LR_L2"] = LogisticRegression(
        random_state=seed, solver="liblinear", max_iter=200
    )
    # L1-penalised logistic regression with balanced class weights.
    models["LR_L1"] = LogisticRegression(
        random_state=seed,
        penalty="l1",
        solver="liblinear",
        class_weight=weighting,
        max_iter=500,
    )
    models["CART"] = DecisionTreeClassifier(
        random_state=seed, class_weight=weighting
    )
    models["RF"] = RandomForestClassifier(
        random_state=seed, class_weight=weighting, n_jobs=-1,
    )
    return models


def init_all_models(
    model_names=("LR_L2", "LR_L1", "CART", "RF"),
    techniques=("Baseline", "Scaling"),
):
    """Create the empty results table that fit_and_score_model fills in.

    Args:
        model_names: iterable of model labels for the first index level.
            Defaults to the four baseline model names.
        techniques: iterable of preprocessing labels for the second index
            level. Defaults to ("Baseline", "Scaling").

    Returns:
        pandas.DataFrame indexed by a (model, technique) MultiIndex built as
        the Cartesian product of the two arguments, with columns
        "Precision", "Recall", "Score" (float, initially NaN) and "Model"
        (object, initially NaN; later holds the fitted estimator).
    """
    idx = pd.MultiIndex.from_product(
        [list(model_names), list(techniques)],
        names=("model", "technique"),
    )
    all_models = pd.DataFrame(
        index=idx,
        columns=["Precision", "Recall", "Score", "Model"],
    )
    # An all-NaN frame is created with object columns; cast only the metric
    # columns so "Model" can still store arbitrary estimator objects.
    metric_dtypes = {"Precision": float, "Recall": float, "Score": float}
    return all_models.astype(metric_dtypes)


def standardize_data(X_train, X_out):
    """Standardize two DataFrames with a scaler fitted on the first only.

    Args:
        X_train: DataFrame used to fit the StandardScaler.
        X_out: DataFrame transformed with the statistics learned from X_train.

    Returns:
        (X_train_scaled, X_out_scaled, scaler): both frames keep the index and
        column labels of their inputs; `scaler` is the fitted StandardScaler.
    """
    scaler = StandardScaler().fit(X_train)

    def _rescale(frame):
        # transform() returns a bare array, so re-attach the original labels.
        return pd.DataFrame(
            scaler.transform(frame),
            index=frame.index,
            columns=frame.columns,
        )

    return _rescale(X_train), _rescale(X_out), scaler


def fit_and_score_model(all_models, stage_name,
                        X_train, X_out, y_train, y_out):
    """Fit every model from make_models() and record its metrics.

    Args:
        all_models: results table indexed by (model, technique); it is
            modified in place and also returned.
        stage_name: technique label identifying the rows to fill
            (e.g. "Baseline" or "Scaling").
        X_train, y_train: data the models are fitted on.
        X_out, y_out: data the models are evaluated on.

    Returns:
        The same `all_models` object, with "Precision", "Recall", "Score"
        (the mean of precision and recall) and "Model" (the fitted estimator)
        set for each (model, stage_name) row.
    """
    for name, estimator in make_models().items():
        predictions = estimator.fit(X_train, y_train).predict(X_out)

        precision = precision_score(y_out, predictions)
        recall = recall_score(y_out, predictions)

        row_key = (name, stage_name)
        cell_values = (
            ("Precision", precision),
            ("Recall", recall),
            ("Score", 0.5 * (precision + recall)),
            ("Model", estimator),
        )
        for column, value in cell_values:
            all_models.at[row_key, column] = value

    return all_models


def compare_models(all_models, technique_name="Scaling"):
    """Print how a technique's Score differs from the Baseline rows.

    Rows are paired positionally (not by model label), so the two technique
    slices must list the models in the same order.

    Args:
        all_models: results table with a "technique" index level and a
            "Score" column.
        technique_name: technique whose scores are compared to "Baseline".

    Returns:
        None. The mean and maximum score difference are printed.
    """
    scores = all_models["Score"]
    treated = scores.xs(technique_name, level="technique").to_numpy()
    reference = scores.xs("Baseline", level="technique").to_numpy()
    deltas = treated - reference

    message = (
        f"{technique_name}: mean ΔScore={deltas.mean():.3f}, "
        f"max ΔScore={deltas.max():.3f}"
    )
    print(message)


In [11]:
# Fractions of a training split kept by the subsampling helpers; used as the
# default `train_frac` / `lasso_frac` arguments of the training functions.
GLOBAL_CLF_TRAIN_FRAC = 0.25   # 25% of global clf train
CLUSTER_CLF_TRAIN_FRAC = 0.5    # up to 50% of each cluster's train
GLOBAL_REG_TRAIN_FRAC = 0.25   # 25% of global reg train
CLUSTER_REG_TRAIN_FRAC = 0.5    # up to 50% of each cluster's reg train
LASSO_TRAIN_FRAC = 0.2  # 20% just for LassoCV (more expensive)

MIN_SUBSAMPLE_SIZE = 10000    # no subsampling if the data has at most this many rows, or the subsample would have fewer

def stratified_subsample_xy_frac(X, y, frac=1.0, random_state=0):
    """Return a class-stratified subsample of (X, y) for classification.

    The inputs are returned unchanged when `frac >= 1.0`, when there are at
    most MIN_SUBSAMPLE_SIZE rows, or when the requested subsample would have
    fewer than MIN_SUBSAMPLE_SIZE rows.

    Args:
        X, y: features and labels of equal length.
        frac: fraction of rows to keep.
        random_state: seed passed to train_test_split.

    Returns:
        (X_sub, y_sub): int(len(X) * frac) rows drawn with stratification on
        y, or the original (X, y) in the cases listed above.
    """
    total = len(X)
    keep_everything = frac >= 1.0 or total <= MIN_SUBSAMPLE_SIZE

    if not keep_everything:
        wanted = int(total * frac)
        if wanted >= MIN_SUBSAMPLE_SIZE:
            # train_test_split returns [X_kept, X_rest, y_kept, y_rest];
            # only the "kept" halves are needed.
            pieces = train_test_split(
                X, y,
                train_size=wanted,
                stratify=y,
                random_state=random_state,
            )
            return pieces[0], pieces[2]

    return X, y


def random_subsample_xy_frac(X, y, frac=1.0, random_state=0):
    """Return a uniformly random subsample of (X, y) for regression.

    The inputs are returned unchanged when `frac >= 1.0`, when there are at
    most MIN_SUBSAMPLE_SIZE rows, or when the requested subsample would have
    fewer than MIN_SUBSAMPLE_SIZE rows.

    X and y are indexed independently, so a pandas X may be paired with an
    array y (or the reverse); previously the container type was checked on X
    only and `.iloc` was then applied to y as well.

    Args:
        X, y: features and targets of equal length; each is either a pandas
            DataFrame/Series or an object supporting integer-array indexing
            (e.g. a numpy array).
        frac: fraction of rows to keep.
        random_state: seed for numpy.random.RandomState.

    Returns:
        (X_sub, y_sub): the same int(len(X) * frac) row positions, drawn
        without replacement, taken from both X and y.
    """
    n = len(X)
    if frac >= 1.0 or n <= MIN_SUBSAMPLE_SIZE:
        return X, y

    n_sub = int(n * frac)
    if n_sub < MIN_SUBSAMPLE_SIZE:
        return X, y

    rng = np.random.RandomState(random_state)
    idx = rng.choice(n, size=n_sub, replace=False)

    def _take_rows(obj):
        # pandas objects need positional .iloc; arrays index directly.
        if isinstance(obj, (pd.DataFrame, pd.Series)):
            return obj.iloc[idx]
        return obj[idx]

    return _take_rows(X), _take_rows(y)


# Model Training Functions

In [12]:
def train_global_classification_models(
    X, y,
    test_size=0.2,
    random_state=0,
    train_frac=GLOBAL_CLF_TRAIN_FRAC,
):
    """Train and score the baseline classifiers on a subsampled training set.

    The data is split (stratified on y) into train and test, the training part
    is stratified-subsampled to `train_frac`, and every model from
    make_models() is fitted twice: on raw features ("Baseline") and on
    standardized features ("Scaling"). Evaluation always uses the full test
    split. A Baseline-vs-Scaling comparison is printed.

    Returns:
        dict with
        - "all_models": results table for both techniques,
        - "best_model": fitted estimator from the row with the highest Score,
        - "scaler": the fitted StandardScaler (returned even when the best
          model comes from a "Baseline" row, i.e. was trained on raw features),
        - "train_split": (X_train_subsampled, X_test, y_train_subsampled, y_test).
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    # Only the training part is reduced; the test split stays complete.
    X_fit, y_fit = stratified_subsample_xy_frac(
        X_fit, y_fit,
        frac=train_frac,
        random_state=random_state,
    )

    results = init_all_models()

    # Raw features.
    results = fit_and_score_model(
        results, "Baseline", X_fit, X_eval, y_fit, y_eval
    )

    # Standardized features; scaler statistics come from the training subsample.
    X_fit_std, X_eval_std, scaler = standardize_data(X_fit, X_eval)
    results = fit_and_score_model(
        results, "Scaling", X_fit_std, X_eval_std, y_fit, y_eval
    )

    compare_models(results, "Scaling")

    # Ascending sort, so the last row carries the highest Score.
    top_model = results.sort_values("Score").iloc[-1]["Model"]

    return dict(
        all_models=results,
        best_model=top_model,
        scaler=scaler,
        train_split=(X_fit, X_eval, y_fit, y_eval),
    )


global_clf_results = train_global_classification_models(X_clf, y_clf)
global_clf_results["all_models"]


Scaling: mean ΔScore=-0.000, max ΔScore=0.000


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,Score,Model
model,technique,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR_L2,Baseline,0.383997,0.003472,0.193734,"LogisticRegression(max_iter=200, random_state=..."
LR_L2,Scaling,0.382979,0.003448,0.193214,"LogisticRegression(max_iter=200, random_state=..."
LR_L1,Baseline,0.281996,0.608067,0.445032,"LogisticRegression(class_weight='balanced', ma..."
LR_L1,Scaling,0.282005,0.608064,0.445034,"LogisticRegression(class_weight='balanced', ma..."
CART,Baseline,0.264188,0.390509,0.327349,DecisionTreeClassifier(class_weight='balanced'...
CART,Scaling,0.264067,0.390331,0.327199,DecisionTreeClassifier(class_weight='balanced'...
RF,Baseline,0.283857,0.296324,0.290091,"(DecisionTreeClassifier(max_features='sqrt', r..."
RF,Scaling,0.283797,0.296304,0.290051,"(DecisionTreeClassifier(max_features='sqrt', r..."


In [13]:
def train_cluster_classification_models(
    df,
    feature_cols,
    target_col,
    cluster_col="cluster",
    test_size=0.2,
    random_state=0,
    train_frac=CLUSTER_CLF_TRAIN_FRAC,
):
    """Train and score the baseline classifiers separately inside each cluster.

    For every group of `df` sharing a value of `cluster_col`, the rows are
    split (stratified), the training part is stratified-subsampled to
    `train_frac`, and all models are fitted on raw ("Baseline") and
    standardized ("Scaling") features. Clusters with fewer than 40 rows or a
    single target class are skipped.

    Returns:
        dict: cluster id -> dict with "all_models", "best_model" (estimator of
        the highest-Score row), "scaler" (returned regardless of which
        technique the best model used) and "train_split"
        (X_train_subsampled, X_test, y_train_subsampled, y_test).
    """
    per_cluster = {}

    for cluster_id, members in df.groupby(cluster_col):
        labels = members[target_col].astype(int)
        has_enough_data = labels.nunique() >= 2 and len(members) >= 40
        if not has_enough_data:
            continue

        X_fit, X_eval, y_fit, y_eval = train_test_split(
            members[feature_cols], labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels,
        )
        # Subsample only the training part of this cluster.
        X_fit, y_fit = stratified_subsample_xy_frac(
            X_fit, y_fit,
            frac=train_frac,
            random_state=random_state,
        )

        results = init_all_models()
        results = fit_and_score_model(
            results, "Baseline", X_fit, X_eval, y_fit, y_eval
        )

        X_fit_std, X_eval_std, scaler = standardize_data(X_fit, X_eval)
        results = fit_and_score_model(
            results, "Scaling", X_fit_std, X_eval_std, y_fit, y_eval
        )

        # Ascending sort, so the last row carries the highest Score.
        top_model = results.sort_values("Score").iloc[-1]["Model"]

        per_cluster[cluster_id] = dict(
            all_models=results,
            best_model=top_model,
            scaler=scaler,
            train_split=(X_fit, X_eval, y_fit, y_eval),
        )

    return per_cluster


cluster_clf_results = train_cluster_classification_models(
    df,
    FEATURE_COLS,
    TARGET_CLF,
    cluster_col=CLUSTER_COL,
)


In [14]:
def train_global_regression_models(
    X, y,
    test_size=0.2,
    random_state=0,
    train_frac=GLOBAL_REG_TRAIN_FRAC,
    lasso_frac=LASSO_TRAIN_FRAC,
):
    """Fit LinearRegression and LassoCV on a subsampled training set.

    The data is split randomly into train and test. The training part is
    subsampled to `train_frac` for the linear model; LassoCV uses a further
    subsample sized so that it amounts to `lasso_frac` of the original
    training split (when `lasso_frac < train_frac`), on standardized features.
    Both models are evaluated on the full test split.

    Returns:
        dict with
        - "linear": {"model", "r2", "mse"},
        - "lasso": {"model", "scaler", "r2", "mse"},
        - "train_split": (X_train_subsampled, X_test, y_train_subsampled,
          y_test) -- the linear model's training data, not the smaller Lasso
          subsample.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )
    X_fit, y_fit = random_subsample_xy_frac(
        X_fit, y_fit,
        frac=train_frac,
        random_state=random_state,
    )

    def _test_metrics(predicted):
        # Both metrics are computed against the untouched test targets.
        return {
            "r2": r2_score(y_eval, predicted),
            "mse": mean_squared_error(y_eval, predicted),
        }

    # Ordinary least squares on raw features.
    linear = LinearRegression()
    linear.fit(X_fit, y_fit)
    linear_report = {"model": linear, **_test_metrics(linear.predict(X_eval))}

    # The Lasso subsample is drawn from the already-subsampled data, so the
    # requested fraction is rescaled relative to train_frac.
    if lasso_frac < train_frac:
        relative_frac = lasso_frac / max(train_frac, 1e-9)
    else:
        relative_frac = 1.0
    X_lasso, y_lasso = random_subsample_xy_frac(
        X_fit, y_fit,
        frac=relative_frac,
        random_state=random_state,
    )

    X_lasso_std, X_eval_std, scaler = standardize_data(X_lasso, X_eval)
    lasso = LassoCV(
        cv=3,               # lighter than 5-fold
        random_state=random_state,
        n_jobs=-1,
    )
    lasso.fit(X_lasso_std, y_lasso)
    lasso_report = {
        "model": lasso,
        "scaler": scaler,
        **_test_metrics(lasso.predict(X_eval_std)),
    }

    return {
        "linear": linear_report,
        "lasso": lasso_report,
        "train_split": (X_fit, X_eval, y_fit, y_eval),
    }


global_reg_results = train_global_regression_models(X_reg, y_reg)
global_reg_results


{'linear': {'model': LinearRegression(),
  'r2': 0.013018494131079361,
  'mse': 3005.6401227349497},
 'lasso': {'model': LassoCV(cv=3, n_jobs=-1, random_state=0),
  'scaler': StandardScaler(),
  'r2': 0.013005703102057287,
  'mse': 3005.6790750656787},
 'train_split': (         MONTH  HOUR  origin_flights_day  airline_bucket  origin_bucket  \
  656565       2    16                 115               1              1   
  5538059      9    13                  82               1              1   
  5469672      9     6                 391               0              3   
  3576367      6    20                 494               1              3   
  95935        1    10                 234               0              2   
  ...        ...   ...                 ...             ...            ...   
  3111319      6    14                 193               1              2   
  4536074      8    12                  24               1              1   
  7307663     12    19                 

In [15]:
def train_cluster_regression_models(
    df,
    feature_cols,
    target_col,
    cluster_col="cluster",
    test_size=0.2,
    random_state=0,
    min_rows=40,
    train_frac=CLUSTER_REG_TRAIN_FRAC,
    lasso_frac=LASSO_TRAIN_FRAC,
):
    """Fit LinearRegression and LassoCV separately inside each cluster.

    For every group of `df` sharing a value of `cluster_col` (groups with
    fewer than `min_rows` rows are skipped), the rows are split randomly, the
    training part is subsampled to `train_frac` for the linear model, and
    LassoCV is fitted on standardized features from a further subsample that
    amounts to `lasso_frac` of the cluster's training split (when
    `lasso_frac < train_frac`).

    Returns:
        dict: cluster id -> dict with "linear" ({"model", "r2", "mse"}),
        "lasso" ({"model", "scaler", "r2", "mse"}) and "train_split"
        (X_train_subsampled, X_test, y_train_subsampled, y_test).
    """
    # The Lasso subsample is drawn from already-subsampled data, so the
    # requested fraction is rescaled relative to train_frac (loop-invariant).
    if lasso_frac < train_frac:
        relative_frac = lasso_frac / max(train_frac, 1e-9)
    else:
        relative_frac = 1.0

    per_cluster = {}

    for cluster_id, members in df.groupby(cluster_col):
        if len(members) < min_rows:
            continue

        X_fit, X_eval, y_fit, y_eval = train_test_split(
            members[feature_cols], members[target_col].astype(float),
            test_size=test_size,
            random_state=random_state,
        )
        X_fit, y_fit = random_subsample_xy_frac(
            X_fit, y_fit,
            frac=train_frac,
            random_state=random_state,
        )

        # Ordinary least squares on raw features.
        linear = LinearRegression()
        linear.fit(X_fit, y_fit)
        linear_pred = linear.predict(X_eval)
        linear_report = {
            "model": linear,
            "r2": r2_score(y_eval, linear_pred),
            "mse": mean_squared_error(y_eval, linear_pred),
        }

        # Cross-validated Lasso on standardized features.
        X_lasso, y_lasso = random_subsample_xy_frac(
            X_fit, y_fit,
            frac=relative_frac,
            random_state=random_state,
        )
        X_lasso_std, X_eval_std, scaler = standardize_data(X_lasso, X_eval)
        lasso = LassoCV(
            cv=3,
            random_state=random_state,
            n_jobs=-1,
        )
        lasso.fit(X_lasso_std, y_lasso)
        lasso_pred = lasso.predict(X_eval_std)
        lasso_report = {
            "model": lasso,
            "scaler": scaler,
            "r2": r2_score(y_eval, lasso_pred),
            "mse": mean_squared_error(y_eval, lasso_pred),
        }

        per_cluster[cluster_id] = {
            "linear": linear_report,
            "lasso": lasso_report,
            "train_split": (X_fit, X_eval, y_fit, y_eval),
        }

    return per_cluster


cluster_reg_results = train_cluster_regression_models(
    df,
    FEATURE_COLS,
    TARGET_REG,
    cluster_col=CLUSTER_COL,
)

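## Training on the full dataset (no subsampling)

The cells below redefine the four training functions above without the subsampling step; whichever definition ran last is the one used by later cells.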

In [None]:
def train_global_classification_models(X, y, test_size=0.2, random_state=0):
    """Train and score the baseline classifiers on the full training split.

    Full-data variant: it replaces the earlier definition of the same name and
    performs no subsampling. The data is split (stratified on y), and every
    model from make_models() is fitted on raw ("Baseline") and standardized
    ("Scaling") features. A Baseline-vs-Scaling comparison is printed.

    Returns:
        dict with "all_models" (results table), "best_model" (estimator of the
        highest-Score row), "scaler" (fitted StandardScaler, returned even when
        the best model was trained on raw features) and "train_split"
        (X_train, X_test, y_train, y_test).
    """
    split = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    X_fit, X_eval, y_fit, y_eval = split

    results = init_all_models()

    # Raw features.
    results = fit_and_score_model(
        results, "Baseline", X_fit, X_eval, y_fit, y_eval
    )

    # Standardized features; scaler statistics come from the training split.
    X_fit_std, X_eval_std, scaler = standardize_data(X_fit, X_eval)
    results = fit_and_score_model(
        results, "Scaling", X_fit_std, X_eval_std, y_fit, y_eval
    )

    compare_models(results, "Scaling")

    # Ascending sort, so the last row carries the highest Score.
    top_model = results.sort_values("Score").iloc[-1]["Model"]

    return dict(
        all_models=results,
        best_model=top_model,
        scaler=scaler,
        train_split=(X_fit, X_eval, y_fit, y_eval),
    )


global_clf_results = train_global_classification_models(X_clf, y_clf)
global_clf_results["all_models"]


Scaling: mean ΔScore=-0.000, max ΔScore=0.000


Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,Score,Model
model,technique,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LR_L2,Baseline,0.383242,0.003475,0.193359,"LogisticRegression(max_iter=200, random_state=..."
LR_L2,Scaling,0.382331,0.003445,0.192888,"LogisticRegression(max_iter=200, random_state=..."
LR_L1,Baseline,0.281975,0.608097,0.445036,"LogisticRegression(class_weight='balanced', ma..."
LR_L1,Scaling,0.281999,0.608103,0.445051,"LogisticRegression(class_weight='balanced', ma..."
CART,Baseline,0.284408,0.453031,0.36872,DecisionTreeClassifier(class_weight='balanced'...
CART,Scaling,0.284351,0.453001,0.368676,DecisionTreeClassifier(class_weight='balanced'...
RF,Baseline,0.299683,0.378473,0.339078,"(DecisionTreeClassifier(max_features='sqrt', r..."
RF,Scaling,0.299631,0.37839,0.339011,"(DecisionTreeClassifier(max_features='sqrt', r..."


: 

In [None]:
def train_cluster_classification_models(df,
                                        feature_cols,
                                        target_col,
                                        cluster_col="cluster",
                                        test_size=0.2,
                                        random_state=0):
    """Train and score the baseline classifiers inside each cluster (full data).

    Full-data variant: it replaces the earlier definition of the same name and
    performs no subsampling. For every group of `df` sharing a value of
    `cluster_col`, the rows are split (stratified on the target) and all
    models from make_models() are fitted on raw ("Baseline") and standardized
    ("Scaling") features. Clusters with fewer than 40 rows or a single target
    class are skipped.

    Args:
        df: DataFrame containing the feature, target and cluster columns.
        feature_cols: list of feature column names.
        target_col: name of the binary target column (cast to int).
        cluster_col: name of the column defining the groups.
        test_size: fraction of each cluster held out for evaluation.
        random_state: seed for the train/test split.

    Returns:
        dict: cluster id -> dict with
        - "all_models": results table for both techniques,
        - "best_model": fitted estimator from the highest-Score row,
        - "scaler": fitted StandardScaler (returned regardless of which
          technique the best model used),
        - "train_split": (X_train, X_test, y_train, y_test). This key is
          required by the per-cluster HGB / stacking trainers, which skip any
          cluster entry that lacks it.
    """
    cluster_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        y_c = df_c[target_col].astype(int)
        # Stratified splitting and precision/recall need both classes present.
        if y_c.nunique() < 2 or len(df_c) < 40:
            continue

        X_c = df_c[feature_cols]

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
            stratify=y_c,
        )

        all_models = init_all_models()

        # Raw features.
        all_models = fit_and_score_model(
            all_models, "Baseline", Xtr, Xte, ytr, yte
        )

        # Standardized features; scaler statistics come from the training split.
        Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
        all_models = fit_and_score_model(
            all_models, "Scaling", Xtr_s, Xte_s, ytr, yte
        )

        # Ascending sort, so the last row carries the highest Score.
        best_row = all_models.sort_values("Score").iloc[-1]
        best_model = best_row["Model"]

        cluster_results[clust_id] = {
            "all_models": all_models,
            "best_model": best_model,
            "scaler": scaler,
            # Keep the split so downstream per-cluster models can reuse it.
            "train_split": (Xtr, Xte, ytr, yte),
        }

    return cluster_results


cluster_clf_results = train_cluster_classification_models(
    df,
    FEATURE_COLS,
    TARGET_CLF,
    cluster_col=CLUSTER_COL,
)


In [None]:
def train_global_regression_models(X, y, test_size=0.2, random_state=0):
    """Fit LinearRegression and LassoCV on the full training split.

    Full-data variant: it replaces the earlier definition of the same name and
    performs no subsampling. The linear model uses raw features; LassoCV
    (5-fold) uses features standardized with statistics from the training
    split. Both are evaluated on the held-out test split.

    Returns:
        dict with "linear" ({"model", "r2", "mse"}), "lasso"
        ({"model", "scaler", "r2", "mse"}) and "train_split"
        (X_train, X_test, y_train, y_test).
    """
    split = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_fit, X_eval, y_fit, y_eval = split

    def _test_metrics(predicted):
        # Both metrics are computed against the held-out targets.
        return {
            "r2": r2_score(y_eval, predicted),
            "mse": mean_squared_error(y_eval, predicted),
        }

    # Ordinary least squares on raw features.
    linear = LinearRegression()
    linear.fit(X_fit, y_fit)
    linear_report = {"model": linear, **_test_metrics(linear.predict(X_eval))}

    # Cross-validated Lasso on standardized features.
    X_fit_std, X_eval_std, scaler = standardize_data(X_fit, X_eval)
    lasso = LassoCV(cv=5, random_state=random_state)
    lasso.fit(X_fit_std, y_fit)
    lasso_report = {
        "model": lasso,
        "scaler": scaler,
        **_test_metrics(lasso.predict(X_eval_std)),
    }

    return {
        "linear": linear_report,
        "lasso": lasso_report,
        "train_split": (X_fit, X_eval, y_fit, y_eval),
    }


global_reg_results = train_global_regression_models(X_reg, y_reg)
global_reg_results


In [None]:
def train_cluster_regression_models(df,
                                    feature_cols,
                                    target_col,
                                    cluster_col="cluster",
                                    test_size=0.2,
                                    random_state=0,
                                    min_rows=40):
    """Fit LinearRegression and LassoCV inside each cluster (full data).

    Full-data variant: it replaces the earlier definition of the same name and
    performs no subsampling. For every group of `df` sharing a value of
    `cluster_col` (groups with fewer than `min_rows` rows are skipped), the
    rows are split randomly; the linear model uses raw features and LassoCV
    (5-fold) uses standardized features.

    Args:
        df: DataFrame containing the feature, target and cluster columns.
        feature_cols: list of feature column names.
        target_col: name of the continuous target column (cast to float).
        cluster_col: name of the column defining the groups.
        test_size: fraction of each cluster held out for evaluation.
        random_state: seed for the split and for LassoCV.
        min_rows: minimum cluster size required to train.

    Returns:
        dict: cluster id -> dict with
        - "linear": {"model", "r2", "mse"},
        - "lasso": {"model", "scaler", "r2", "mse"},
        - "train_split": (X_train, X_test, y_train, y_test), matching the
          structure returned by the other training functions in this notebook.
    """
    cluster_reg_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        if len(df_c) < min_rows:
            continue

        X_c = df_c[feature_cols]
        y_c = df_c[target_col].astype(float)

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
        )

        # Ordinary least squares on raw features.
        lin = LinearRegression()
        lin.fit(Xtr, ytr)
        yhat_lin = lin.predict(Xte)
        lin_r2 = r2_score(yte, yhat_lin)
        lin_mse = mean_squared_error(yte, yhat_lin)

        # Cross-validated Lasso on standardized features.
        Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
        lasso = LassoCV(cv=5, random_state=random_state)
        lasso.fit(Xtr_s, ytr)
        yhat_lasso = lasso.predict(Xte_s)
        lasso_r2 = r2_score(yte, yhat_lasso)
        lasso_mse = mean_squared_error(yte, yhat_lasso)

        cluster_reg_results[clust_id] = {
            "linear": {
                "model": lin,
                "r2": lin_r2,
                "mse": lin_mse,
            },
            "lasso": {
                "model": lasso,
                "scaler": scaler,
                "r2": lasso_r2,
                "mse": lasso_mse,
            },
            # Keep the split so downstream per-cluster models can reuse it.
            "train_split": (Xtr, Xte, ytr, yte),
        }

    return cluster_reg_results


cluster_reg_results = train_cluster_regression_models(
    df,
    FEATURE_COLS,
    TARGET_REG,
    cluster_col=CLUSTER_COL,
)


# Cluster Experiments

In [None]:
def assign_clusters(df, k, cluster_features, cluster_col_name):
    """Label every row of `df` with a K-means cluster id.

    KMeans (random_state=0) is fitted on all rows of `df[cluster_features]`
    as given -- no scaling and no train/test separation happen here.

    Args:
        df: input DataFrame; it is not modified.
        k: number of clusters.
        cluster_features: columns used for clustering.
        cluster_col_name: name of the label column to add.

    Returns:
        A copy of `df` with the cluster labels stored in `cluster_col_name`.
    """
    clusterer = KMeans(n_clusters=k, random_state=0)
    cluster_ids = clusterer.fit_predict(df[cluster_features].copy())

    labelled = df.copy()
    labelled[cluster_col_name] = cluster_ids
    return labelled

In [None]:
# One K-means experiment per (name, k) pair; each clusters on all current
# features. To add an experiment with a different feature set, append another
# dict with the same three keys to the resulting list.
CLUSTER_EXPERIMENTS = [
    {"name": exp_label, "k": n_clusters, "cluster_features": FEATURE_COLS}
    for exp_label, n_clusters in (
        ("k3_all_feats", 3),
        ("k4_all_feats", 4),
        ("k5_all_feats", 5),
    )
]

In [None]:
# Per-experiment results: experiment name -> {cluster id -> result dict}.
all_cluster_clf_results = {}
all_cluster_reg_results = {}

# NOTE: this loop rebinds the notebook-level names `cluster_clf_results`,
# `cluster_reg_results` and `exp_name` on every iteration, so after the loop
# they refer to the LAST experiment only. Use the all_cluster_* dicts to
# access every experiment.
for cfg in CLUSTER_EXPERIMENTS:
    exp_name = cfg["name"]
    k = cfg["k"]
    cluster_features = cfg["cluster_features"]

    # 1) assign cluster labels for this experiment
    #    (assign_clusters returns a labelled copy; `df` itself is unchanged)
    cluster_col = f"cluster_{exp_name}"
    df_with_clusters = assign_clusters(df, k, cluster_features, cluster_col)

    # 2) train cluster-specific classification models
    cluster_clf_results = train_cluster_classification_models(
        df_with_clusters,
        FEATURE_COLS,       # you can choose to change this too, if needed
        TARGET_CLF,
        cluster_col=cluster_col,
    )

    # 3) train cluster-specific regression models
    cluster_reg_results = train_cluster_regression_models(
        df_with_clusters,
        FEATURE_COLS,
        TARGET_REG,
        cluster_col=cluster_col,
    )

    # 4) store results keyed by experiment name
    all_cluster_clf_results[exp_name] = cluster_clf_results
    all_cluster_reg_results[exp_name] = cluster_reg_results


# Gathering Summary Results

In [None]:
def _summary_row(level, cluster, cluster_exp, task, model, technique,
                 precision=None, recall=None, score=None, r2=None, mse=None):
    """Build one summary record; metrics that do not apply to the task stay None."""
    return {
        "Level": level,
        "Cluster": cluster,
        "ClusterExp": cluster_exp,
        "Task": task,
        "Model": model,
        "Technique": technique,
        "Precision": precision,
        "Recall": recall,
        "Score": score,
        "R2": r2,
        "MSE": mse,
    }


def _classification_rows(all_models, level, cluster, cluster_exp):
    """Turn one (model, technique)-indexed results table into summary records."""
    # reset_index() exposes the "model" / "technique" index levels as columns.
    return [
        _summary_row(
            level, cluster, cluster_exp, "Classification",
            row["model"], row["technique"],
            precision=row["Precision"], recall=row["Recall"], score=row["Score"],
        )
        for _, row in all_models.reset_index().iterrows()
    ]


# (result key, reported model name, technique label) for the two regressors.
_REGRESSORS = (
    ("linear", "LinearRegression", "Baseline"),
    ("lasso", "LassoCV", "Scaled"),
)

summary_rows = []

# --------------------------
# GLOBAL CLASSIFICATION
# --------------------------
global_df = global_clf_results["all_models"].copy()
summary_rows.extend(_classification_rows(global_df, "Global", "-", "-"))

# ==========================================================
# CLUSTER CLASSIFICATION (MULTIPLE EXPERIMENTS)
# ==========================================================
# Iterate every stored experiment. Previously only `cluster_clf_results`
# (the last experiment left over from the training loop) was summarised,
# labelled with the leaked loop variable `exp_name`.
for exp_name, exp_clf_results in all_cluster_clf_results.items():
    for clust_id, res in exp_clf_results.items():
        summary_rows.extend(
            _classification_rows(res["all_models"], "Cluster", clust_id, exp_name)
        )

# --------------------------
# GLOBAL REGRESSION
# --------------------------
for result_key, model_label, technique in _REGRESSORS:
    summary_rows.append(_summary_row(
        "Global", "-", "-", "Regression", model_label, technique,
        r2=global_reg_results[result_key]["r2"],
        mse=global_reg_results[result_key]["mse"],
    ))

# ==========================================================
# CLUSTER REGRESSION (MULTIPLE EXPERIMENTS)
# ==========================================================
for exp_name, exp_reg_results in all_cluster_reg_results.items():
    for clust_id, res in exp_reg_results.items():
        for result_key, model_label, technique in _REGRESSORS:
            summary_rows.append(_summary_row(
                "Cluster", clust_id, exp_name, "Regression",
                model_label, technique,
                r2=res[result_key]["r2"],
                mse=res[result_key]["mse"],
            ))

# --------------------------
# BUILD FINAL SUMMARY TABLE
# --------------------------
summary_table = pd.DataFrame(summary_rows)

# sorting for readability
summary_table = summary_table.sort_values(
    by=["Task", "ClusterExp", "Level", "Cluster", "Model"]
).reset_index(drop=True)
summary_table

# More complex approaches

## Helper functions that draw a subset of the total dataset 

## Classification

### HistGradientBoostingClassifier on Global

In [None]:
# Global classification split, reused by the more complex models below.
# NOTE: the training part is subsampled only if `global_clf_results` was
# produced by the subsampling version of train_global_classification_models;
# if the full-data definition ran last, this is the full training split.
Xtr_clf, Xte_clf, ytr_clf, yte_clf = global_clf_results["train_split"]

# Global regression split; the same caveat about which definition produced
# `global_reg_results` applies.
Xtr_reg, Xte_reg, ytr_reg, yte_reg = global_reg_results["train_split"]

In [None]:
def train_hgb_classifier_on_existing_split(
    Xtr, Xte, ytr, yte,
    random_state=0,
):
    """Fit a HistGradientBoostingClassifier on a ready-made train/test split.

    Args:
        Xtr, ytr: training features and labels (used exactly as passed in).
        Xte, yte: evaluation features and labels.
        random_state: seed for the classifier.

    Returns:
        dict with "model" (fitted classifier), "precision", "recall", "score"
        (mean of precision and recall) and "n_train_used" (len(Xtr)).
    """
    hyperparams = dict(
        learning_rate=0.05,
        max_depth=6,
        max_iter=200,
        max_leaf_nodes=31,
        l2_regularization=1.0,
        random_state=random_state,
    )
    booster = HistGradientBoostingClassifier(**hyperparams)

    predictions = booster.fit(Xtr, ytr).predict(Xte)
    precision = precision_score(yte, predictions)
    recall = recall_score(yte, predictions)

    return {
        "model": booster,
        "precision": precision,
        "recall": recall,
        "score": 0.5 * (precision + recall),
        "n_train_used": len(Xtr),
    }


hgb_clf_results = train_hgb_classifier_on_existing_split(
    Xtr_clf, Xte_clf, ytr_clf, yte_clf,
    random_state=0,
)

print("HGB classifier – Precision: {:.3f}, Recall: {:.3f}, Score: {:.3f}".format(
    hgb_clf_results["precision"],
    hgb_clf_results["recall"],
    hgb_clf_results["score"],
))


### Stacking ensemble (LogReg + RF, global)

In [None]:
def train_stacking_classifier_on_existing_split(
    Xtr, Xte, ytr, yte,
    random_state=0,
):
    """Fit a LogisticRegression + RandomForest stacking classifier on a split.

    The base learners are a StandardScaler -> LogisticRegression pipeline and
    a class-balanced RandomForestClassifier. A LogisticRegression final
    estimator combines them; `passthrough=True` also feeds it the original
    features.

    Args:
        Xtr, ytr: training features and labels (used exactly as passed in).
        Xte, yte: evaluation features and labels.
        random_state: seed shared by the base and final estimators.

    Returns:
        dict with "model" (fitted StackingClassifier), "precision", "recall",
        "score" (mean of precision and recall) and "n_train_used" (len(Xtr)).
    """
    scaled_logreg = make_pipeline(
        StandardScaler(),
        LogisticRegression(
            random_state=random_state,
            solver="liblinear",
            max_iter=300,
        ),
    )
    forest = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_leaf=20,
        max_features="sqrt",
        class_weight="balanced",
        n_jobs=-1,
        random_state=random_state,
    )
    meta_learner = LogisticRegression(
        random_state=random_state,
        solver="lbfgs",
        max_iter=300,
    )

    stack_clf = StackingClassifier(
        estimators=[("lr", scaled_logreg), ("rf", forest)],
        final_estimator=meta_learner,
        passthrough=True,
        n_jobs=-1,
    )

    predictions = stack_clf.fit(Xtr, ytr).predict(Xte)
    precision = precision_score(yte, predictions)
    recall = recall_score(yte, predictions)

    return {
        "model": stack_clf,
        "precision": precision,
        "recall": recall,
        "score": 0.5 * (precision + recall),
        "n_train_used": len(Xtr),
    }


stack_clf_results = train_stacking_classifier_on_existing_split(
    Xtr_clf, Xte_clf, ytr_clf, yte_clf,
    random_state=0,
)

print("Stacking classifier – Precision: {:.3f}, Recall: {:.3f}, Score: {:.3f}".format(
    stack_clf_results["precision"],
    stack_clf_results["recall"],
    stack_clf_results["score"],
))


### HistGradientBoostingClassifier on Clusters

In [None]:
def train_cluster_hgb_classifiers(
    cluster_clf_results,
    random_state=0,
):
    """Fit one HistGradientBoostingClassifier per cluster on its stored split.

    Each cluster's model is trained and scored by
    train_hgb_classifier_on_existing_split (defined earlier in this notebook),
    so the hyperparameters and the returned fields are the same as for the
    global model.

    Args:
        cluster_clf_results: dict of cluster id -> result dict. Entries are
            expected to carry
            res["train_split"] = (X_train, X_test, y_train, y_test);
            entries without that key are skipped silently.
        random_state: seed for every classifier.

    Returns:
        dict: cluster id -> {"model", "precision", "recall", "score",
        "n_train_used"} for each cluster that had a stored split.
    """
    per_cluster = {}

    for cluster_id, stored in cluster_clf_results.items():
        if "train_split" in stored:
            X_fit, X_eval, y_fit, y_eval = stored["train_split"]
            per_cluster[cluster_id] = train_hgb_classifier_on_existing_split(
                X_fit, X_eval, y_fit, y_eval,
                random_state=random_state,
            )

    return per_cluster


cluster_hgb_clf_results = train_cluster_hgb_classifiers(cluster_clf_results)


In [None]:
cluster_hgb_clf_results

### Clustered Stacking (LogReg + RF)

In [None]:
def train_cluster_stacking_classifiers(
    cluster_clf_results,
    random_state=0,
):
    """
    Train and evaluate one stacking classifier (LogReg + RF) per cluster.

    Parameters
    ----------
    cluster_clf_results : dict
        Maps cluster id -> result dict. An entry is used only if it has
            res["train_split"] = (Xtr, Xte, ytr, yte)
    random_state : int, default 0
        Seed passed to both LogisticRegression models and the random forest.

    Returns
    -------
    dict
        Maps cluster id -> {"model", "precision", "recall", "score",
        "n_train_used"}, where "score" is the mean of precision and recall
        and "n_train_used" is the number of training rows.

    Warns
    -----
    RuntimeWarning
        For every cluster without a "train_split" entry. Such clusters are
        skipped and therefore absent from the returned dict.
    """
    # Local import keeps this cell self-contained.
    import warnings

    cluster_stack_clf_results = {}

    for clust_id, res in cluster_clf_results.items():
        if "train_split" not in res:
            # Skip rather than fail, but make the omission visible.
            warnings.warn(
                "Cluster {!r} has no 'train_split' entry; skipping its "
                "stacking classifier.".format(clust_id),
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        Xtr_c, Xte_c, ytr_c, yte_c = res["train_split"]

        # Base estimators are rebuilt for every cluster so no fitted state
        # is shared between clusters.
        base_lr = make_pipeline(
            StandardScaler(),
            LogisticRegression(
                random_state=random_state,
                solver="liblinear",
                max_iter=300,
            ),
        )

        base_rf = RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_leaf=20,
            max_features="sqrt",
            class_weight="balanced",
            n_jobs=-1,
            random_state=random_state,
        )

        stack_clf = StackingClassifier(
            estimators=[
                ("lr", base_lr),
                ("rf", base_rf),
            ],
            final_estimator=LogisticRegression(
                random_state=random_state,
                solver="lbfgs",
                max_iter=300,
            ),
            # passthrough=True: the final estimator is trained on the base
            # predictions together with the original features.
            passthrough=True,
            n_jobs=-1,
        )

        stack_clf.fit(Xtr_c, ytr_c)
        y_pred_c = stack_clf.predict(Xte_c)

        p = precision_score(yte_c, y_pred_c)
        r = recall_score(yte_c, y_pred_c)
        # Combined score: mean of precision and recall.
        s = 0.5 * (p + r)

        cluster_stack_clf_results[clust_id] = {
            "model": stack_clf,
            "precision": p,
            "recall": r,
            "score": s,
            "n_train_used": len(Xtr_c),
        }

    return cluster_stack_clf_results


# Fit one stacking classifier per cluster (random_state left at its default of 0).
cluster_stack_clf_results = train_cluster_stacking_classifiers(cluster_clf_results)


In [None]:
# Show the per-cluster metrics; the dict repr also includes each fitted model.
cluster_stack_clf_results

## Regression

### RandomForestRegressor on Global

In [None]:
def train_rf_regressor_on_existing_split(
    Xtr, Xte, ytr, yte,
    random_state=0,
):
    """
    Fit a RandomForestRegressor on (Xtr, ytr) and score it on (Xte, yte).

    The train split is expected to be the subsampled global one.

    Returns a dict with the fitted "model", the test-set "r2" and "mse",
    and "n_train_used" (number of training rows).
    """
    forest_params = dict(
        n_estimators=200,
        max_depth=15,
        min_samples_leaf=20,
        max_features="sqrt",
        n_jobs=-1,
        random_state=random_state,
    )
    forest = RandomForestRegressor(**forest_params)
    forest.fit(Xtr, ytr)

    test_pred = forest.predict(Xte)

    return {
        "model": forest,
        "r2": r2_score(yte, test_pred),
        "mse": mean_squared_error(yte, test_pred),
        "n_train_used": len(Xtr),
    }


# Fit and evaluate the random-forest regressor on the regression split.
rf_reg_results = train_rf_regressor_on_existing_split(
    Xtr_reg,
    Xte_reg,
    ytr_reg,
    yte_reg,
    random_state=0,
)

# Report test-set R² (three decimals) and MSE (one decimal).
print(
    "RF regressor – R²: {:.3f}, MSE: {:.1f}".format(
        *(rf_reg_results[metric] for metric in ("r2", "mse"))
    )
)


### HistGradientBoostingRegressor on Global

In [None]:
def train_hgb_regressor_on_existing_split(
    Xtr, Xte, ytr, yte,
    random_state=0,
):
    """
    Fit a HistGradientBoostingRegressor on (Xtr, ytr) and score it on (Xte, yte).

    The train split is expected to be the subsampled global one.

    Returns a dict with the fitted "model", the test-set "r2" and "mse",
    and "n_train_used" (number of training rows).
    """
    booster_params = dict(
        learning_rate=0.05,
        max_depth=6,
        max_iter=200,
        max_leaf_nodes=31,
        l2_regularization=1.0,
        random_state=random_state,
    )
    booster = HistGradientBoostingRegressor(**booster_params)
    booster.fit(Xtr, ytr)

    test_pred = booster.predict(Xte)

    return {
        "model": booster,
        "r2": r2_score(yte, test_pred),
        "mse": mean_squared_error(yte, test_pred),
        "n_train_used": len(Xtr),
    }


# Fit and evaluate the HGB regressor on the regression split.
hgb_reg_results = train_hgb_regressor_on_existing_split(
    Xtr_reg,
    Xte_reg,
    ytr_reg,
    yte_reg,
    random_state=0,
)

# Report test-set R² (three decimals) and MSE (one decimal).
print(
    "HGB regressor – R²: {:.3f}, MSE: {:.1f}".format(
        *(hgb_reg_results[metric] for metric in ("r2", "mse"))
    )
)


### Clustered RandomForestRegressor

In [None]:
def train_cluster_rf_regressors(
    cluster_reg_results,
    random_state=0,
):
    """
    Train and evaluate one RandomForestRegressor per cluster.

    Parameters
    ----------
    cluster_reg_results : dict
        Maps cluster id -> result dict. An entry is used only if it has
            res["train_split"] = (Xtr, Xte, ytr, yte)
        (the train part is expected to be already subsampled upstream).
    random_state : int, default 0
        Seed passed to every RandomForestRegressor.

    Returns
    -------
    dict
        Maps cluster id -> {"model", "r2", "mse", "n_train_used"}, with
        "r2" and "mse" computed on the cluster's test split and
        "n_train_used" the number of training rows.

    Warns
    -----
    RuntimeWarning
        For every cluster without a "train_split" entry. Such clusters are
        skipped and therefore absent from the returned dict.
    """
    # Local import keeps this cell self-contained.
    import warnings

    cluster_rf_reg_results = {}

    for clust_id, res in cluster_reg_results.items():
        if "train_split" not in res:
            # Skip rather than fail, but make the omission visible.
            warnings.warn(
                "Cluster {!r} has no 'train_split' entry; skipping its "
                "RandomForestRegressor.".format(clust_id),
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        Xtr_c, Xte_c, ytr_c, yte_c = res["train_split"]

        rf_reg = RandomForestRegressor(
            n_estimators=200,
            max_depth=15,
            min_samples_leaf=20,
            max_features="sqrt",
            n_jobs=-1,
            random_state=random_state,
        )

        rf_reg.fit(Xtr_c, ytr_c)
        y_pred_c = rf_reg.predict(Xte_c)

        r2 = r2_score(yte_c, y_pred_c)
        mse = mean_squared_error(yte_c, y_pred_c)

        cluster_rf_reg_results[clust_id] = {
            "model": rf_reg,
            "r2": r2,
            "mse": mse,
            "n_train_used": len(Xtr_c),
        }

    return cluster_rf_reg_results


# Fit one random-forest regressor per cluster (random_state left at its default of 0).
cluster_rf_reg_results = train_cluster_rf_regressors(cluster_reg_results)


### Clustered HistGradientBoostingRegressor

In [None]:
def train_cluster_hgb_regressors(
    cluster_reg_results,
    random_state=0,
):
    """
    Train and evaluate one HistGradientBoostingRegressor per cluster.

    Parameters
    ----------
    cluster_reg_results : dict
        Maps cluster id -> result dict. An entry is used only if it has
            res["train_split"] = (Xtr, Xte, ytr, yte)
    random_state : int, default 0
        Seed passed to every HistGradientBoostingRegressor.

    Returns
    -------
    dict
        Maps cluster id -> {"model", "r2", "mse", "n_train_used"}, with
        "r2" and "mse" computed on the cluster's test split and
        "n_train_used" the number of training rows.

    Warns
    -----
    RuntimeWarning
        For every cluster without a "train_split" entry. Such clusters are
        skipped and therefore absent from the returned dict.
    """
    # Local import keeps this cell self-contained.
    import warnings

    cluster_hgb_reg_results = {}

    for clust_id, res in cluster_reg_results.items():
        if "train_split" not in res:
            # Skip rather than fail, but make the omission visible.
            warnings.warn(
                "Cluster {!r} has no 'train_split' entry; skipping its "
                "HistGradientBoostingRegressor.".format(clust_id),
                RuntimeWarning,
                stacklevel=2,
            )
            continue

        Xtr_c, Xte_c, ytr_c, yte_c = res["train_split"]

        hgb_reg = HistGradientBoostingRegressor(
            learning_rate=0.05,
            max_depth=6,
            max_iter=200,
            max_leaf_nodes=31,
            l2_regularization=1.0,
            random_state=random_state,
        )

        hgb_reg.fit(Xtr_c, ytr_c)
        y_pred_c = hgb_reg.predict(Xte_c)

        r2 = r2_score(yte_c, y_pred_c)
        mse = mean_squared_error(yte_c, y_pred_c)

        cluster_hgb_reg_results[clust_id] = {
            "model": hgb_reg,
            "r2": r2,
            "mse": mse,
            "n_train_used": len(Xtr_c),
        }

    return cluster_hgb_reg_results


# Fit one HGB regressor per cluster (random_state left at its default of 0).
cluster_hgb_reg_results = train_cluster_hgb_regressors(cluster_reg_results)


# Add to summary results

In [None]:
def _summary_row(level, cluster, cluster_exp, task, model_label, technique, res):
    """
    Build one summary record with the fixed column layout.

    Classification rows take "precision"/"recall"/"score" from `res` and
    leave R2/MSE as None; regression rows take "r2"/"mse" from `res` and
    leave the classification metrics as None.
    """
    is_clf = task == "Classification"
    return {
        "Level": level,
        "Cluster": cluster,
        "ClusterExp": cluster_exp,
        "Task": task,
        "Model": model_label,
        "Technique": technique,
        "Precision": res["precision"] if is_clf else None,
        "Recall": res["recall"] if is_clf else None,
        "Score": res["score"] if is_clf else None,
        "R2": None if is_clf else res["r2"],
        "MSE": None if is_clf else res["mse"],
    }


# (task, model label, result dict) for the global models, in output order.
global_summary_sources = [
    ("Classification", "HistGradientBoostingClassifier", hgb_clf_results),
    ("Classification", "Stacking(LR+RF)", stack_clf_results),
    ("Regression", "RandomForestRegressor", rf_reg_results),
    ("Regression", "HistGradientBoostingRegressor", hgb_reg_results),
]

# (task, model label, {cluster id -> result dict}) for the per-cluster models.
cluster_summary_sources = [
    ("Classification", "HistGradientBoostingClassifier", cluster_hgb_clf_results),
    ("Classification", "Stacking(LR+RF)", cluster_stack_clf_results),
    ("Regression", "RandomForestRegressor", cluster_rf_reg_results),
    ("Regression", "HistGradientBoostingRegressor", cluster_hgb_reg_results),
]

# Clustering-experiment label for the per-cluster rows: the notebook-level
# `exp_name` when one is defined, otherwise "-". Resolved once up front.
cluster_exp_label = globals().get("exp_name", "-")

# Global models first ...
summary_rows = [
    _summary_row("Global", "-", "-", src_task, src_model, "SubsampledGlobal", src_res)
    for src_task, src_model, src_res in global_summary_sources
]

# ... then one row per (model, cluster).
for src_task, src_model, src_results in cluster_summary_sources:
    for clust_id, res in src_results.items():
        summary_rows.append(
            _summary_row(
                "Cluster",
                clust_id,
                cluster_exp_label,
                src_task,
                src_model,
                "SubsampledCluster",
                res,
            )
        )

summary_rows