In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LassoCV, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, r2_score, mean_squared_error
from sklearn.cluster import KMeans

from sklearn.ensemble import (
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [2]:
DATA_PATH = "data_with_clusters.csv"

FEATURE_COLS = [
    "MONTH",
    "HOUR",
    "origin_flights_day",
    "airline_bucket",
    "origin_bucket",
    "destination_bucket",
    "lagged_delay_flag",
    "prev_real_delay",
]

TARGET_CLF = "DEP_DEL15" #binary departure delay indicator
TARGET_REG = "DEP_DELAY_NEW" #continuous departure delay (min)
CLUSTER_COL = "cluster"

df = pd.read_csv(DATA_PATH)

# keep only needed columns, drop rows with missing in features/targets
df = df.dropna(subset=FEATURE_COLS + [TARGET_CLF, TARGET_REG])

X_clf = df[FEATURE_COLS]
y_clf = df[TARGET_CLF].astype(int)

X_reg = df[FEATURE_COLS]
y_reg = df[TARGET_REG].astype(float)

In [3]:
df.head()

Unnamed: 0,YEAR,QUARTER,MONTH,DAY,DAY_OF_WEEK,MKT_CARRIER_AIRLINE_ID,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,prev_real_delay,FL_DATE,origin_flights_day,origin_bucket,dest_flights_day,destination_bucket,distance_bucket,airline_bucket,HOUR,cluster
0,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,2024-01-01,67,1,241,2,2,1,7,0
1,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10423,...,0.0,2024-01-01,67,1,241,2,2,1,18,0
2,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10800,...,0.0,2024-01-01,67,1,90,1,3,1,14,1
3,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",10821,...,0.0,2024-01-01,67,1,265,2,4,1,15,0
4,2024,1,1,1,1,19393,10140,1014005,"Albuquerque, NM",11259,...,0.0,2024-01-01,67,1,214,2,2,1,5,0


In [7]:
def make_models():
    return {
        "LR_L2": LogisticRegression(
            random_state=0, solver="liblinear", max_iter=200
        ),
        "LR_L1": LogisticRegression(
            random_state=0,
            penalty="l1",
            solver="liblinear",
            class_weight="balanced",
            max_iter=500,
        ),
        "CART": DecisionTreeClassifier(
            random_state=0, class_weight="balanced"
        ),
        "RF": RandomForestClassifier(
            random_state=0, class_weight="balanced", n_jobs=-1,
        ),
    }


def init_all_models():
    model_names = ("LR_L2", "LR_L1", "CART", "RF")
    techniques = ("Baseline", "Scaling")  # subset of full Lab 6 list

    idx = pd.MultiIndex.from_product(
        [model_names, techniques],
        names=("model", "technique"),
    )
    all_models = pd.DataFrame(
        index=idx,
        columns=["Precision", "Recall", "Score", "Model"],
    )
    all_models[["Precision", "Recall", "Score"]] = all_models[
        ["Precision", "Recall", "Score"]
    ].astype(float)
    return all_models


def standardize_data(X_train, X_out):
    scaler = StandardScaler()
    scaler.fit(X_train)

    Xtr = pd.DataFrame(
        scaler.transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    Xout = pd.DataFrame(
        scaler.transform(X_out),
        index=X_out.index,
        columns=X_out.columns,
    )
    return Xtr, Xout, scaler


def fit_and_score_model(all_models, stage_name,
                        X_train, X_out, y_train, y_out):
    models_dict = make_models()

    for model_name, model in models_dict.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_out)

        p = precision_score(y_out, y_pred)
        r = recall_score(y_out, y_pred)
        s = 0.5 * (p + r)

        idx = (model_name, stage_name)
        
        all_models.at[idx, "Precision"] = p
        all_models.at[idx, "Recall"] = r
        all_models.at[idx, "Score"] = s
        all_models.at[idx, "Model"] = model

    return all_models


def compare_models(all_models, technique_name="Scaling"):
    diffs = (
        all_models.xs(technique_name, level="technique").Score.values
        - all_models.xs("Baseline", level="technique").Score.values
    )
    print(
        f"{technique_name}: mean ΔScore={diffs.mean():.3f}, "
        f"max ΔScore={diffs.max():.3f}"
    )


In [8]:
GLOBAL_CLF_TRAIN_FRAC = 0.25   # 25% of global clf train
CLUSTER_CLF_TRAIN_FRAC = 0.5    # up to 50% of each cluster's train
GLOBAL_REG_TRAIN_FRAC = 0.25   # 25% of global reg train
CLUSTER_REG_TRAIN_FRAC = 0.5    # 30% of each cluster's reg train
LASSO_TRAIN_FRAC = 0.2  # 20% just for LassoCV (more expensive)

MIN_SUBSAMPLE_SIZE = 500000    # don't bother subsampling below this

def stratified_subsample_xy_frac(X, y, frac=1.0, random_state=0):
    """
    Classification subsample: take a fraction of (X, y), stratified by y.
    If frac >= 1.0 or dataset is already small, return (X, y) unchanged.
    """
    n = len(X)
    if frac >= 1.0 or n <= MIN_SUBSAMPLE_SIZE:
        return X, y

    n_sub = int(n * frac)
    if n_sub < MIN_SUBSAMPLE_SIZE:
        # if frac is tiny on a small dataset, just keep all
        return X, y

    X_sub, _, y_sub, _ = train_test_split(
        X, y,
        train_size=n_sub,
        stratify=y,
        random_state=random_state,
    )
    return X_sub, y_sub


def random_subsample_xy_frac(X, y, frac=1.0, random_state=0):
    """
    Regression subsample: take a random fraction of (X, y).
    If frac >= 1.0 or dataset is already small, return (X, y) unchanged.
    """
    n = len(X)
    if frac >= 1.0 or n <= MIN_SUBSAMPLE_SIZE:
        return X, y

    n_sub = int(n * frac)
    if n_sub < MIN_SUBSAMPLE_SIZE:
        return X, y

    rng = np.random.RandomState(random_state)
    idx = rng.choice(n, size=n_sub, replace=False)

    if isinstance(X, (pd.DataFrame, pd.Series)):
        return X.iloc[idx], y.iloc[idx]
    else:
        return X[idx], y[idx]


# Model Training Functions

In [9]:
def train_global_classification_models(
    X, y,
    test_size=0.2,
    random_state=0,
    train_frac=GLOBAL_CLF_TRAIN_FRAC,
):
    # Full split first
    Xtr, Xte, ytr, yte = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    # === SUBSAMPLE TRAINING SET (stratified by y) ===
    Xtr_sub, ytr_sub = stratified_subsample_xy_frac(
        Xtr, ytr,
        frac=train_frac,
        random_state=random_state,
    )

    all_models = init_all_models()

    # Baseline (unscaled) – train on subsample, test on full test set
    all_models = fit_and_score_model(
        all_models, "Baseline",
        Xtr_sub, Xte, ytr_sub, yte
    )

    # Scaling – fit scaler on subsample, transform full test
    Xtr_s, Xte_s, scaler = standardize_data(Xtr_sub, Xte)
    all_models = fit_and_score_model(
        all_models, "Scaling",
        Xtr_s, Xte_s, ytr_sub, yte
    )

    compare_models(all_models, "Scaling")

    best_row = all_models.sort_values("Score").iloc[-1]
    best_model = best_row["Model"]

    return {
        "all_models": all_models,
        "best_model": best_model,
        "scaler": scaler,
        # store what we *actually used* for training
        "train_split": (Xtr_sub, Xte, ytr_sub, yte),
    }


global_clf_results = train_global_classification_models(X_clf, y_clf)
global_clf_results["all_models"]


: 

In [None]:
def train_cluster_classification_models(
    df,
    feature_cols,
    target_col,
    cluster_col="cluster",
    test_size=0.2,
    random_state=0,
    train_frac=CLUSTER_CLF_TRAIN_FRAC,
):
    cluster_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        y_c = df_c[target_col].astype(int)
        if y_c.nunique() < 2 or len(df_c) < 40:
            continue

        X_c = df_c[feature_cols]

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
            stratify=y_c,
        )

        # === SUBSAMPLE TRAINING SET IN THIS CLUSTER ===
        Xtr_sub, ytr_sub = stratified_subsample_xy_frac(
            Xtr, ytr,
            frac=train_frac,
            random_state=random_state,
        )

        all_models = init_all_models()

        # Baseline
        all_models = fit_and_score_model(
            all_models, "Baseline", Xtr_sub, Xte, ytr_sub, yte
        )

        # Scaling
        Xtr_s, Xte_s, scaler = standardize_data(Xtr_sub, Xte)
        all_models = fit_and_score_model(
            all_models, "Scaling", Xtr_s, Xte_s, ytr_sub, yte
        )

        best_row = all_models.sort_values("Score").iloc[-1]
        best_model = best_row["Model"]

        cluster_results[clust_id] = {
            "all_models": all_models,
            "best_model": best_model,
            "scaler": scaler,
            "train_split": (Xtr_sub, Xte, ytr_sub, yte),
        }

    return cluster_results


cluster_clf_results = train_cluster_classification_models(
    df,
    FEATURE_COLS,
    TARGET_CLF,
    cluster_col=CLUSTER_COL,
)


In [None]:
def train_global_regression_models(
    X, y,
    test_size=0.2,
    random_state=0,
    train_frac=GLOBAL_REG_TRAIN_FRAC,
    lasso_frac=LASSO_TRAIN_FRAC,
):
    # Full split
    Xtr, Xte, ytr, yte = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )

    # === SUBSAMPLE TRAINING FOR LINEAR & LASSO ===
    Xtr_sub, ytr_sub = random_subsample_xy_frac(
        Xtr, ytr,
        frac=train_frac,
        random_state=random_state,
    )

    # Plain linear regression (unscaled)
    lin = LinearRegression()
    lin.fit(Xtr_sub, ytr_sub)
    yhat_lin = lin.predict(Xte)
    lin_r2 = r2_score(yte, yhat_lin)
    lin_mse = mean_squared_error(yte, yhat_lin)

    # LassoCV (scaled) – optionally even smaller subsample
    Xtr_lasso, ytr_lasso = random_subsample_xy_frac(
        Xtr_sub, ytr_sub,
        frac=lasso_frac / max(train_frac, 1e-9) if lasso_frac < train_frac else 1.0,
        random_state=random_state,
    )

    Xtr_s, Xte_s, scaler = standardize_data(Xtr_lasso, Xte)
    lasso = LassoCV(
        cv=3,               # lighter than 5-fold
        random_state=random_state,
        n_jobs=-1,
    )
    lasso.fit(Xtr_s, ytr_lasso)
    yhat_lasso = lasso.predict(Xte_s)
    lasso_r2 = r2_score(yte, yhat_lasso)
    lasso_mse = mean_squared_error(yte, yhat_lasso)

    return {
        "linear": {
            "model": lin,
            "r2": lin_r2,
            "mse": lin_mse,
        },
        "lasso": {
            "model": lasso,
            "scaler": scaler,
            "r2": lasso_r2,
            "mse": lasso_mse,
        },
        "train_split": (Xtr_sub, Xte, ytr_sub, yte),
    }


global_reg_results = train_global_regression_models(X_reg, y_reg)
global_reg_results


In [None]:
def train_cluster_regression_models(
    df,
    feature_cols,
    target_col,
    cluster_col="cluster",
    test_size=0.2,
    random_state=0,
    min_rows=40,
    train_frac=CLUSTER_REG_TRAIN_FRAC,
    lasso_frac=LASSO_TRAIN_FRAC,
):
    cluster_reg_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        if len(df_c) < min_rows:
            continue

        X_c = df_c[feature_cols]
        y_c = df_c[target_col].astype(float)

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
        )

        # === SUBSAMPLE TRAINING FOR THIS CLUSTER ===
        Xtr_sub, ytr_sub = random_subsample_xy_frac(
            Xtr, ytr,
            frac=train_frac,
            random_state=random_state,
        )

        # Linear
        lin = LinearRegression()
        lin.fit(Xtr_sub, ytr_sub)
        yhat_lin = lin.predict(Xte)
        lin_r2 = r2_score(yte, yhat_lin)
        lin_mse = mean_squared_error(yte, yhat_lin)

        # Lasso
        Xtr_lasso, ytr_lasso = random_subsample_xy_frac(
            Xtr_sub, ytr_sub,
            frac=lasso_frac / max(train_frac, 1e-9) if lasso_frac < train_frac else 1.0,
            random_state=random_state,
        )

        Xtr_s, Xte_s, scaler = standardize_data(Xtr_lasso, Xte)
        lasso = LassoCV(
            cv=3,
            random_state=random_state,
            n_jobs=-1,
        )
        lasso.fit(Xtr_s, ytr_lasso)
        yhat_lasso = lasso.predict(Xte_s)
        lasso_r2 = r2_score(yte, yhat_lasso)
        lasso_mse = mean_squared_error(yte, yhat_lasso)

        cluster_reg_results[clust_id] = {
            "linear": {
                "model": lin,
                "r2": lin_r2,
                "mse": lin_mse,
            },
            "lasso": {
                "model": lasso,
                "scaler": scaler,
                "r2": lasso_r2,
                "mse": lasso_mse,
            },
            "train_split": (Xtr_sub, Xte, ytr_sub, yte),
        }

    return cluster_reg_results


cluster_reg_results = train_cluster_regression_models(
    df,
    FEATURE_COLS,
    TARGET_REG,
    cluster_col=CLUSTER_COL,
)

## Not on a subset of data

In [6]:
def train_global_classification_models(X, y, test_size=0.2, random_state=0):
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    all_models = init_all_models()

    # Baseline
    all_models = fit_and_score_model(
        all_models, "Baseline", Xtr, Xte, ytr, yte
    )

    # Scaling
    Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
    all_models = fit_and_score_model(
        all_models, "Scaling", Xtr_s, Xte_s, ytr, yte
    )

    compare_models(all_models, "Scaling")

    best_row = all_models.sort_values("Score").iloc[-1]
    best_model = best_row["Model"]

    return {
        "all_models": all_models,
        "best_model": best_model,
        "scaler": scaler,
        "train_split": (Xtr, Xte, ytr, yte),
    }


global_clf_results = train_global_classification_models(X_clf, y_clf)
global_clf_results["all_models"]


KeyboardInterrupt: 

In [None]:
def train_cluster_classification_models(df,
                                        feature_cols,
                                        target_col,
                                        cluster_col="cluster",
                                        test_size=0.2,
                                        random_state=0):
    cluster_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        y_c = df_c[target_col].astype(int)
        if y_c.nunique() < 2 or len(df_c) < 40:
            continue

        X_c = df_c[feature_cols]

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
            stratify=y_c,
        )

        all_models = init_all_models()

        all_models = fit_and_score_model(
            all_models, "Baseline", Xtr, Xte, ytr, yte
        )

        Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
        all_models = fit_and_score_model(
            all_models, "Scaling", Xtr_s, Xte_s, ytr, yte
        )

        best_row = all_models.sort_values("Score").iloc[-1]
        best_model = best_row["Model"]

        cluster_results[clust_id] = {
            "all_models": all_models,
            "best_model": best_model,
            "scaler": scaler,
        }

    return cluster_results


cluster_clf_results = train_cluster_classification_models(
    df,
    FEATURE_COLS,
    TARGET_CLF,
    cluster_col=CLUSTER_COL,
)


In [None]:
def train_global_regression_models(X, y, test_size=0.2, random_state=0):
    Xtr, Xte, ytr, yte = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Plain linear regression (unscaled)
    lin = LinearRegression()
    lin.fit(Xtr, ytr)
    yhat_lin = lin.predict(Xte)
    lin_r2 = r2_score(yte, yhat_lin)
    lin_mse = mean_squared_error(yte, yhat_lin)

    # LassoCV (scaled)
    Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
    lasso = LassoCV(cv=5, random_state=random_state)
    lasso.fit(Xtr_s, ytr)
    yhat_lasso = lasso.predict(Xte_s)
    lasso_r2 = r2_score(yte, yhat_lasso)
    lasso_mse = mean_squared_error(yte, yhat_lasso)

    return {
        "linear": {
            "model": lin,
            "r2": lin_r2,
            "mse": lin_mse,
        },
        "lasso": {
            "model": lasso,
            "scaler": scaler,
            "r2": lasso_r2,
            "mse": lasso_mse,
        },
        "train_split": (Xtr, Xte, ytr, yte),
    }


global_reg_results = train_global_regression_models(X_reg, y_reg)
global_reg_results


{'linear': {'model': LinearRegression(),
  'r2': 0.004382212091354809,
  'mse': 3031.940064178128},
 'lasso': {'model': LassoCV(cv=5, random_state=0),
  'scaler': StandardScaler(),
  'r2': 0.004382149319585027,
  'mse': 3031.9402553360665},
 'train_split': (         airline_bucket  origin_bucket  destination_bucket  lagged_delay_flag  \
  2840684               1              2                   3                  0   
  2506158               1              2                   4                  0   
  1795334               1              4                   2                  0   
  930487                1              3                   1                  0   
  749653                1              4                   1                  0   
  ...                 ...            ...                 ...                ...   
  2249467               1              1                   2                  0   
  5157699               1              2                   4                  0 

In [14]:
def train_cluster_regression_models(df,
                                    feature_cols,
                                    target_col,
                                    cluster_col="cluster",
                                    test_size=0.2,
                                    random_state=0,
                                    min_rows=40):
    cluster_reg_results = {}

    for clust_id, df_c in df.groupby(cluster_col):
        if len(df_c) < min_rows:
            continue

        X_c = df_c[feature_cols]
        y_c = df_c[target_col].astype(float)

        Xtr, Xte, ytr, yte = train_test_split(
            X_c, y_c,
            test_size=test_size,
            random_state=random_state,
        )

        # linear
        lin = LinearRegression()
        lin.fit(Xtr, ytr)
        yhat_lin = lin.predict(Xte)
        lin_r2 = r2_score(yte, yhat_lin)
        lin_mse = mean_squared_error(yte, yhat_lin)

        # lasso (scaled)
        Xtr_s, Xte_s, scaler = standardize_data(Xtr, Xte)
        lasso = LassoCV(cv=5, random_state=random_state)
        lasso.fit(Xtr_s, ytr)
        yhat_lasso = lasso.predict(Xte_s)
        lasso_r2 = r2_score(yte, yhat_lasso)
        lasso_mse = mean_squared_error(yte, yhat_lasso)

        cluster_reg_results[clust_id] = {
            "linear": {
                "model": lin,
                "r2": lin_r2,
                "mse": lin_mse,
            },
            "lasso": {
                "model": lasso,
                "scaler": scaler,
                "r2": lasso_r2,
                "mse": lasso_mse,
            },
        }

    return cluster_reg_results


cluster_reg_results = train_cluster_regression_models(
    df,
    FEATURE_COLS,
    TARGET_REG,
    cluster_col=CLUSTER_COL,
)


# Cluster Experiments

In [16]:
def assign_clusters(df, k, cluster_features, cluster_col_name):
    X_cluster = df[cluster_features].copy()

    kmeans = KMeans(n_clusters=k, random_state=0)
    labels = kmeans.fit_predict(X_cluster)

    df = df.copy()
    df[cluster_col_name] = labels
    return df

In [None]:
CLUSTER_EXPERIMENTS = [
    {
        "name": "k3_all_feats",
        "k": 3,
        "cluster_features": FEATURE_COLS,      # use all current features
    },
    {
        "name": "k4_all_feats",
        "k": 4,
        "cluster_features": FEATURE_COLS,
    },
    {
        "name": "k5_all_feats",
        "k": 5,
        "cluster_features": FEATURE_COLS,
    },
    # add more as needed
]

In [18]:
all_cluster_clf_results = {}
all_cluster_reg_results = {}

for cfg in CLUSTER_EXPERIMENTS:
    exp_name = cfg["name"]
    k = cfg["k"]
    cluster_features = cfg["cluster_features"]

    # 1) assign cluster labels for this experiment
    cluster_col = f"cluster_{exp_name}"
    df_with_clusters = assign_clusters(df, k, cluster_features, cluster_col)

    # 2) train cluster-specific classification models
    cluster_clf_results = train_cluster_classification_models(
        df_with_clusters,
        FEATURE_COLS,       # you can choose to change this too, if needed
        TARGET_CLF,
        cluster_col=cluster_col,
    )

    # 3) train cluster-specific regression models
    cluster_reg_results = train_cluster_regression_models(
        df_with_clusters,
        FEATURE_COLS,
        TARGET_REG,
        cluster_col=cluster_col,
    )

    # 4) store results keyed by experiment name
    all_cluster_clf_results[exp_name] = cluster_clf_results
    all_cluster_reg_results[exp_name] = cluster_reg_results


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

# Gathering Summary Results

In [None]:
summary_rows = []
#global classification
global_df = global_clf_results["all_models"].copy()
global_df = global_df.reset_index()
for _, row in global_df.iterrows():
    summary_rows.append({
        "Level": "Global",
        "Cluster": "-",
        "ClusterExp": "-",  
        "Task": "Classification",
        "Model": row["model"],
        "Technique": row["technique"],
        "Precision": row["Precision"],
        "Recall": row["Recall"],
        "Score": row["Score"],
        "R2": None,
        "MSE": None
    })


# ==========================================================
# CLUSTER CLASSIFICATION (MULTIPLE EXPERIMENTS)
# ==========================================================
for clust_id, res in cluster_clf_results.items():
    df_m = res["all_models"].reset_index()  # bring model, technique out of index

    for _, row in df_m.iterrows():
        summary_rows.append({
            "Level": "Cluster",
            "Cluster": clust_id,
            "ClusterExp": exp_name,
            "Task": "Classification",
            "Model": row["model"],          # now from columns
            "Technique": row["technique"],  # Baseline / Scaling
            "Precision": row["Precision"],
            "Recall": row["Recall"],
            "Score": row["Score"],
            "R2": None,
            "MSE": None,
        })

# --------------------------
# GLOBAL REGRESSION
# --------------------------
# linear
summary_rows.append({
    "Level": "Global",
    "Cluster": "-",
    "ClusterExp": "-",
    "Task": "Regression",
    "Model": "LinearRegression",
    "Technique": "Baseline",
    "Precision": None,
    "Recall": None,
    "Score": None,
    "R2": global_reg_results["linear"]["r2"],
    "MSE": global_reg_results["linear"]["mse"]
})

# lasso
summary_rows.append({
    "Level": "Global",
    "Cluster": "-",
    "ClusterExp": "-",
    "Task": "Regression",
    "Model": "LassoCV",
    "Technique": "Scaled",
    "Precision": None,
    "Recall": None,
    "Score": None,
    "R2": global_reg_results["lasso"]["r2"],
    "MSE": global_reg_results["lasso"]["mse"]
})


# ==========================================================
# CLUSTER REGRESSION (MULTIPLE EXPERIMENTS)
# ==========================================================
for exp_name, cluster_reg_results in all_cluster_reg_results.items():
    for clust_id, res in cluster_reg_results.items():

        # linear
        summary_rows.append({
            "Level": "Cluster",
            "Cluster": clust_id,
            "ClusterExp": exp_name, 
            "Task": "Regression",
            "Model": "LinearRegression",
            "Technique": "Baseline",
            "Precision": None,
            "Recall": None,
            "Score": None,
            "R2": res["linear"]["r2"],
            "MSE": res["linear"]["mse"]
        })

        # lasso
        summary_rows.append({
            "Level": "Cluster",
            "Cluster": clust_id,
            "ClusterExp": exp_name,
            "Task": "Regression",
            "Model": "LassoCV",
            "Technique": "Scaled",
            "Precision": None,
            "Recall": None,
            "Score": None,
            "R2": res["lasso"]["r2"],
            "MSE": res["lasso"]["mse"]
        })


# --------------------------
# BUILD FINAL SUMMARY TABLE
# --------------------------
summary_table = pd.DataFrame(summary_rows)

# sorting for readability
summary_table = summary_table.sort_values(
    by=["Task", "ClusterExp", "Level", "Cluster", "Model"]
).reset_index(drop=True)
summary_table

Unnamed: 0,Level,Cluster,ClusterExp,Task,Model,Technique,Precision,Recall,Score,R2,MSE
0,Global,-,-,Classification,CART,Baseline,0.259102,0.444737,0.351919,,
1,Global,-,-,Classification,CART,Scaling,0.259099,0.444734,0.351916,,
2,Global,-,-,Classification,LR_L1,Baseline,0.239996,0.513178,0.376587,,
3,Global,-,-,Classification,LR_L1,Scaling,0.239987,0.512857,0.376422,,
4,Global,-,-,Classification,LR_L2,Baseline,0.311712,0.002857,0.157284,,
5,Global,-,-,Classification,LR_L2,Scaling,0.311847,0.002861,0.157354,,
6,Global,-,-,Classification,RF,Baseline,0.258949,0.442471,0.35071,,
7,Global,-,-,Classification,RF,Scaling,0.258941,0.442458,0.350699,,
8,Cluster,0,k3_all_feats,Classification,LR_L1,Baseline,0.359075,0.604621,0.481848,,
9,Cluster,1,k3_all_feats,Classification,LR_L1,Scaling,0.240317,0.447639,0.343978,,


# More complex approaches

## Helper functions that draw a subset of the total dataset 

In [4]:
def stratified_subsample(X, y, max_samples=200_000, random_state=0):
    """
    For classification: draw a stratified subsample of size max_samples.
    If dataset is smaller than max_samples, return as-is.
    """
    n = len(X)
    if n <= max_samples:
        return X, y

    X_sub, _, y_sub, _ = train_test_split(
        X, y,
        train_size=max_samples,
        stratify=y,
        random_state=random_state,
    )
    return X_sub, y_sub


def random_subsample(X, y, max_samples=200_000, random_state=0):
    """
    For regression: draw a random subsample of size max_samples.
    If dataset is smaller than max_samples, return as-is.
    """
    n = len(X)
    if n <= max_samples:
        return X, y

    rng = np.random.RandomState(random_state)
    idx = rng.choice(n, size=max_samples, replace=False)
    if isinstance(X, pd.DataFrame) or isinstance(X, pd.Series):
        return X.iloc[idx], y.iloc[idx]
    else:
        return X[idx], y[idx]

## HistGradientBoostingClassifier

In [None]:
def train_hgb_classifier_on_split(
    Xtr, Xte, ytr, yte,
    max_train_samples=200_000,
    random_state=0
):
    """
    Train a HistGradientBoostingClassifier on a (possibly) subsampled training split.
    Evaluate on the full test split.
    """
    # Subsample training data (stratified)
    Xtr_sub, ytr_sub = stratified_subsample(
        Xtr, ytr,
        max_samples=max_train_samples,
        random_state=random_state,
    )

    hgb_clf = HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_iter=200,
        max_leaf_nodes=31,
        l2_regularization=1.0,
        random_state=random_state,
    )

    hgb_clf.fit(Xtr_sub, ytr_sub)
    y_pred = hgb_clf.predict(Xte)

    p = precision_score(yte, y_pred)
    r = recall_score(yte, y_pred)
    s = 0.5 * (p + r)

    results = {
        "model": hgb_clf,
        "precision": p,
        "recall": r,
        "score": s,
        "n_train_used": len(Xtr_sub),
    }
    return results


hgb_clf_results = train_hgb_classifier_on_split(
    Xtr_clf, Xte_clf, ytr_clf, yte_clf,
    max_train_samples=200_000,
    random_state=0,
)
print("HGB classifier – Precision: {:.3f}, Recall: {:.3f}, Score: {:.3f}".format(
    hgb_clf_results["precision"],
    hgb_clf_results["recall"],
    hgb_clf_results["score"],
))
