In [3]:
# -*- coding: utf-8 -*-
import os
import math
import json
import numpy as np
import pandas as pd

from typing import List, Dict
from datetime import datetime

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.base import clone

# Optional: LightGBM (fast, handles NaN)
import lightgbm as lgb
import joblib


# -----------------------------
# Config
# -----------------------------
DATA_PATH = "data/train/train.csv"
OUT_DIR   = "./models/cbb_models"
os.makedirs(OUT_DIR, exist_ok=True)

TARGETS = [
    'home_1h', 'away_1h', 'home_2h', 'away_2h',
    'home_score', 'away_score', 'home_margin', 'away_margin'
]

ID_LIKE = ['Unnamed: 0', 'game_id', 'home', 'away']  # drop from features


# -----------------------------
# Helpers
# -----------------------------
def safe_mape(y_true, y_pred, eps=1e-6):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    denom = np.clip(np.abs(y_true), eps, None)
    return np.mean(np.abs((y_true - y_pred) / denom))


def time_folds(df_sorted_dates: pd.DataFrame, k: int = 5, date_col: str = "date_utc"):
    """
    Expanding time-based CV with non-empty splits:
      - Sort by date
      - Create k contiguous validation windows
      - Fold i: train = [min_date ... cut[i-1]], valid = (cut[i-1] ... cut[i]]
    Skips folds with empty train/valid.
    Returns: list of (train_idx, valid_idx)
    """
    if date_col not in df_sorted_dates.columns:
        raise ValueError(f"Missing {date_col} for time-based CV")

    dfx = df_sorted_dates[df_sorted_dates[date_col].notna()].copy()
    dfx = dfx.sort_values(date_col).reset_index(drop=True)
    if len(dfx) < k + 2:
        # not enough rows for k folds; fall back to a single 80/20 split
        cut = int(len(dfx) * 0.8)
        return [(dfx.index[:cut], dfx.index[cut:])], dfx.index

    # k+1 cut points from 0..1 (inclusive); use quantiles to respect time density
    qs = np.linspace(0.0, 1.0, k + 1)
    cuts = [dfx[date_col].quantile(q) for q in qs]

    folds = []
    for i in range(1, len(cuts)):
        left_cut = cuts[i - 1]
        right_cut = cuts[i]

        # train: <= left_cut, valid: (left_cut, right_cut]
        train_mask = dfx[date_col] <= left_cut
        valid_mask = (dfx[date_col] > left_cut) & (dfx[date_col] <= right_cut)

        tr_idx = dfx.index[train_mask]
        va_idx = dfx.index[valid_mask]

        # skip degenerate folds
        if len(tr_idx) == 0 or len(va_idx) == 0:
            continue

        folds.append((tr_idx, va_idx))

    # if all folds degenerate, fallback to 80/20
    if not folds:
        cut = int(len(dfx) * 0.8)
        folds = [(dfx.index[:cut], dfx.index[cut:])]

    return folds, dfx.index



def pick_feature_columns(df: pd.DataFrame, targets: List[str], drop_ids: List[str]) -> List[str]:
    drop = set(targets + drop_ids)
    feats = [c for c in df.columns if c not in drop and np.issubdtype(df[c].dtype, np.number)]
    return feats


def fit_and_eval_one_target(
    df: pd.DataFrame,
    features: List[str],
    target: str,
    date_col: str = "date_utc",
    k_folds: int = 5
) -> pd.DataFrame:
    df_local = df.copy()

    # Sort by date (fall back to 'date' if needed)
    if date_col not in df_local.columns or df_local[date_col].isna().all():
        date_col = "date" if "date" in df_local.columns else date_col
    if date_col in df_local.columns:
        df_local = df_local.sort_values(date_col).reset_index(drop=True)

    # Build folds on rows that have a date; map back to df_local indices
    folds, dfx_idx = time_folds(df_local, k=k_folds, date_col=date_col)
    # translate fold indices into df_local indices
    idx_map = {i:i for i in dfx_idx}  # identity; dfx is a filtered/sorted view

    # Models (choose NaN-friendly models)
    model_specs = {
        "LightGBM": lgb.LGBMRegressor(
            n_estimators=700,          # good default; early stopping on val
            learning_rate=0.05,
            num_leaves=63,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_samples=20,
            reg_alpha=0.0,
            reg_lambda=0.0,
            random_state=42,
            n_jobs=-1
        ),
        "HistGB": HistGradientBoostingRegressor(
            loss="squared_error",
            learning_rate=0.07,
            max_leaf_nodes=63,
            min_samples_leaf=20,
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=50,
            random_state=42
        )
    }

    rows = []
    best_models: Dict[str, object] = {}

    for model_name, model in model_specs.items():
        fold_metrics = []
        fold_num = 0
        # Keep model trained on the LAST fold for quick deployment; you can refit on full train later
        last_fit_model = None

        for tr_idx, va_idx in folds:
            fold_num += 1
            X_tr = df_local.loc[tr_idx, features]
            y_tr = df_local.loc[tr_idx, target]
            X_va = df_local.loc[va_idx, features]
            y_va = df_local.loc[va_idx, target]

            mdl = clone(model)

            if model_name == "LightGBM":
                mdl.fit(
                    X_tr, y_tr,
                    eval_set=[(X_va, y_va)],
                    eval_metric="l2",
                    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
                )
            else:
                mdl.fit(X_tr, y_tr)

            y_hat = mdl.predict(X_va)
            rmse = mean_squared_error(y_va, y_hat, squared=False)
            mae = mean_absolute_error(y_va, y_hat)
            medae = median_absolute_error(y_va, y_hat)
            r2 = r2_score(y_va, y_hat) if len(np.unique(y_va)) > 1 else np.nan
            mape = safe_mape(y_va, y_hat)

            fold_metrics.append((rmse, mae, medae, r2, mape))
            last_fit_model = mdl

        # Aggregate
        rmses, maes, medaes, r2s, mapes = zip(*fold_metrics)
        row = {
            "target": target,
            "model": model_name,
            "folds": len(fold_metrics),
            "RMSE_mean": float(np.mean(rmses)),
            "RMSE_std":  float(np.std(rmses)),
            "MAE_mean":  float(np.mean(maes)),
            "MAE_std":   float(np.std(maes)),
            "MedAE_mean":float(np.mean(medaes)),
            "R2_mean":   float(np.nanmean(r2s)),
            "MAPE_mean": float(np.mean(mapes)),
        }
        rows.append(row)

        # Save last fold model
        best_models[model_name] = last_fit_model

        # Save feature importances for LightGBM
        if model_name == "LightGBM":
            imp = pd.Series(last_fit_model.feature_importances_, index=features).sort_values(ascending=False)
            imp.head(50).to_csv(os.path.join(OUT_DIR, f"feat_importance_{model_name}_{target}.csv"))

    # Persist models for this target
    for name, mdl in best_models.items():
        joblib.dump(mdl, os.path.join(OUT_DIR, f"model_{name}_{target}.joblib"))

    return pd.DataFrame(rows)


# -----------------------------
# Main
# -----------------------------
df = pd.read_csv(DATA_PATH)

# Parse dates for time-aware CV
for c in ["date_utc", "date"]:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")

# Build feature list (numeric only, exclude IDs & targets)
feature_cols = pick_feature_columns(df, TARGETS, ID_LIKE)
print(f"{len(feature_cols)} numeric feature columns selected.")

# Train/evaluate per target
all_results = []
for tgt in TARGETS:
    if tgt not in df.columns:
        print(f"[WARN] target {tgt} not in data; skipping.")
        continue
    print(f"\n=== Training for target: {tgt} ===")
    res = fit_and_eval_one_target(df, feature_cols, tgt, date_col="date_utc", k_folds=5)
    all_results.append(res)

results_df = pd.concat(all_results, ignore_index=True).sort_values(["target","RMSE_mean"]).reset_index(drop=True)

# Save metrics report
results_path = os.path.join(OUT_DIR, "cv_metrics_report.csv")
results_df.to_csv(results_path, index=False)
print("\nSaved metrics report to:", results_path)

# Show top rows
print(results_df.head(20).to_string(index=False))


504 numeric feature columns selected.

=== Training for target: home_1h ===
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 37, number of used features: 0
[LightGBM] [Info] Start training from score 38.189189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009566 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 81826
[LightGBM] [Info] Number of data points in the train set: 4688, number of used features: 503
[LightGBM] [Info] Start training from score 34.505546
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 83702
[LightGBM] [Info] Number of data points in the train set: 9259, number of used features: 503
[LightGBM] [Info] Start training from score 34.283292
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the 