In [None]:
"""
TRAINING-ONLY: Robust, time-aware LightGBM + Optuna pipeline for multiple targets.

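Outputs per target (written to training_reports/<target>/):
- tuning_summary.json: best params, best CV MAE (Optuna)
- cv_metrics.csv: per-fold metrics (MAE, RMSE, R2, MAPE, MedAE)
- cv_metrics_mean.json: mean/std of metrics across folds
- timings.json: timing for tuning and final CV runs (seconds)
- feature_importance.csv: LightGBM gain-based importances on final refit (optional)
- <target>_final_model.joblib: fitted preprocessor + final model, saved for inference

Outputs shared by all targets (written to training_reports/):
- optuna_study.sqlite (optional): full Optuna study (set SAVE_STUDY=True)
- overall_summary.csv: one row per target with tuning and CV summary statistics

Requires:
  pip install pandas numpy scikit-learn lightgbm optuna joblib
  (reading the training CSV from an s3:// URL with pandas additionally needs s3fs)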
"""

import os
import time
import json
import warnings
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    mean_absolute_error as MAE,
    mean_squared_error as MSE,
    r2_score as R2,
    median_absolute_error as MedAE,
)
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb

# Silence every UserWarning for the whole process (applies to all libraries).
warnings.filterwarnings("ignore", category=UserWarning)

# =====================
# CONFIG
# =====================
DATE_COL = "date_utc"  # used ONLY for time ordering / CV, not as a feature
# Raw time/grouping columns that are always removed from the feature set.
RAW_TIME_COLS_TO_EXCLUDE = [
    DATE_COL, "season", "game_datetime_utc", "game_datetime_local",
    "date_home", "date_away"  # in case daily ratings were merged earlier
]
# Identifier columns; removed from the feature set.
ID_COLS = ["game_id", "home_team_id", "away_team_id"]
# Columns that are one-hot encoded (when present in the frame).
CATEGORICAL_COLS = ["conf_home", "conf_away"]
# One model is tuned and trained per entry; every entry is also excluded
# from the features of every other target.
TARGETS = [
    "home_1h","away_1h","home_2h","away_2h","home_score","away_score",
    "total","1h_total","2h_total","margin","1h_margin","2h_margin"
]

# CV / tuning knobs
TSCV_SPLITS = 5        # n_splits passed to TimeSeriesSplit
OPTUNA_TRIALS = 30     # raise for stronger tuning
RANDOM_STATE = 42      # LightGBM random_state
USE_GPU = False        # True if LightGBM GPU build installed
NUM_BOOST_ROUND = 20000      # n_estimators upper bound for LightGBM fits
EARLY_STOPPING_ROUNDS = 200  # patience passed to lgb.early_stopping
N_JOBS = -1            # n_jobs passed to LightGBM during tuning

# Outputs
OUT_DIR = "training_reports"  # root directory for all reports and artifacts
SAVE_STUDY = True             # persist the Optuna study to SQLite under OUT_DIR
# NOTE: the output directory is created as a side effect of running this cell/module.
os.makedirs(OUT_DIR, exist_ok=True)


# =====================
# HELPERS (SAFE BY DESIGN)
# =====================
def _parse_dates(df: pd.DataFrame, date_col: str = DATE_COL) -> pd.DataFrame:
    """Return a copy of ``df`` whose ``date_col`` is parsed to UTC datetimes.

    Values that cannot be parsed are coerced to ``NaT`` and their rows are
    dropped. The caller's frame is left untouched.
    """
    parsed = df.copy()
    parsed[date_col] = pd.to_datetime(parsed[date_col], errors="coerce", utc=True)
    has_valid_date = parsed[date_col].notna()
    return parsed.loc[has_valid_date].copy()

def _make_ohe():
    """Build a sparse-output OneHotEncoder that ignores unseen categories.

    Newer scikit-learn releases take ``sparse_output`` (added in 1.2); older
    ones only know ``sparse`` (removed in 1.4). The new keyword is tried first
    and the legacy one is used when the constructor rejects it.
    """
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=True, **shared_kwargs)
    except TypeError:
        # Constructor does not accept `sparse_output`: legacy scikit-learn.
        encoder = OneHotEncoder(sparse=True, **shared_kwargs)
    return encoder

def _build_feature_frame(df: pd.DataFrame, target: str) -> pd.DataFrame:
    """
    Select the modelling columns for a single target.

    The returned copy holds every column of ``df`` that is not an ID column,
    not one of the TARGETS and not a raw time/grouping column, followed by
    ``target`` and ``DATE_COL`` (the latter is kept only so the caller can
    order rows for time-aware CV).
    """
    blocked = {*ID_COLS, *TARGETS, *RAW_TIME_COLS_TO_EXCLUDE}
    feature_cols = [col for col in df.columns if col not in blocked]

    # Belt and braces: a target that is not listed in TARGETS must still
    # never be used as its own feature.
    try:
        feature_cols.remove(target)
    except ValueError:
        pass

    selected = [*feature_cols, target, DATE_COL]
    return df[selected].copy()

def _preprocessor_for_df(X: pd.DataFrame) -> ColumnTransformer:
    """
    Create an (unfitted) ColumnTransformer tailored to the dtypes of ``X``.

      - "num": median imputation for every numeric-dtype column that is not a
        declared categorical column
      - "cat": one-hot encoding for the CATEGORICAL_COLS present in ``X``
      - every other column (datetime, free-form object, ...) is dropped, so
        no Timestamp column can reach the model

    Numeric columns are listed in the order returned by ``Index.difference``,
    which sorts the labels when it can.
    """
    categorical = [name for name in CATEGORICAL_COLS if name in X.columns]
    numeric_index = X.select_dtypes(include=[np.number]).columns
    numeric = numeric_index.difference(categorical).tolist()

    steps = [
        ("num", SimpleImputer(strategy="median"), numeric),
        ("cat", _make_ohe(), categorical),
    ]
    transformer = ColumnTransformer(
        transformers=steps,
        remainder="drop",
        verbose_feature_names_out=False,
    )
    transformer.set_output(transform="default")
    return transformer

def _suggest_params(trial: optuna.Trial) -> Dict:
    """Sample one LightGBM configuration from ``trial``.

    The returned dict holds the tuned hyperparameters followed by the fixed
    settings (``n_jobs``, MAE objective, ``random_state`` and, when USE_GPU is
    set, ``device_type="gpu"``).
    """
    # Search space. The order of the suggest calls is kept stable on purpose.
    tuned = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 300, log=True),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
    }

    # Settings that are not tuned.
    fixed = {
        "n_jobs": N_JOBS,
        "objective": "mae",
        "random_state": RANDOM_STATE,
    }
    if USE_GPU:
        fixed["device_type"] = "gpu"   # lightgbm>=4

    return {**tuned, **fixed}

def _fold_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str,float]:
    """Compute the regression metrics reported for one validation fold.

    Returns a dict with keys ``MAE``, ``RMSE``, ``R2``, ``MAPE`` and ``MedAE``
    (all plain floats). MAPE is returned as a fraction, not a percentage.
    """
    # The denominator is floored at 1e-8 so zero targets do not divide by
    # zero; NOTE that such rows still yield enormous ratios and can dominate
    # the mean (e.g. for targets that are frequently exactly 0).
    safe_denominator = np.clip(np.abs(y_true), 1e-8, None)
    abs_pct_error = np.abs((y_true - y_pred) / safe_denominator)

    return {
        "MAE": float(MAE(y_true, y_pred)),
        "RMSE": float(np.sqrt(MSE(y_true, y_pred))),
        "R2": float(R2(y_true, y_pred)),
        "MAPE": float(np.mean(abs_pct_error)),
        "MedAE": float(MedAE(y_true, y_pred)),
    }

def _summarize_metrics(rows: List[Dict[str,float]]) -> Dict[str,Dict[str,float]]:
    keys = rows[0].keys()
    arr = {k: np.array([r[k] for r in rows], dtype=float) for k in keys}
    return {k: {"mean": float(arr[k].mean()), "std": float(arr[k].std(ddof=1) if len(arr[k])>1 else 0.0)} for k in keys}


# =====================
# CORE TRAINING LOGIC
# =====================
def tune_with_optuna(X: pd.DataFrame, y: pd.Series, dates: pd.Series, study_name: "str | None" = None) -> Tuple[Dict, float, float]:
    """Tune LightGBM hyperparameters with time-aware CV (objective: mean fold MAE).

    Args:
        X: feature frame (rows aligned with ``y`` and ``dates``).
        y: target values.
        dates: per-row timestamps, used only to order rows chronologically.
        study_name: name of the Optuna study. Defaults to
            ``"cbb_study_<y.name>"`` (or ``"cbb_study"`` when ``y`` is
            unnamed) so that each target gets its own study inside the shared
            SQLite storage. Sharing one study across targets would mix trials
            scored on different targets and corrupt ``best_params``.

    Returns:
        ``(best_params, best_score, elapsed_sec)`` where ``best_params`` holds
        only the sampled hyperparameters (not the fixed settings added by
        ``_suggest_params``).

    Notes:
        - When SAVE_STUDY is set the study is loaded if it already exists, so
          re-running resumes it; trials from earlier runs (possibly on older
          data) remain eligible as "best". Delete the SQLite file or pass a
          fresh ``study_name`` to start over.
        - Each validation fold is used both for early stopping and for
          scoring, so the reported MAE is somewhat optimistic.
    """
    order = np.argsort(dates.values)
    X, y = X.iloc[order], y.iloc[order]

    t0 = time.time()
    tscv = TimeSeriesSplit(n_splits=TSCV_SPLITS)

    # The preprocessing does not depend on the trial's hyperparameters, so
    # fit/transform every fold once up front instead of once per trial.
    folds = []
    for tr_idx, va_idx in tscv.split(X):
        Xtr, Xva = X.iloc[tr_idx], X.iloc[va_idx]
        pre = _preprocessor_for_df(Xtr)
        Xtr_t = pre.fit_transform(Xtr)
        Xva_t = pre.transform(Xva)
        folds.append((Xtr_t, y.iloc[tr_idx], Xva_t, y.iloc[va_idx]))

    def objective(trial: optuna.Trial) -> float:
        params = _suggest_params(trial)
        fold_maes = []

        for Xtr_t, ytr, Xva_t, yva in folds:
            model = lgb.LGBMRegressor(**params, n_estimators=NUM_BOOST_ROUND)
            model.fit(
                Xtr_t, ytr,
                eval_set=[(Xva_t, yva)],
                callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)],
            )
            fold_maes.append(MAE(yva, model.predict(Xva_t)))

        return float(np.mean(fold_maes))

    if study_name is None:
        study_name = "cbb_study" if y.name is None else f"cbb_study_{y.name}"

    # Seed the sampler so repeated runs explore the same configurations.
    sampler = optuna.samplers.TPESampler(seed=RANDOM_STATE)

    if SAVE_STUDY:
        study = optuna.create_study(
            direction="minimize",
            sampler=sampler,
            storage=f"sqlite:///{os.path.join(OUT_DIR, 'optuna_study.sqlite')}",
            study_name=study_name,
            load_if_exists=True
        )
    else:
        study = optuna.create_study(direction="minimize", sampler=sampler)

    study.optimize(objective, n_trials=OPTUNA_TRIALS, show_progress_bar=False)
    elapsed = time.time() - t0
    return study.best_params, float(study.best_value), float(elapsed)


def crossval_report(X: pd.DataFrame, y: pd.Series, dates: pd.Series, best_params: Dict) -> Tuple[pd.DataFrame, Dict, float]:
    """Run time-aware CV with ``best_params`` and collect detailed metrics.

    Args:
        X: feature frame (rows aligned with ``y`` and ``dates``).
        y: target values.
        dates: per-row timestamps, used only to order rows chronologically.
        best_params: tuned LightGBM hyperparameters. The fixed settings
            (``n_estimators``, MAE objective, ``random_state``, ``n_jobs`` and
            the GPU device when USE_GPU is set) are applied on top and take
            precedence, mirroring the configuration used during tuning.

    Returns:
        ``(per_fold_df, summary, elapsed_sec)``: one row per fold with the
        metrics, best iteration and fold sizes; the mean/std of each metric;
        and the wall-clock time in seconds.
    """
    order = np.argsort(dates.values)
    X, y = X.iloc[order], y.iloc[order]

    t0 = time.time()
    tscv = TimeSeriesSplit(n_splits=TSCV_SPLITS)

    # Merge in a dict rather than via keyword arguments: passing e.g.
    # objective= next to **best_params raises TypeError if the caller's dict
    # already contains that key.
    model_params = {
        **best_params,
        "n_estimators": NUM_BOOST_ROUND,
        "objective": "mae",
        "random_state": RANDOM_STATE,
        "n_jobs": N_JOBS,
    }
    if USE_GPU:
        model_params["device_type"] = "gpu"   # lightgbm>=4

    fold_rows = []
    for fold, (tr_idx, va_idx) in enumerate(tscv.split(X), start=1):
        Xtr, Xva = X.iloc[tr_idx], X.iloc[va_idx]
        ytr, yva = y.iloc[tr_idx], y.iloc[va_idx]

        # Fit the preprocessing on the training part only to avoid leakage.
        pre = _preprocessor_for_df(Xtr)
        Xtr_t = pre.fit_transform(Xtr)
        Xva_t = pre.transform(Xva)

        model = lgb.LGBMRegressor(**model_params)
        model.fit(
            Xtr_t, ytr,
            eval_set=[(Xva_t, yva)],
            callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS, verbose=False)],
        )
        ypred = model.predict(Xva_t)

        best_iter = getattr(model, "best_iteration_", None)
        if best_iter is None:
            # No early-stopping information available: all rounds were used.
            best_iter = model.n_estimators

        fold_rows.append({
            "fold": fold,
            **_fold_metrics(yva.values, ypred),
            "best_iteration": int(best_iter),
            "n_samples_train": int(len(tr_idx)),
            "n_samples_val": int(len(va_idx)),
        })

    per_fold = pd.DataFrame(fold_rows)
    summary = _summarize_metrics(per_fold[["MAE","RMSE","R2","MAPE","MedAE"]].to_dict(orient="records"))
    elapsed = float(time.time() - t0)
    return per_fold, summary, elapsed


def _write_json(path: str, payload: Dict) -> None:
    """Write ``payload`` to ``path`` as indented JSON."""
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)


def _save_feature_importance(pre: ColumnTransformer, model: lgb.LGBMRegressor, out_dir: str) -> None:
    """Best-effort export of gain importances to ``feature_importance.csv``.

    Any failure is recorded in ``feature_importance_error.txt`` instead of
    aborting the training run.
    """
    try:
        gains = model.booster_.feature_importance(importance_type="gain")
        columns = {"feature_index": np.arange(len(gains))}

        # Attach readable names when the fitted preprocessor can provide them
        # and they line up with the model's inputs; otherwise keep indices only.
        try:
            names = list(pre.get_feature_names_out())
        except (AttributeError, ValueError):
            names = []
        if len(names) == len(gains):
            columns["feature"] = names

        columns["gain_importance"] = gains
        imp_df = pd.DataFrame(columns)
        imp_df.sort_values("gain_importance", ascending=False).to_csv(
            os.path.join(out_dir, "feature_importance.csv"), index=False
        )
    except Exception as e:
        with open(os.path.join(out_dir, "feature_importance_error.txt"), "w") as f:
            f.write(str(e))


def train_targets_training_only(df: pd.DataFrame):
    """Tune, cross-validate and refit one LightGBM model per entry in TARGETS.

    For every target this writes, under ``OUT_DIR/<target>/``: the tuning
    summary, per-fold and aggregated CV metrics, timings, feature importances
    and a joblib artifact with the fitted preprocessor and final model. A
    one-row-per-target ``overall_summary.csv`` is written to ``OUT_DIR``.

    Args:
        df: training frame containing DATE_COL, the target columns and the
            feature columns. Rows with an unparseable date are dropped; rows
            with a missing target are dropped for that target only.

    Returns:
        None. All results are written to disk.
    """
    import joblib  # local import: only needed to persist the model artifacts

    # Ensure time column is parsed
    df = _parse_dates(df, DATE_COL)

    all_reports = []

    for target in TARGETS:
        print("\n==============================")
        print(f"Training target: {target}")
        print("==============================")

        # --- Build leak-safe working frame
        work = _build_feature_frame(df, target).dropna(subset=[target]).copy()

        # Split into X / y / dates
        X = work.drop(columns=[target, DATE_COL])
        y = work[target].astype(float)
        dates = work[DATE_COL]

        # --- Tuning with Optuna (time-aware CV)
        best_params, best_mae, tune_sec = tune_with_optuna(X, y, dates)
        print(f"Best MAE (Optuna CV) for {target}: {best_mae:.5f}")

        # Persist tuning summary
        tgt_dir = os.path.join(OUT_DIR, target)
        os.makedirs(tgt_dir, exist_ok=True)
        _write_json(
            os.path.join(tgt_dir, "tuning_summary.json"),
            {"target": target, "best_params": best_params, "cv_mae": best_mae, "tuning_seconds": tune_sec},
        )

        # --- Full CV report with best params (richer metrics)
        per_fold, summary, cv_sec = crossval_report(X, y, dates, best_params)
        per_fold.to_csv(os.path.join(tgt_dir, "cv_metrics.csv"), index=False)
        _write_json(os.path.join(tgt_dir, "cv_metrics_mean.json"), summary)
        _write_json(os.path.join(tgt_dir, "timings.json"), {"tuning_seconds": tune_sec, "cv_seconds": cv_sec})

        # --- Final refit on full data
        # There is no held-out set for early stopping here, so fitting all
        # NUM_BOOST_ROUND trees would be very slow and badly overfit. Use the
        # median number of rounds the CV folds needed instead.
        if "best_iteration" in per_fold.columns and len(per_fold) > 0:
            final_rounds = max(1, int(round(float(per_fold["best_iteration"].median()))))
        else:
            final_rounds = NUM_BOOST_ROUND

        final_params = {
            **best_params,
            "n_estimators": final_rounds,
            "objective": "mae",
            "random_state": RANDOM_STATE,
            "n_jobs": N_JOBS,
        }
        if USE_GPU:
            final_params["device_type"] = "gpu"   # lightgbm>=4

        pre = _preprocessor_for_df(X)
        X_all = pre.fit_transform(X)
        final_model = lgb.LGBMRegressor(**final_params)
        final_model.fit(X_all, y)

        _save_feature_importance(pre, final_model, tgt_dir)

        # --- Save final model + preprocessor for inference
        model_artifact = {
            "preprocessor": pre,          # ColumnTransformer fitted on training data
            "model": final_model,         # fitted LightGBMRegressor
            "features": X.columns.tolist(),
            "best_params": best_params,
            "n_estimators": final_rounds,
            "target": target,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        artifact_path = os.path.join(tgt_dir, f"{target}_final_model.joblib")
        joblib.dump(model_artifact, artifact_path)
        print(f"Saved model artifact: {artifact_path}")

        # Aggregate summary to a master report
        report_row = {
            "target": target,
            "optuna_cv_mae": best_mae,
            "tuning_seconds": tune_sec,
            "cv_seconds": cv_sec,
            **{f"mean_{k}": v["mean"] for k, v in summary.items()},
            **{f"std_{k}": v["std"] for k, v in summary.items()},
        }
        all_reports.append(report_row)

    summary_df = pd.DataFrame(all_reports)
    summary_path = os.path.join(OUT_DIR, "overall_summary.csv")
    summary_df.to_csv(summary_path, index=False)
    print(f"\nSaved overall training summary to {summary_path}")
    print("Per-target reports saved under:", os.path.abspath(OUT_DIR))

In [None]:
# Load the assembled training frame from S3; the first CSV column becomes the index.
df = pd.read_csv("s3://collegebasketballinsiders/train/train.csv", index_col=0)  # your assembled 27k x ~400 frame
# Tune, cross-validate and refit a model for every entry in TARGETS; all reports go under OUT_DIR.
train_targets_training_only(df)