# In [None]:
"""
League of Legends Win Probability Prediction
- Loads Kaggle LoL 10-min dataset
- Cleans/engineers features
- Trains logistic regression with 5-fold CV
- Bootstraps coefficients and accuracy/AUC CIs
- Saves artifacts to ./reports

Run:
  python src/lol_win_model.py --data data/lol_10min.csv --target blueWins
"""

import argparse
import json
import os
from dataclasses import dataclass
from typing import Dict, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import matplotlib.pyplot as plt


# --------------------------
# Config / Defaults
# --------------------------

# Fixed seed shared by the global NumPy RNG and the seeded estimator/splitter below.
SEED = 42
# Seed NumPy's global RNG at import time so np.random.* draws in this module are repeatable.
np.random.seed(SEED)

# Columns read from the CSV and used as the model's input features.
FEATURES_DEFAULT = [
    "blueKills",
    "blueDeaths",
    "blueGoldDiff",
    "blueExperienceDiff",
]

# --------------------------
# Helpers
# --------------------------

def load_data(path: str, target: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Read the CSV at *path* and split it into features and target.

    Rows with a missing value in any ``FEATURES_DEFAULT`` column or in
    *target* are dropped. The target is cast to ``int``, and so are the
    kill and death count columns.

    Args:
        path: Location of the CSV file.
        target: Name of the target column.

    Returns:
        ``(X, y)`` where ``X`` holds the ``FEATURES_DEFAULT`` columns and
        ``y`` holds the integer target.

    Raises:
        ValueError: If *target* is not a column of the CSV.
    """
    frame = pd.read_csv(path)
    if target not in frame.columns:
        raise ValueError(f"Target column '{target}' not found in the dataset.")

    # The Kaggle file sometimes includes NaNs; keep complete rows only.
    required = FEATURES_DEFAULT + [target]
    frame = frame.dropna(subset=required)

    labels = frame[target].astype(int).copy()
    features = frame[FEATURES_DEFAULT].copy()

    # Enforce integer dtype on whichever count columns are present.
    count_cols = [c for c in ("blueKills", "blueDeaths") if c in features.columns]
    features = features.astype({c: int for c in count_cols})

    return features, labels


def make_pipeline() -> Pipeline:
    """Build an unfitted pipeline: standard scaling followed by logistic regression.

    The classifier uses the liblinear solver, ``max_iter=1000`` and
    ``random_state=SEED``.
    """
    steps = [
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        # liblinear handles small to mid-sized datasets well
        ("clf", LogisticRegression(solver="liblinear", random_state=SEED, max_iter=1000)),
    ]
    return Pipeline(steps)


def cross_validate(X: pd.DataFrame, y: pd.Series, pipe: Pipeline, n_splits: int = 5) -> Dict[str, float]:
    """Score *pipe* with shuffled K-fold cross-validation.

    Accuracy and ROC AUC are each computed over the same ``KFold``
    configuration (shuffled, seeded with ``SEED``).

    Args:
        X: Feature frame.
        y: Target series.
        pipe: Estimator to evaluate.
        n_splits: Number of folds.

    Returns:
        Mean and standard deviation across folds for both metrics, keyed
        ``cv_accuracy_mean``, ``cv_accuracy_std``, ``cv_auc_mean`` and
        ``cv_auc_std``.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    def _fold_scores(metric: str) -> np.ndarray:
        return cross_val_score(pipe, X, y, scoring=metric, cv=splitter)

    fold_acc = _fold_scores("accuracy")
    fold_auc = _fold_scores("roc_auc")

    summary: Dict[str, float] = {}
    summary["cv_accuracy_mean"] = float(fold_acc.mean())
    summary["cv_accuracy_std"] = float(fold_acc.std())
    summary["cv_auc_mean"] = float(fold_auc.mean())
    summary["cv_auc_std"] = float(fold_auc.std())
    return summary


def bootstrap_ci(
    X: pd.DataFrame,
    y: pd.Series,
    n_boot: int = 1000,
) -> Tuple[pd.DataFrame, Dict[str, float]]:
    """Bootstrap logistic-regression coefficients and out-of-bag performance.

    Each replicate draws ``len(y)`` rows with replacement. A statsmodels
    ``Logit`` with an intercept is fitted on the raw (unscaled) draw to
    collect coefficients. The pipeline from ``make_pipeline`` is then fitted
    on the same draw and scored on the rows that were not drawn (out-of-bag).
    A replicate whose ``Logit`` fit raises is skipped entirely.

    Sampling uses NumPy's global RNG, so results depend on its current state.

    Args:
        X: Feature frame.
        y: Binary target, positionally aligned with ``X``.
        n_boot: Number of bootstrap replicates.

    Returns:
        coef_ci_df: One row per term (``const`` plus each column of ``X``)
            with columns ``term``, ``ci_low``, ``ci_high`` (5th and 95th
            percentiles) and ``mean``. Empty when no replicate produced
            coefficients.
        perf_ci: ``bootstrap_acc_mean`` / ``bootstrap_acc_ci_90`` and
            ``bootstrap_auc_mean`` / ``bootstrap_auc_ci_90``. A metric is
            omitted when no replicate produced a value for it.

    Raises:
        ValueError: If ``y`` is empty.
    """
    n = len(y)
    if n == 0:
        raise ValueError("Cannot bootstrap an empty dataset.")

    terms = ["const"] + list(X.columns)
    all_rows = np.arange(n)
    coef_samples = []
    acc_samples = []
    auc_samples = []

    for _ in range(n_boot):
        idx = np.random.randint(0, n, size=n)   # sample with replacement
        oob = np.setdiff1d(all_rows, idx)       # rows never drawn (out-of-bag)

        Xb = X.iloc[idx].reset_index(drop=True)
        yb = y.iloc[idx].reset_index(drop=True)

        # has_constant="add" always prepends the intercept column. The default
        # ("skip") would silently omit it when a resampled feature happens to
        # be constant, yielding a parameter vector of a different length.
        Xb_sm = sm.add_constant(Xb, has_constant="add")
        try:
            model = sm.Logit(yb, Xb_sm).fit(disp=False)
        except Exception:
            # Deliberate best-effort: a replicate whose fit fails (rare) is
            # dropped from both the coefficient and performance samples.
            continue
        coef_samples.append(model.params.values)  # const + features

        if len(oob) == 0:
            continue
        X_oob = X.iloc[oob]
        y_oob = y.iloc[oob]

        pipe = make_pipeline()
        pipe.fit(Xb, yb)
        yhat_proba = pipe.predict_proba(X_oob)[:, 1]
        yhat = (yhat_proba >= 0.5).astype(int)

        acc_samples.append(accuracy_score(y_oob, yhat))
        # ROC AUC is undefined unless both classes appear in the OOB rows;
        # check explicitly instead of relying on the scorer to raise.
        if y_oob.nunique() == 2:
            auc_samples.append(roc_auc_score(y_oob, yhat_proba))

    def ci(a, low=5, high=95):
        return np.percentile(a, [low, high])

    # Coefficient CI table; guard the no-sample case, which would otherwise
    # fail on the shape mismatch and on percentiles of an empty array.
    if coef_samples:
        coef_df = pd.DataFrame(np.array(coef_samples), columns=terms)  # shape: [B, k]
        ci_rows = []
        for term in coef_df.columns:
            low, high = ci(coef_df[term].values)
            ci_rows.append(
                {"term": term, "ci_low": low, "ci_high": high, "mean": coef_df[term].mean()}
            )
        coef_ci_df = pd.DataFrame(ci_rows)
    else:
        coef_ci_df = pd.DataFrame(columns=["term", "ci_low", "ci_high", "mean"])

    # Performance CIs
    perf_ci = {}
    if acc_samples:
        acc_low, acc_high = ci(np.array(acc_samples))
        perf_ci["bootstrap_acc_mean"] = float(np.mean(acc_samples))
        perf_ci["bootstrap_acc_ci_90"] = [float(acc_low), float(acc_high)]
    if auc_samples:
        auc_low, auc_high = ci(np.array(auc_samples))
        perf_ci["bootstrap_auc_mean"] = float(np.mean(auc_samples))
        perf_ci["bootstrap_auc_ci_90"] = [float(auc_low), float(auc_high)]

    return coef_ci_df, perf_ci


@dataclass
class RunOutputs:
    """Container for the results of one modelling run, as returned by ``run``."""

    # Metric name -> value. Most values are floats, but the
    # "bootstrap_*_ci_90" entries merged in by ``run`` are [low, high] lists.
    metrics: Dict[str, float]
    # Per-term bootstrap coefficient summary (first element returned by ``bootstrap_ci``).
    coef_ci: pd.DataFrame


def run(data_path: str, target: str, n_boot: int = 1000, n_splits: int = 5) -> RunOutputs:
    """Load the data, evaluate the model and bootstrap its uncertainty.

    Steps: load features/target, cross-validate the pipeline, refit it on
    the full data for in-sample sanity metrics, then bootstrap coefficient
    and performance confidence intervals.

    Args:
        data_path: Path to the input CSV.
        target: Name of the binary target column.
        n_boot: Number of bootstrap replicates passed to ``bootstrap_ci``.
        n_splits: Number of cross-validation folds.

    Returns:
        A ``RunOutputs`` holding the merged metrics dict and the coefficient
        CI table.
    """
    X, y = load_data(data_path, target)

    # CV metrics
    pipe = make_pipeline()
    metrics = cross_validate(X, y, pipe, n_splits=n_splits)

    # Fit once on the full data for a quick in-sample sanity check
    pipe.fit(X, y)
    yhat_proba = pipe.predict_proba(X)[:, 1]
    yhat = (yhat_proba >= 0.5).astype(int)
    metrics["train_accuracy"] = float(accuracy_score(y, yhat))
    # ROC AUC with a single probability column needs exactly two classes;
    # skip it explicitly rather than swallowing whatever the scorer raises.
    if y.nunique() == 2:
        metrics["train_auc"] = float(roc_auc_score(y, yhat_proba))

    # Bootstrap CIs for coefficients + performance
    coef_ci_df, perf_ci = bootstrap_ci(X, y, n_boot=n_boot)
    metrics.update(perf_ci)

    return RunOutputs(metrics=metrics, coef_ci=coef_ci_df)


def save_outputs(outputs: RunOutputs, out_dir: str = "reports") -> None:
    """Persist a run's artifacts under *out_dir*, creating it if needed.

    Writes ``metrics.json`` (the metrics dict, indented JSON) and
    ``coef_bootstrap.csv`` (the coefficient CI table, without the index).
    """
    os.makedirs(out_dir, exist_ok=True)

    metrics_path = os.path.join(out_dir, "metrics.json")
    coef_path = os.path.join(out_dir, "coef_bootstrap.csv")

    with open(metrics_path, "w") as handle:
        json.dump(outputs.metrics, handle, indent=2)

    outputs.coef_ci.to_csv(coef_path, index=False)


def main():
    """Command-line entry point: parse arguments, run the analysis, save and print results."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--data", type=str, required=True, help="Path to Kaggle CSV (10-minute dataset)")
    cli.add_argument("--target", type=str, default="blueWins", help="Binary target column (default: blueWins)")
    opts = cli.parse_args()

    outputs = run(opts.data, opts.target)
    save_outputs(outputs)
    metrics = outputs.metrics

    print("\n=== Cross-Validation ===")
    cv_keys = ("cv_accuracy_mean", "cv_accuracy_std", "cv_auc_mean", "cv_auc_std")
    for key in (k for k in cv_keys if k in metrics):
        print(f"{key}: {metrics[key]:.4f}")

    print("\n=== Bootstrap (90% CI) ===")
    if "bootstrap_acc_ci_90" in metrics:
        lo, hi = metrics["bootstrap_acc_ci_90"]
        acc_mean = metrics["bootstrap_acc_mean"]
        print(f"Accuracy ~ mean {acc_mean:.4f} | CI90 [{lo:.4f}, {hi:.4f}]")
    if "bootstrap_auc_ci_90" in metrics:
        lo, hi = metrics["bootstrap_auc_ci_90"]
        auc_mean = metrics["bootstrap_auc_mean"]
        print(f"AUC ~ mean {auc_mean:.4f} | CI90 [{lo:.4f}, {hi:.4f}]")

    print("\nSaved: reports/metrics.json & reports/coef_bootstrap.csv")


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
