In [2]:
import os, re, json, sys, glob, subprocess, warnings
from typing import Any, Dict, List, Optional, Tuple
warnings.filterwarnings("ignore")

def ensure_packages():
    pkgs = ["xgboost", "catboost", "pandas", "nbimporter", "ipykernel", "scikit-learn", "numpy"]
    for p in pkgs:
        try:
            __import__(p.split("==")[0])
        except Exception:
            subprocess.check_call([sys.executable, "-m", "pip", "install", p])

def maybe_mount_drive(mount: bool):
    if mount:
        try:
            from google.colab import drive
            drive.mount("/content/drive")
            print("[INFO] Google Drive mounted at /content/drive")
        except Exception as e:
            print("[WARN] Colab drive mount failed or not in Colab:", e)

# Notebook import utilities
def sanitize_notebook(src_path: str, dst_dir: str) -> Optional[str]:
    if not src_path or not os.path.exists(src_path):
        return None
    os.makedirs(dst_dir, exist_ok=True)
    base = os.path.basename(src_path)
    name, ext = os.path.splitext(base)
    safe = re.sub(r"[^A-Za-z0-9_]", "_", name) + ext
    dst = os.path.join(dst_dir, safe)
    try:
        if os.path.abspath(src_path) != os.path.abspath(dst):
            from shutil import copy2
            copy2(src_path, dst)
        return dst
    except Exception as e:
        print("[WARN] Failed to copy notebook:", src_path, "->", dst, e)
        return None

def auto_locate(patterns: List[str], search_root="/content/drive"):
    found = []
    for pat in patterns:
        matches = glob.glob(os.path.join(search_root, "**", pat), recursive=True)
        if matches:
            found.append(sorted(matches, key=lambda p: len(p))[0])
        else:
            found.append(None)
    return found

def import_notebook_module(nb_path: Optional[str]):
    if nb_path is None:
        return None
    try:
        import nbimporter
        sys.path.append(os.path.dirname(nb_path))
        mod_name = os.path.splitext(os.path.basename(nb_path))[0]
        mod = __import__(mod_name)
        print(f"[INFO] Imported notebook as module: {mod_name}")
        return mod
    except Exception as e:
        print("[WARN] Failed to import notebook:", nb_path, e)
        return None

# Pipeline core (metrics/modeling helpers)
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
try:
    from catboost import CatBoostClassifier
    _CATBOOST_AVAILABLE = True
except Exception:
    _CATBOOST_AVAILABLE = False

# Debug switch (print label distribution/probability statistics by fold)
DEBUG = False

RANK_PRIMARY = "logloss"
RANK_TIEBREAKERS = ["brier","auc"]

def make_xgb():
    return XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        eval_metric="logloss",
        n_estimators=500,
        random_state=42
    )

def make_catboost():
    if not _CATBOOST_AVAILABLE:
        raise RuntimeError("catboost is not installed.")
    return CatBoostClassifier(
        loss_function="Logloss",
        verbose=False,
        random_seed=42,
        iterations=700,
        allow_writing_files=False
    )

def make_logreg():
    return LogisticRegression(solver="lbfgs", penalty="l2", max_iter=2000)

def refresh_model_registry():
    global _CATBOOST_AVAILABLE, MODEL_MAKERS, CatBoostClassifier
    try:
        from catboost import CatBoostClassifier as _CBC
        CatBoostClassifier = _CBC
        _CATBOOST_AVAILABLE = True
        MODEL_MAKERS["catboost"] = make_catboost
        print("[INFO] CatBoost available. Added to model registry.")
    except Exception as e:
        _CATBOOST_AVAILABLE = False
        MODEL_MAKERS.pop("catboost", None)
        print("[WARN] CatBoost not available:", e)

# Stage 3
PARAM_SPACE = {
    "xgb": {
        "max_depth": [3,4,5,6,7,8],
        "learning_rate": list(np.logspace(-3, -1, 7)),
        "subsample": [0.6,0.7,0.8,0.9,1.0],
        "colsample_bytree": [0.6,0.7,0.8,0.9,1.0],
        "min_child_weight": [1,2,5,10],
        "gamma": [0, 0.1, 0.3, 1.0],
        "reg_lambda": list(np.logspace(-2, 1, 6)),
    },
    "catboost": {
        "depth": [4,5,6,7,8],
        "learning_rate": list(np.logspace(-3, -1, 7)),
        "l2_leaf_reg": [1.0, 3.0, 5.0, 10.0, 20.0],
        "bagging_temperature": [0.0, 0.5, 1.0],
        "border_count": [32, 64, 128],
    },
    "logreg": {
        "C": list(np.logspace(-3, 2, 10)),
        "max_iter": [500, 1000, 2000],
        "fit_intercept": [True, False],
    }
}

MODEL_MAKERS = {"xgb": make_xgb, "logreg": make_logreg}
if _CATBOOST_AVAILABLE:
    MODEL_MAKERS["catboost"] = make_catboost

def read_feature_names(path: str):
    names = []
    if path and os.path.exists(path):
        with open(path, "r", encoding="utf-8") as f:
            names = [line.strip() for line in f if line.strip()]
    return names

def select_feature_columns(X: pd.DataFrame, drop_onoff: bool=False):
    Xwork = X.copy()
    if drop_onoff:
        for col in ["onball_contribution_score","offball_contribution_score"]:
            if col in Xwork.columns:
                Xwork = Xwork.drop(columns=[col])
    drop_cols = ["event_uuid","game_id"]
    feat_cols = [c for c in Xwork.columns if c not in drop_cols and np.issubdtype(Xwork[c].dtype, np.number)]
    Xnum = Xwork[feat_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return feat_cols, Xnum

def load_xy_from_files(features_h5, labels_h5, feature_names_txt, score_col="scores", concede_col="concedes"):
    X = pd.read_hdf(features_h5)
    ydf = pd.read_hdf(labels_h5)
    if "event_uuid" in X.columns and "event_uuid" in ydf.columns:
        ydf = ydf.set_index("event_uuid").reindex(X["event_uuid"]).reset_index()
    feat_names = read_feature_names(feature_names_txt)
    if feat_names:
        feat_names = [c for c in feat_names if c in X.columns]
        keep = feat_names + [c for c in ["event_uuid","game_id"] if c in X.columns]
        X = X[keep].copy()
    y_score = ydf[score_col].astype(int).values
    y_concede = ydf[concede_col].astype(int).values
    groups = X["game_id"] if "game_id" in X.columns else None
    return X, y_score, y_concede, groups

def eval_metrics(y_true, y_proba):
    p = np.asarray(y_proba, dtype=float).ravel()
    p = np.nan_to_num(p, nan=0.5, posinf=1.0 - 1e-7, neginf=1e-7)
    p = np.clip(p, 1e-7, 1.0 - 1e-7)

    y = np.asarray(y_true, dtype=float).ravel()

    logloss = float(-np.mean(y * np.log(p) + (1.0 - y) * np.log(1.0 - p)))
    brier = float(np.mean((y - p) ** 2))
    auc = float(roc_auc_score(y, p)) if len(np.unique(y)) > 1 else float("nan")
    return {"logloss": logloss, "brier": brier, "auc": auc}

def rank_df(sub: pd.DataFrame, primary: str, tiebreakers: List[str]):
    asc_list = [False if c=="auc" else True for c in [primary]+tiebreakers]
    return sub.sort_values([primary]+tiebreakers, ascending=asc_list)

def get_cv_splitter(y, groups):
    if groups is None:
        return StratifiedKFold(n_splits=5, shuffle=True, random_state=42), None
    else:
        return GroupKFold(n_splits=5), groups

def cv_predict_model(model, Xmat, y, groups):
    splitter, groups_arg = get_cv_splitter(y, groups)
    y_proba_all = np.zeros_like(y, dtype=float)

    fold_idx = 0
    for train_idx, val_idx in splitter.split(Xmat, y, groups_arg):
        Xtr, Xva = Xmat[train_idx], Xmat[val_idx]
        ytr = y[train_idx]

        Xtr = Xtr.astype(np.float32, copy=False)
        Xva = Xva.astype(np.float32, copy=False)

        model.fit(Xtr, ytr)

        if DEBUG:
            binc = np.bincount(ytr)
            print(f"[CV]{fold_idx} train y dist:", dict(zip(range(len(binc)), binc)))

        proba = model.predict_proba(Xva)[:, 1]

        if DEBUG:
            pmin = np.nanmin(proba); pmax = np.nanmax(proba); pfin = np.isfinite(proba).mean()
            print(f"[CV]{fold_idx} proba(before) min={pmin:.6f} max={pmax:.6f} finite={pfin:.3f}")

        proba = np.nan_to_num(proba, nan=0.5, posinf=1.0 - 1e-7, neginf=1e-7)
        proba = np.clip(proba, 1e-7, 1.0 - 1e-7)

        if DEBUG:
            pmin = np.min(proba); pmax = np.max(proba)
            print(f"[CV]{fold_idx} proba(after)  min={pmin:.6f} max={pmax:.6f}")

        y_proba_all[val_idx] = proba
        fold_idx += 1

    return y_proba_all

NB_FUNCS = {
    "compute_on_offball_features": None,
    "build_features_and_labels": None,
}

def adopt_functions_from_notebooks(spadl_nb, onoff_nb, convert_nb):
    mapping_candidates = {
        "compute_on_offball_features": [
            "compute_on_offball_features","compute_on_offball",
            "on_offball_features","make_on_offball_features",
        ],
        "build_features_and_labels": [
            "build_features_and_labels","convert_into_features_and_labels",
            "make_features_and_labels","create_features_and_labels",
        ]
    }
    for name, candidates in mapping_candidates.items():
        for mod in [onoff_nb, convert_nb, spadl_nb]:
            if mod is None:
                continue
            for attr in candidates:
                func = getattr(mod, attr, None)
                if callable(func):
                    NB_FUNCS[name] = func
                    print(f"[INFO] Bound notebook function: {name} <- {mod.__name__}.{attr}")
                    break
            if NB_FUNCS[name] is not None:
                break
    missing = [k for k,v in NB_FUNCS.items() if v is None]
    if missing:
        print("[WARN] Missing notebook functions, will use built-ins for:", missing)

# Fallback code
def fallback_compute_on_offball_features(events, line_gap_threshold, in_line_gap_threshold, nr_actions):
    results = []
    def extract_onball_attacker_position(freeze_frame):
        if not isinstance(freeze_frame, list): return None
        for player in freeze_frame:
            if player.get("actor", False): return player.get("location", None)
        return None
    def compute_ball_direction_vector(event):
        freeze_frame = event.get("freeze_frame"); end_x = event.get("end_x"); end_y = event.get("end_y")
        if freeze_frame is None or end_x is None or end_y is None: return (0.0, 0.0)
        start_pos = extract_onball_attacker_position(freeze_frame)
        if start_pos is None: return (0.0, 0.0)
        dx = end_x - start_pos[0]; dy = end_y - start_pos[1]
        return (dx, dy)
    def assign_defensive_lines(defenders, x_threshold=3.0, y_tolerance=2.0):
        if not defenders: return []
        x_coords = np.array([p["location"][0] for p in defenders])
        y_coords = np.array([p["location"][1] for p in defenders])
        median_y = np.median(y_coords)
        central_indices = np.where(np.abs(y_coords - median_y) <= y_tolerance)[0]
        assigned = np.full(len(defenders), -1, dtype=int)
        current_line = 0
        for cx in sorted(x_coords[central_indices]):
            for i in range(len(defenders)):
                if assigned[i] == -1 and abs(x_coords[i] - cx) <= x_threshold:
                    assigned[i] = current_line
            current_line += 1
        for i in range(len(defenders)):
            if assigned[i] == -1:
                assigned[i] = current_line
                current_line += 1
        for idx, p in enumerate(defenders): p["line"] = int(assigned[idx])
        return defenders
    def compute_line_gaps(defenders):
        import collections


        if not defenders:
            return None, None

        line_to_x = collections.defaultdict(list)
        line_to_y = collections.defaultdict(list)

        for d in defenders:
            label = d.get("line", -1)
            if isinstance(label, (int, float)) and label != -1:
                x, y = d["location"]
                line_to_x[int(label)].append(float(x))
                line_to_y[int(label)].append(float(y))

        if not line_to_x:
            return None, None

        sorted_labels = sorted(line_to_x.keys(), key=lambda L: np.mean(line_to_x[L]))
        line_gap = None
        if len(sorted_labels) >= 2:
            centers_x = [np.mean(line_to_x[L]) for L in sorted_labels]
            diffs = [centers_x[i + 1] - centers_x[i] for i in range(len(centers_x) - 1)]
            line_gap = max(diffs) if diffs else None

        spans_y = [
            (max(ys) - min(ys)) if len(ys) >= 2 else 0.0
            for ys in line_to_y.values()
        ]
        in_line_gap = max(spans_y) if spans_y else None

        return line_gap, in_line_gap
    def extract_defensive_line_features(freeze_frame):
        if not freeze_frame: return None, None
        defenders = [p for p in freeze_frame if not p.get("teammate", False)]
        if not defenders: return None, None
        defenders = assign_defensive_lines(defenders)
        return compute_line_gaps(defenders)
    def detect_defensive_collapse_onball(ff_cur, ff_nxt):
        if not isinstance(ff_cur, list) or not isinstance(ff_nxt, list): return 0.0
        defenders_current = [p for p in ff_cur if not p.get("teammate", False)]
        defenders_after   = [p for p in ff_nxt if not p.get("teammate", False)]
        diff = len(defenders_current) - len(defenders_after)
        return 0.05 if diff>=3 else 0.03 if diff==2 else 0.01 if diff==1 else 0.0
    def cosine_similarity(vec1, vec2):
        (x1,y1),(x2,y2)=vec1,vec2
        n1=(x1*x1+y1*y1)**0.5; n2=(x2*x2+y2*y2)**0.5
        if n1==0 or n2==0: return 0.0
        return (x1*x2+y1*y2)/(n1*n2)
    def detect_defensive_collapse_offball(ff_cur, ff_nxt, ball_vec, line_thr, in_line_thr, cos_thr=0.8):
        if not ff_cur or not ff_nxt: return False
        attackers_current = {(p.get("player_id") or p.get("actor_id")): p["location"] for p in ff_cur if p.get("teammate", False) and not p.get("actor", False) and "location" in p}
        attackers_after   = {(p.get("player_id") or p.get("actor_id")): p["location"] for p in ff_nxt if p.get("teammate", False) and not p.get("actor", False) and "location" in p}
        moved=False
        for pid,pos_before in attackers_current.items():
            pos_after=attackers_after.get(pid);
            if not pos_after: continue
            dx=pos_after[0]-pos_before[0]; dy=pos_after[1]-pos_before[1]
            if dx==0 and dy==0: continue
            if cosine_similarity((dx,dy), ball_vec)>=cos_thr: moved=True; break
        if not moved: return False
        gap_c, in_gap_c = extract_defensive_line_features(ff_cur)
        gap_a, in_gap_a = extract_defensive_line_features(ff_nxt)
        if gap_c is None or gap_a is None: return False
        return ((gap_a or 0)-(gap_c or 0) >= line_thr) or ((in_gap_a or 0)-(in_gap_c or 0) >= in_line_thr)

    results = []
    for i, current in enumerate(events):
        next_ev = events[i+1] if i+1 < len(events) else None
        out = {"index": current.get("index"), "event_uuid": current.get("event_uuid"),
               "onball_collapse": False, "offball_collapse": False,
               "contributed_shot": False, "contributed_goal": False,
               "onball_contribution_score": 0.0, "offball_contribution_score": 0.0}
        if next_ev and current.get("game_id")==next_ev.get("game_id") and current.get("period_id")==next_ev.get("period_id"):
            ff_cur=current.get("freeze_frame"); ff_nxt=next_ev.get("freeze_frame")
            if isinstance(ff_cur, list) and isinstance(ff_nxt, list):
                if (current.get("action_type") or "").lower()=="dribble":
                    on_val=detect_defensive_collapse_onball(ff_cur, ff_nxt)
                    if on_val>0.0:
                        out["onball_collapse"]=True; out["onball_contribution_score"]=on_val
                if (current.get("action_type") or "").lower() in {"pass","dribble"}:
                    ball_vec=compute_ball_direction_vector(current)
                    if detect_defensive_collapse_offball(ff_cur, ff_nxt, ball_vec, line_gap_threshold, in_line_gap_threshold, 0.8):
                        out["offball_collapse"]=True; out["offball_contribution_score"]=0.02
        results.append(out)
    return pd.DataFrame(results)

def fallback_build_features_and_labels(enriched: pd.DataFrame, nr_actions: int):
    df = enriched.copy()

    period_col = None
    for c in ['period_id', 'period', 'half']:
        if c in df.columns:
            period_col = c
            break
    if period_col is None:
        period_col = '_tmp_period'
        df[period_col] = 1

    # 1) Left-to-right normalization
    forward_like_actions = {'pass', 'cross', 'carry', 'dribble', 'take_on',
                            'shot', 'shot_freekick', 'shot_penalty'}

    def _normalize_group(g: pd.DataFrame) -> pd.DataFrame:
        if 'action_type' in g.columns:
            m = g['action_type'].isin(forward_like_actions)
            gg = g[m] if m.any() else g
        else:
            gg = g

        med_dx = (gg['end_x'] - gg['start_x']).median()

        flip = pd.notnull(med_dx) and med_dx < 0

        if flip:
            g = g.copy()
            g['start_x'] = 120 - g['start_x']
            g['end_x']   = 120 - g['end_x']
            g['start_y'] = 80  - g['start_y']
            g['end_y']   = 80  - g['end_y']

        g['dx'] = g['end_x'] - g['start_x']
        g['dy'] = g['end_y'] - g['start_y']
        return g

    df = (
        df.groupby(['game_id', 'team_id', period_col], sort=False, group_keys=False)
          .apply(_normalize_group)
    )

    if '_tmp_period' in df.columns:
        df = df.drop(columns=['_tmp_period'])

    nb_prev = 3
    df_slice = df.iloc[nb_prev:].reset_index(drop=True)

    def startlocation(d): return d[['start_x','start_y']].copy()
    def endlocation(d):   return d[['end_x','end_y']].copy()
    def movement(d):
        dx = d['dx']; dy = d['dy']
        return pd.DataFrame({'movement': np.sqrt(dx**2 + dy**2)})

    def startpolar(d):
        dx = 120 - d['start_x']; dy = 40 - d['start_y']
        return pd.DataFrame({
            'start_dist_to_goal':  np.sqrt(dx**2 + dy**2),
            'start_angle_to_goal': np.degrees(np.arctan2(dy, dx))
        })

    def endpolar(d):
        dx = 120 - d['end_x']; dy = 40 - d['end_y']
        return pd.DataFrame({
            'end_dist_to_goal':  np.sqrt(dx**2 + dy**2),
            'end_angle_to_goal': np.degrees(np.arctan2(dy, dx))
        })

    # Create X
    X_df = pd.concat([
        startlocation(df_slice), endlocation(df_slice), movement(df_slice),
        startpolar(df_slice), endpolar(df_slice),
        pd.get_dummies(df_slice['action_type'], prefix='action_type'),
        pd.get_dummies(df_slice['result'],      prefix='result'),
        pd.get_dummies(df_slice['bodypart'],    prefix='bodypart'),
        df_slice[['onball_contribution_score','offball_contribution_score']].copy()
    ], axis=1)

    ids = df_slice[['event_uuid','game_id']].copy()
    X = pd.concat([ids, X_df], axis=1)

    # Create Y
    labels = {"scores": [], "concedes": [], "event_uuid": []}
    goal_actions = {'shot','shot_freekick','shot_penalty'}

    for game_id, df_game in df.iloc[nb_prev:].groupby("game_id"):
        if 'index' in df_game.columns:
            df_game = df_game.sort_values("index").reset_index(drop=True)
        else:
            df_game = df_game.reset_index().rename(columns={"index": "index"})

        for i, row in df_game.iterrows():
            team_id   = row["team_id"]
            event_uuid= row["event_uuid"]
            future    = df_game.iloc[i+1 : i+1+nr_actions]

            scored = (
                (future["team_id"] == team_id) &
                (future["action_type"].isin(goal_actions)) &
                (future["result"] == "success")
            ).any()

            conceded = (
                (future["team_id"] != team_id) &
                (future["action_type"].isin(goal_actions)) &
                (future["result"] == "success")
            ).any()

            labels["scores"].append(int(scored))
            labels["concedes"].append(int(conceded))
            labels["event_uuid"].append(event_uuid)

    y = pd.DataFrame(labels)
    return X, y


def get_func(name):
    f = NB_FUNCS.get(name)
    if callable(f): return f
    return fallback_compute_on_offball_features if name=="compute_on_offball_features" else fallback_build_features_and_labels

def load_merged_actions(h5_path: str) -> Dict[int, pd.DataFrame]:
    out = {}
    with pd.HDFStore(h5_path, mode='r') as store:
        keys = [k for k in store.keys() if k.startswith('/actions/')]
        for k in keys:
            try:
                mid = int(k.split('/')[-1])
            except Exception:
                tail = k.split('/')[-1]
                mid = int(re.sub(r"[^0-9]", "", tail) or 0)
            out[mid] = store[k].copy()
    return out

def rebuild_Xy_for_combo(merged_h5: str, line_gap: float, in_line_gap: float, nr_actions: int):
    matches = load_merged_actions(merged_h5)
    X_all, y_all = [], []
    onoff_fn = get_func("compute_on_offball_features")
    build_fn = get_func("build_features_and_labels")
    for mid, df_game in matches.items():
        df_game = df_game.sort_values(["period_id","seconds"]).reset_index(drop=True)
        feat_df = onoff_fn(df_game.to_dict("records"), line_gap, in_line_gap, nr_actions)
        if "index" in feat_df.columns:
            feat_df = feat_df.drop_duplicates(subset=["event_uuid"])
            try: feat_df = feat_df.drop(columns=["index"])
            except Exception: pass
        enriched = df_game.merge(feat_df, on="event_uuid", how="left")
        X, y = build_fn(enriched, nr_actions=nr_actions)
        X_all.append(X); y_all.append(y)
    Xe = pd.concat(X_all, axis=0).reset_index(drop=True)
    ye = pd.concat(y_all, axis=0).reset_index(drop=True)
    ye = ye.set_index('event_uuid').reindex(Xe['event_uuid']).reset_index()
    return Xe, ye

def ensure_stage1_xy(features_h5, labels_h5, feature_names_txt, merged_h5,
                     default_line_gap=5.0, default_in_line_gap=5.0, default_nr_actions=10):
    need_build = (not os.path.exists(features_h5)) or (not os.path.exists(labels_h5))
    if not need_build:
        print("[INFO] Found existing Stage1 X/y files.")
        return
    if not os.path.exists(merged_h5):
        raise FileNotFoundError(f"Stage1 X/y missing and merged_h5 not found: {merged_h5}")
    print("[INFO] Stage1 X/y not found. Building from merged_h5 with defaults:",
          f"line_gap={default_line_gap}, in_line_gap={default_in_line_gap}, nr_actions={default_nr_actions}")
    Xe, ye = rebuild_Xy_for_combo(merged_h5, default_line_gap, default_in_line_gap, default_nr_actions)
    os.makedirs(os.path.dirname(features_h5), exist_ok=True)
    Xe.to_hdf(features_h5, key="X", mode="w")
    ye.to_hdf(labels_h5, key="y", mode="w")
    feat_cols = [c for c in Xe.columns if c not in ["event_uuid","game_id"]]
    with open(feature_names_txt, "w", encoding="utf-8") as f:
        for c in feat_cols:
            f.write(c + "\n")
    print("[INFO] Built and saved Stage1 X/y at:")
    print("  features_h5:", features_h5)
    print("  labels_h5  :", labels_h5)
    print("  feature_names_txt:", feature_names_txt)

# Stages
def stage1_model_selection(features_h5, labels_h5, feature_names_txt, outdir, models=("xgb","catboost","logreg")):
    X, y_score, y_concede, groups = load_xy_from_files(features_h5, labels_h5, feature_names_txt)
    rows = []
    for target_name, y in [("score", y_score), ("concede", y_concede)]:
        for m in models:
            if m not in MODEL_MAKERS:
                continue
            try:
                model = MODEL_MAKERS[m]()
                feat_cols, Xnum = select_feature_columns(X)
                Xmat = Xnum[feat_cols].astype(np.float32).values
                y_proba = cv_predict_model(model, Xmat, y, groups)
                metrics = eval_metrics(y, y_proba)
                rows.append({
                    "target": target_name, "model": m,
                    "auc": metrics["auc"], "brier": metrics["brier"], "logloss": metrics["logloss"],
                    "n_features": len(feat_cols), "best_params_json": json.dumps({})
                })
            except Exception as e:
                rows.append({
                    "target": target_name, "model": m,
                    "auc": float("nan"), "brier": float("inf"), "logloss": float("inf"),
                    "n_features": 0, "best_params_json": json.dumps({"error": str(e)})
                })
    df = pd.DataFrame(rows)
    path = os.path.join(outdir, "stage1_model_results.csv"); df.to_csv(path, index=False)

    best = {}
    for t in ["score","concede"]:
        sub = df[df["target"]==t].copy()
        sub = sub[np.isfinite(sub["logloss"]) & np.isfinite(sub["brier"])].copy()
        if sub.empty:
            print(f"[Stage 1][WARN] All models failed on target={t}. Falling back to 'logreg'.")
            best[t] = {"model": "logreg", "auc": float("nan"), "brier": float("nan"),
                       "logloss": float("nan"), "best_params": {}}
        else:
            ranked = rank_df(sub, RANK_PRIMARY, RANK_TIEBREAKERS)
            row = ranked.iloc[0].to_dict()
            best[t] = {"model": row["model"], "auc": row["auc"], "brier": row["brier"],
                       "logloss": row["logloss"], "best_params": {}}

    with open(os.path.join(outdir, "stage1_best_model.json"), "w") as f:
        json.dump(best, f, indent=2)
    print("[Stage 1] Saved:", path)
    print("[Stage 1] Best per target:", best)
    return best, df

def stage2_feature_sweep(best_models, merged_h5, outdir, line_vals, in_line_vals, nr_actions_vals):
    rows = []
    for line_gap in line_vals:
        for in_line_gap in in_line_vals:
            for nr_actions in nr_actions_vals:
                Xe, ye = rebuild_Xy_for_combo(merged_h5, line_gap, in_line_gap, nr_actions)
                groups = Xe["game_id"] if "game_id" in Xe.columns else None
                feat_cols, Xnum = select_feature_columns(Xe)
                Xmat = Xnum[feat_cols].astype(np.float32).values
                for target_name, col in [("score","scores"), ("concede","concedes")]:
                    y = ye[col].astype(int).values
                    mname = best_models[target_name]["model"]
                    model = MODEL_MAKERS[mname]()
                    y_proba = cv_predict_model(model, Xmat, y, groups)
                    metrics = eval_metrics(y, y_proba)
                    rows.append({"line_gap": line_gap, "in_line_gap": in_line_gap, "nr_actions": nr_actions,
                                 "target": target_name, "model": mname, **metrics})
    df = pd.DataFrame(rows)
    path = os.path.join(outdir, "stage2_feature_sweep_metrics.csv"); df.to_csv(path, index=False)
    pivot = df.pivot_table(index=["line_gap","in_line_gap","nr_actions","model"],
                           columns="target", values=RANK_PRIMARY, aggfunc="mean").reset_index()
    if set(["score","concede"]).issubset(pivot.columns):
        pivot["sum_metric"] = -(pivot["score"] + pivot["concede"]) if RANK_PRIMARY=="auc" else (pivot["score"] + pivot["concede"])
        best_idx = pivot["sum_metric"].idxmin()
        best_combo = pivot.loc[best_idx, ["line_gap","in_line_gap","nr_actions","model"]].to_dict()
    else:
        pivot["sum_metric"] = pivot.select_dtypes(include=[np.number]).mean(axis=1)
        best_idx = pivot["sum_metric"].idxmin()
        best_combo = pivot.loc[best_idx, ["line_gap","in_line_gap","nr_actions","model"]].to_dict()
    with open(os.path.join(outdir, "stage2_best_feature_combo.json"), "w") as f:
        json.dump(best_combo, f, indent=2)
    print("[Stage 2] Saved:", path)
    print("[Stage 2] Best feature combo:", best_combo)
    return df, best_combo

def stage3_hyperparam_on_best(best_feature_combo, merged_h5, outdir, n_iter=25):
    Xe, ye = rebuild_Xy_for_combo(merged_h5, best_feature_combo["line_gap"], best_feature_combo["in_line_gap"], best_feature_combo["nr_actions"])
    groups = Xe["game_id"] if "game_id" in Xe.columns else None
    rows = []
    params = PARAM_SPACE[best_feature_combo["model"]]
    for target_name, col in [("score","scores"), ("concede","concedes")]:
        y = ye[col].astype(int).values
        mname = best_feature_combo["model"]
        model = MODEL_MAKERS[mname]()
        feat_cols, Xnum = select_feature_columns(Xe)
        Xmat = Xnum[feat_cols].astype(np.float32).values

        splitter, groups_arg = get_cv_splitter(y, groups)

        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=params,
            n_iter=n_iter,
            scoring="neg_log_loss",
            cv=splitter.split(Xmat, y, groups_arg),
            refit=True,
            n_jobs=-1,
            verbose=0,
            random_state=42
        )
        search.fit(Xmat, y)
        y_proba = search.predict_proba(Xmat)[:,1]
        metrics = eval_metrics(y, y_proba)
        rows.append({"target": target_name, "model": mname,
                     "auc": metrics["auc"], "brier": metrics["brier"], "logloss": metrics["logloss"],
                     "n_features": len(feat_cols),
                     "best_params_json": json.dumps(search.best_params_)})
    df = pd.DataFrame(rows)
    path = os.path.join(outdir, "stage3_hyperparam_results.csv"); df.to_csv(path, index=False)
    print("[Stage 3] Saved:", path)
    return df

def stage4_ablation(best_feature_combo, merged_h5, outdir):
    Xe, ye = rebuild_Xy_for_combo(merged_h5, best_feature_combo["line_gap"], best_feature_combo["in_line_gap"], best_feature_combo["nr_actions"])
    groups = Xe["game_id"] if "game_id" in Xe.columns else None
    rows = []
    for feature_set, drop_flag in [("all", False), ("no_onoff", True)]:
        feat_cols, Xnum = select_feature_columns(Xe, drop_onoff=drop_flag)
        Xmat = Xnum[feat_cols].astype(np.float32).values
        for target_name, col in [("score","scores"), ("concede","concedes")]:
            y = ye[col].astype(int).values
            mname = best_feature_combo["model"]
            model = MODEL_MAKERS[mname]()
            y_proba = cv_predict_model(model, Xmat, y, groups)
            metrics = eval_metrics(y, y_proba)
            rows.append({"feature_set": feature_set, "target": target_name, "model": mname,
                         "auc": metrics["auc"], "brier": metrics["brier"], "logloss": metrics["logloss"],
                         "n_features": len(feat_cols)})
    df = pd.DataFrame(rows)
    path = os.path.join(outdir, "stage4_ablation_metrics.csv"); df.to_csv(path, index=False)
    print("[Stage 4] Saved:", path)
    return df

# High-level runner
def run_pipeline(
    mount_drive: bool = True,
    outputs_dir: str = "/content/drive/MyDrive/VAEP_outputs",
    features_h5: str = "/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_vaep_features.h5",
    labels_h5: str = "/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_vaep_labels.h5",
    feature_names_txt: str = "/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_feature_names.txt",
    merged_h5: str = "/content/drive/My Drive/Data/Processed/euro2020_spadl_merged.h5",
    line_gaps: Tuple[float, ...] = (3.0, 5.0, 7.0),
    in_line_gaps: Tuple[float, ...] = (3.0, 5.0, 7.0),
    nr_actions_vals: Tuple[int, ...] = (5, 10),
    n_iter_stage3: int = 25,
    nb_spadl: Optional[str] = None,
    nb_onoff: Optional[str] = None,
    nb_convert: Optional[str] = None,
    stage: str = "all",  # "all"|"stage1"|"stage2"|"stage3"|"stage4"
):
    ensure_packages()
    refresh_model_registry()
    maybe_mount_drive(mount_drive)
    os.makedirs(outputs_dir, exist_ok=True)

    if nb_spadl is None or nb_onoff is None or nb_convert is None:
        patterns = ["my_SPADL_Converter.ipynb", "On_Offball_Features_Merge.ipynb", "Convert_into_Features_and_Labels.ipynb"]
        auto = auto_locate(patterns)
        nb_spadl, nb_onoff, nb_convert = auto
        print("[INFO] Auto-located notebooks:", auto)

    staging = "/content/nb_imports"
    nb_spadl = sanitize_notebook(nb_spadl, staging) if nb_spadl else None
    nb_onoff = sanitize_notebook(nb_onoff, staging) if nb_onoff else None
    nb_convert = sanitize_notebook(nb_convert, staging) if nb_convert else None

    mod_spadl = import_notebook_module(nb_spadl)
    mod_onoff = import_notebook_module(nb_onoff)
    mod_convert = import_notebook_module(nb_convert)
    adopt_functions_from_notebooks(mod_spadl, mod_onoff, mod_convert)

    ensure_stage1_xy(features_h5, labels_h5, feature_names_txt, merged_h5,
                     default_line_gap=5.0, default_in_line_gap=5.0, default_nr_actions=10)
    # Stage 1
    if stage in ("all","stage1"):
        stage1_model_selection(features_h5, labels_h5, feature_names_txt, outputs_dir)

    s1_json = os.path.join(outputs_dir, "stage1_best_model.json")
    if not os.path.exists(s1_json):
        print("[WARN] stage1_best_model.json not found. Using default {'score':'logreg','concede':'logreg'}")
        best_models = {"score":{"model":"logreg","best_params":{}}, "concede":{"model":"logreg","best_params":{}}}
    else:
        with open(s1_json, "r") as f:
            best_models = json.load(f)

    # Stage 2
    if stage in ("all","stage2"):
        stage2_feature_sweep(best_models, merged_h5, outputs_dir, list(line_gaps), list(in_line_gaps), list(nr_actions_vals))

    s2_json = os.path.join(outputs_dir, "stage2_best_feature_combo.json")
    if not os.path.exists(s2_json):
        print("[WARN] stage2_best_feature_combo.json not found. Using defaults (line=5,in_line=5,n=10, model from score in stage1).")
        best_combo = {"line_gap":5.0, "in_line_gap":5.0, "nr_actions":10, "model": best_models.get("score",{}).get("model","logreg")}
    else:
        with open(s2_json, "r") as f:
            best_combo = json.load(f)

    # Stage 3
    if stage in ("all","stage3"):
        stage3_hyperparam_on_best(best_combo, merged_h5, outputs_dir, n_iter=n_iter_stage3)

    # Stage 4
    if stage in ("all","stage4"):
        stage4_ablation(best_combo, merged_h5, outputs_dir)

    print("\n[Done] Outputs in:", outputs_dir)
    print(" - stage1_model_results.csv / stage1_best_model.json")
    print(" - stage2_feature_sweep_metrics.csv / stage2_best_feature_combo.json")
    print(" - stage3_hyperparam_results.csv")
    print(" - stage4_ablation_metrics.csv")

In [2]:
run_pipeline(
    mount_drive=True,
    outputs_dir="/content/drive/MyDrive/Data/VAEP_outputs",
    features_h5="/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_vaep_features.h5",
    labels_h5="/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_vaep_labels.h5",
    feature_names_txt="/content/drive/MyDrive/Data/Processed/Compute_Features_and_Labels/euro2020_feature_names.txt",
    merged_h5="/content/drive/MyDrive/Data/Processed/euro2020_spadl_merged.h5",
    line_gaps=(3.0,5.0,7.0),
    in_line_gaps=(3.0,5.0,7.0),
    nr_actions_vals=(5,10),
    n_iter_stage3=25,
    nb_spadl="/content/drive/MyDrive/Data/Processed/vaep_code/my_spadl_converter.py",
    nb_onoff="/content/drive/MyDrive/Data/Processed/vaep_code/on_offball_features_Merge.py",
    nb_convert="/content/drive/MyDrive/Data/Processed/vaep_code/convert_into_features_and_labels.py",
    stage="stage4"      # "all"|"stage1"|"stage2"|"stage3"|"stage4"
)

[INFO] CatBoost available. Added to model registry.
Mounted at /content/drive
[INFO] Google Drive mounted at /content/drive
[WARN] Missing notebook functions, will use built-ins for: ['compute_on_offball_features', 'build_features_and_labels']
[INFO] Found existing Stage1 X/y files.
[Stage 4] Saved: /content/drive/MyDrive/Data/VAEP_outputs/stage4_ablation_metrics.csv

[Done] Outputs in: /content/drive/MyDrive/Data/VAEP_outputs
 - stage1_model_results.csv / stage1_best_model.json
 - stage2_feature_sweep_metrics.csv / stage2_best_feature_combo.json
 - stage3_hyperparam_results.csv
 - stage4_ablation_metrics.csv


In [3]:
import os, json, glob, numpy as np, pandas as pd, joblib
from google.colab import drive
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive')

import sys, subprocess

try:
    import catboost
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "catboost"])
    import catboost

from catboost import CatBoostClassifier

def make_catboost():
    return CatBoostClassifier(
        loss_function="Logloss",
        verbose=False,
        random_seed=42,
        iterations=700,
        allow_writing_files=False
    )

MODEL_MAKERS["catboost"] = make_catboost
print("MODEL_MAKERS:", list(MODEL_MAKERS.keys()))

def read_best_combo_and_params(outputs_dir):
    with open(os.path.join(outputs_dir, "stage2_best_feature_combo.json"), "r") as f:
        combo = json.load(f)
    best_params = {"score": {}, "concede": {}}
    s3 = os.path.join(outputs_dir, "stage3_hyperparam_results.csv")
    if os.path.exists(s3):
        df3 = pd.read_csv(s3)
        for t in ["score","concede"]:
            row = df3[df3["target"]==t].iloc[0]
            best_params[t] = json.loads(row["best_params_json"]) if isinstance(row["best_params_json"], str) else {}
    return combo, best_params

def align_columns(X_df: pd.DataFrame, feat_cols):
    Xw = X_df.copy()
    for c in feat_cols:
        if c not in Xw.columns:
            Xw[c] = 0.0
    return Xw[feat_cols]

def feature_columns_by_mode(X: pd.DataFrame, mode: str):
    assert mode in ("all","ononly","no_onoff")
    feat_cols, Xnum = select_feature_columns(X, drop_onoff=False)
    if "onball_contribution_score" in feat_cols and mode in ("no_onoff",):
        feat_cols.remove("onball_contribution_score")
    if "offball_contribution_score" in feat_cols and mode in ("ononly","no_onoff"):
        feat_cols.remove("offball_contribution_score")

    return feat_cols, Xnum

def stage5_train_final_on_euro(outputs_dir, euro_merged_h5):
    combo, best_params = read_best_combo_and_params(outputs_dir)

    # EURO 2020 features/labels (regenerated with optimal feature parameters)
    Xe, ye = rebuild_Xy_for_combo(
        merged_h5=euro_merged_h5,
        line_gap=float(combo["line_gap"]),
        in_line_gap=float(combo["in_line_gap"]),
        nr_actions=int(combo["nr_actions"]),
    )

    artifacts = {}
    feature_files = {}

    for mode in ("all","ononly","no_onoff"):
        feat_cols, Xnum = feature_columns_by_mode(Xe, mode)
        Xmat = Xnum[feat_cols].astype(np.float32).values
        feature_files[mode] = f"final_feature_columns_{mode}.json"
        with open(os.path.join(outputs_dir, feature_files[mode]), "w") as f:
            json.dump({"feature_columns": feat_cols}, f, indent=2)

        for target_name, col in [("score","scores"), ("concede","concedes")]:
            y = ye[col].astype(int).values
            mname = combo["model"]
            model = MODEL_MAKERS[mname]()
            params = best_params.get(target_name, {})
            if params:
                try: model.set_params(**params)
                except Exception as e: print(f"[Stage 5][WARN] set_params failed ({mode}/{mname}/{target_name}): {e}")
            model.fit(Xmat, y)
            mdl_path = os.path.join(outputs_dir, f"final_{mname}_{mode}_{target_name}.pkl")
            joblib.dump(model, mdl_path)
            artifacts[f"{mode}_{target_name}"] = {"model_name": mname, "path": mdl_path}

    with open(os.path.join(outputs_dir, "final_feature_combo.json"), "w") as f:
        json.dump(combo, f, indent=2)
    with open(os.path.join(outputs_dir, "final_best_params.json"), "w") as f:
        json.dump(best_params, f, indent=2)

    print("[Stage 5] Saved models & meta:", artifacts)
    print("[Stage 5] Feature lists:", feature_files)
    return artifacts, feature_files, combo



MODEL_MAKERS: ['xgb', 'logreg', 'catboost']


In [4]:
# 5) EURO 2020 fit (ALL / ON-ONLY / NO-ONOFF)
artifacts, feature_files, combo = stage5_train_final_on_euro(
    outputs_dir="/content/drive/MyDrive/Data/VAEP_outputs",
    euro_merged_h5="/content/drive/MyDrive/Data/Processed/euro2020_spadl_merged.h5"
)

[Stage 5] Saved models & meta: {'all_score': {'model_name': 'catboost', 'path': '/content/drive/MyDrive/Data/VAEP_outputs/final_catboost_all_score.pkl'}, 'all_concede': {'model_name': 'catboost', 'path': '/content/drive/MyDrive/Data/VAEP_outputs/final_catboost_all_concede.pkl'}, 'ononly_score': {'model_name': 'catboost', 'path': '/content/drive/MyDrive/Data/VAEP_outputs/final_catboost_ononly_score.pkl'}, 'ononly_concede': {'model_name': 'catboost', 'path': '/content/drive/MyDrive/Data/VAEP_outputs/final_catboost_ononly_concede.pkl'}, 'no_onoff_score': {'model_name': 'catboost', 'path': '/content/drive/MyDrive/Data/VAEP_outputs/final_catboost_no_onoff_score.pkl'}, 'no_onoff_concede': {'model_name': 'catboost', 'path': '/content/drive/MyDrive/Data/VAEP_outputs/final_catboost_no_onoff_concede.pkl'}}
[Stage 5] Feature lists: {'all': 'final_feature_columns_all.json', 'ononly': 'final_feature_columns_ononly.json', 'no_onoff': 'final_feature_columns_no_onoff.json'}


In [5]:
# Stage 7
def load_wc2022_actions_index(merged_h5):
    rows = []
    with pd.HDFStore(merged_h5, mode='r') as store:
        keys = [k for k in store.keys() if k.startswith('/actions/')]
        for k in keys:
            df = store[k].copy()
            if 'index' not in df.columns:
                df = df.reset_index().rename(columns={'index':'index'})
            out = pd.DataFrame({
                'event_uuid': df.get('event_uuid', np.nan),
                'game_id'   : df.get('game_id',   df.get('match_id', np.nan)),
                'team_id'   : df.get('team_id',   np.nan),
                'player_id' : df.get('player_id', np.nan),
                'action_type': df.get('action_type', np.nan),
                'period_id' : df.get('period_id',  df.get('period', np.nan)),
                'seconds'   : df.get('seconds',    df.get('time_seconds', df.get('time', np.nan))),
                'order_idx' : df['index'],
            })
            out['_sort_key1'] = out['period_id'].fillna(0)
            out['_sort_key2'] = out['seconds'].fillna(0)
            out['_sort_key3'] = out['order_idx'].fillna(0)
            rows.append(out)
    return pd.concat(rows, axis=0, ignore_index=True)

def stage7_compute_vaep_actions_and_players(outputs_dir, wc2022_merged_h5,
                                            predictions_csv, save_prefix="wc2022_all"):
    if not os.path.exists(predictions_csv):
        raise FileNotFoundError(f"Predictions not found: {predictions_csv}")

    pred = pd.read_csv(predictions_csv)
    need_cols = {'event_uuid','game_id','p_score','p_concede'}
    missing = need_cols - set(pred.columns)
    if missing:
        raise ValueError(f"Missing columns in predictions: {missing}")

    actions = load_wc2022_actions_index(wc2022_merged_h5)

    df = actions.merge(pred[['event_uuid','p_score','p_concede']], on='event_uuid', how='left')
    df = df.dropna(subset=['p_score','p_concede']).copy()
    df = df.sort_values(['game_id','_sort_key1','_sort_key2','_sort_key3']).reset_index(drop=True)

    # current - previous
    grp = df.groupby('game_id', sort=False, group_keys=False)
    df['p_score_prev']   = grp['p_score'].shift(1)
    df['p_concede_prev'] = grp['p_concede'].shift(1)
    for c in ['p_score_prev','p_concede_prev']:
        miss = df[c].isna()
        if miss.any():
            base = c.replace('_prev','')
            df.loc[miss, c] = df.loc[miss, base]

    df["offensive_value"] = df["p_score"]   - df["p_score_prev"]
    df["defensive_value"] = -(df["p_concede"] - df["p_concede_prev"])
    df["vaep_value"] = df["offensive_value"] + df["defensive_value"]

    actions_out = os.path.join(outputs_dir, f"{save_prefix}_vaep_actions.csv")
    cols_out = ['event_uuid','game_id','team_id','player_id','action_type',
                'p_score','p_concede','p_score_prev','p_concede_prev',
                'offensive_value','defensive_value','vaep_value']
    df[cols_out].to_csv(actions_out, index=False)

    players = (df.groupby(['player_id','team_id'], dropna=False)['vaep_value']
                 .agg(total_vaep='sum', mean_vaep='mean', n_actions='count')
                 .reset_index()
                 .sort_values('total_vaep', ascending=False))
    teams = (df.groupby('team_id', dropna=False)['vaep_value']
               .agg(total_vaep='sum', mean_vaep='mean', n_actions='count')
               .reset_index()
               .sort_values('total_vaep', ascending=False))

    players_out = os.path.join(outputs_dir, f"{save_prefix}_vaep_players.csv")
    teams_out   = os.path.join(outputs_dir, f"{save_prefix}_vaep_teams.csv")
    players.to_csv(players_out, index=False)
    teams.to_csv(teams_out, index=False)

    print("[Stage 7] Saved:")
    print(" - Actions:", actions_out)
    print(" - Players:", players_out)
    print(" - Teams  :", teams_out)
    return actions_out, players_out, teams_out

def stage6_predict_on_wc2022_composed(outputs_dir, wc2022_merged_h5, mode="all"):
    with open(os.path.join(outputs_dir, "final_feature_combo.json"), "r") as f:
        combo = json.load(f)
    Xw, _ = rebuild_Xy_for_combo(
        merged_h5=wc2022_merged_h5,
        line_gap=float(combo["line_gap"]),
        in_line_gap=float(combo["in_line_gap"]),
        nr_actions=int(combo["nr_actions"]),
    )

    def load_featcols(track):
        with open(os.path.join(outputs_dir, f"final_feature_columns_{track}.json"), "r") as f:
            return json.load(f)["feature_columns"]

    featcols_score = load_featcols(mode)
    featcols_concede = load_featcols("no_onoff")

    def X_for(track, featcols):
        _, Xnum = feature_columns_by_mode(Xw, track)
        Xmat_df = Xnum.copy()
        for c in featcols:
            if c not in Xmat_df.columns:
                Xmat_df[c] = 0.0
        Xmat = Xmat_df[featcols].astype(np.float32).values
        return Xmat

    X_score   = X_for(mode,       featcols_score)
    X_concede = X_for("no_onoff", featcols_concede)

    def _load_model(track, target):
        pat = f"final_*_{track}_{target}.pkl"
        hits = glob.glob(os.path.join(outputs_dir, pat))
        if not hits:
            raise FileNotFoundError(f"Model not found for track={track}, target={target}")
        return joblib.load(hits[0])

    mdl_score   = _load_model(mode,       "score")
    mdl_concede = _load_model("no_onoff", "concede")

    pred = pd.DataFrame({"event_uuid": Xw["event_uuid"], "game_id": Xw.get("game_id", np.nan)})

    p_s = mdl_score.predict_proba(X_score)[:,1]
    p_c = mdl_concede.predict_proba(X_concede)[:,1]
    p_s = np.clip(np.nan_to_num(p_s, nan=0.5, posinf=1-1e-7, neginf=1e-7), 1e-7, 1-1e-7)
    p_c = np.clip(np.nan_to_num(p_c, nan=0.5, posinf=1-1e-7, neginf=1e-7), 1e-7, 1-1e-7)

    pred["p_score"] = p_s
    pred["p_concede"] = p_c
    pred["vaep_proxy"] = pred["p_score"] - pred["p_concede"]

    out_csv = os.path.join(outputs_dir, f"wc2022_predictions_{mode}_COMPOSED.csv")
    pred.to_csv(out_csv, index=False)
    print(f"[Stage 6*] Saved composed predictions: {out_csv}")
    return out_csv

import os, json, numpy as np, pandas as pd
try:
    from sklearn.metrics import roc_auc_score, log_loss as sk_log_loss
except Exception:
    roc_auc_score, sk_log_loss = None, None

def safe_auc(y, p):
    y = np.asarray(y, dtype=int)
    if len(np.unique(y)) < 2 or roc_auc_score is None:
        return np.nan
    return float(roc_auc_score(y, p))

def safe_logloss(y, p, eps=1e-7):
    p = np.clip(np.nan_to_num(p, nan=0.5), eps, 1 - eps)
    if sk_log_loss is not None:
        return float(sk_log_loss(y, p))
    y = np.asarray(y, dtype=float)
    return float(-(y * np.log(p) + (1 - y) * np.log(1 - p)).mean())

def safe_brier(y, p):
    y = np.asarray(y, dtype=float); p = np.asarray(p, dtype=float)
    return float(np.mean((y - p) ** 2))

def evaluate_wc2022_predictions(outputs_dir, wc2022_merged_h5, predictions_csv, save_suffix=None):
    with open(os.path.join(outputs_dir, "final_feature_combo.json"), "r") as f:
        combo = json.load(f)
    Xw, yw = rebuild_Xy_for_combo(
        merged_h5=wc2022_merged_h5,
        line_gap=float(combo["line_gap"]),
        in_line_gap=float(combo["in_line_gap"]),
        nr_actions=int(combo["nr_actions"]),
    )
    lab = Xw[["event_uuid"]].copy()
    lab["y_score"]   = yw["scores"].astype(int).values
    lab["y_concede"] = yw["concedes"].astype(int).values

    pred = pd.read_csv(predictions_csv)[["event_uuid","p_score","p_concede"]]
    df = lab.merge(pred, on="event_uuid", how="inner")

    rows = []
    for target, ycol, pcol in [("score","y_score","p_score"),
                               ("concede","y_concede","p_concede")]:
        y = df[ycol].values; p = df[pcol].values
        rows.append({
            "target": target,
            "n": int(len(y)),
            "auc":    safe_auc(y, p),
            "brier":  safe_brier(y, p),
            "logloss": safe_logloss(y, p),
        })
    res = pd.DataFrame(rows)

    if save_suffix is None:
        base = os.path.basename(predictions_csv).replace(".csv","")
        save_suffix = base.replace("wc2022_predictions_","")
    out_csv = os.path.join(outputs_dir, f"wc2022_eval_{save_suffix}.csv")
    res.to_csv(out_csv, index=False)
    print(f"[EVAL] saved → {out_csv}")
    return out_csv

def stage6_predict_and_eval_composed(outputs_dir, wc2022_merged_h5, modes=("all","ononly","no_onoff")):
    paths = {}
    for m in modes:
        pred_csv = stage6_predict_on_wc2022_composed(outputs_dir, wc2022_merged_h5, mode=m)
        eval_csv = evaluate_wc2022_predictions(outputs_dir, wc2022_merged_h5, pred_csv)
        paths[m] = {"pred": pred_csv, "eval": eval_csv}
    return paths

In [6]:
OUT = "/content/drive/MyDrive/Data/VAEP_outputs"
H5  = "/content/drive/MyDrive/Data/Processed/worldcup2022_spadl_merged.h5"

# 6)
paths = stage6_predict_and_eval_composed(
    outputs_dir=OUT,
    wc2022_merged_h5=H5,
    modes=("all","ononly","no_onoff")
)

# Pred CSV path
pred_all  = paths["all"]["pred"]
pred_on   = paths["ononly"]["pred"]
pred_base = paths["no_onoff"]["pred"]

# 7) VAEP Calculate
actions_all, players_all, teams_all = stage7_compute_vaep_actions_and_players(
    outputs_dir=OUT,
    wc2022_merged_h5=H5,
    predictions_csv=pred_all,
    save_prefix="wc2022_all_COMPOSED"
)

actions_on, players_on, teams_on = stage7_compute_vaep_actions_and_players(
    outputs_dir=OUT,
    wc2022_merged_h5=H5,
    predictions_csv=pred_on,
    save_prefix="wc2022_ononly_COMPOSED"
)

actions_base, players_base, teams_base = stage7_compute_vaep_actions_and_players(
    outputs_dir=OUT,
    wc2022_merged_h5=H5,
    predictions_csv=pred_base,
    save_prefix="wc2022_no_onoff_COMPOSED"
)

print("[EVAL CSVs]")
print(paths["all"]["eval"])
print(paths["ononly"]["eval"])
print(paths["no_onoff"]["eval"])

[Stage 6*] Saved composed predictions: /content/drive/MyDrive/Data/VAEP_outputs/wc2022_predictions_all_COMPOSED.csv
[EVAL] saved → /content/drive/MyDrive/Data/VAEP_outputs/wc2022_eval_all_COMPOSED.csv
[Stage 6*] Saved composed predictions: /content/drive/MyDrive/Data/VAEP_outputs/wc2022_predictions_ononly_COMPOSED.csv
[EVAL] saved → /content/drive/MyDrive/Data/VAEP_outputs/wc2022_eval_ononly_COMPOSED.csv
[Stage 6*] Saved composed predictions: /content/drive/MyDrive/Data/VAEP_outputs/wc2022_predictions_no_onoff_COMPOSED.csv
[EVAL] saved → /content/drive/MyDrive/Data/VAEP_outputs/wc2022_eval_no_onoff_COMPOSED.csv
[Stage 7] Saved:
 - Actions: /content/drive/MyDrive/Data/VAEP_outputs/wc2022_all_COMPOSED_vaep_actions.csv
 - Players: /content/drive/MyDrive/Data/VAEP_outputs/wc2022_all_COMPOSED_vaep_players.csv
 - Teams  : /content/drive/MyDrive/Data/VAEP_outputs/wc2022_all_COMPOSED_vaep_teams.csv
[Stage 7] Saved:
 - Actions: /content/drive/MyDrive/Data/VAEP_outputs/wc2022_ononly_COMPOSED_vae

In [7]:
try:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
except Exception:
    pass

import os, glob, pandas as pd
pd.set_option("display.max_columns", 200)

PATTERNS = [
    "/content/drive/**/wc2022_eval_*_COMPOSED.csv",
    "/content/drive/**/wc2022_eval_*.csv",
]

hits = []
for pat in PATTERNS:
    hits += glob.glob(pat, recursive=True)

hits = sorted(set(hits), key=os.path.getmtime)
if not hits:
    raise FileNotFoundError("Can't find CSV")

print("[FOUND eval CSV files]")
for p in hits:
    print(" -", p)

dfs = []
for f in hits:
    mode = os.path.basename(f).replace("wc2022_eval_","").replace(".csv","")
    df = pd.read_csv(f)
    df.insert(0, "mode", mode)
    dfs.append(df)

all_eval = pd.concat(dfs, ignore_index=True)

disp = all_eval.copy()
for c in ["auc", "brier", "logloss"]:
    if c in disp.columns:
        disp[c] = disp[c].round(6)

print("\n=== Evaluation metrics (mode x target) ===")
display(disp.sort_values(["target","logloss","brier","auc"], ascending=[True,True,True,False]))

# Pivot comparison
print("=== AUC pivot ===")
display(all_eval.pivot_table(index="target", columns="mode", values="auc").round(6))
print("=== Brier pivot ===")
display(all_eval.pivot_table(index="target", columns="mode", values="brier").round(6))
print("=== LogLoss pivot ===")
display(all_eval.pivot_table(index="target", columns="mode", values="logloss").round(6))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[FOUND eval CSV files]
 - /content/drive/MyDrive/Data/VAEP_outputs/wc2022_eval_all_COMPOSED.csv
 - /content/drive/MyDrive/Data/VAEP_outputs/wc2022_eval_ononly_COMPOSED.csv
 - /content/drive/MyDrive/Data/VAEP_outputs/wc2022_eval_no_onoff_COMPOSED.csv

=== Evaluation metrics (mode x target) ===


Unnamed: 0,mode,target,n,auc,brier,logloss
1,all_COMPOSED,concede,120465,0.811575,0.001369,0.009392
3,ononly_COMPOSED,concede,120465,0.811575,0.001369,0.009392
5,no_onoff_COMPOSED,concede,120465,0.811575,0.001369,0.009392
2,ononly_COMPOSED,score,120465,0.847471,0.005832,0.030894
0,all_COMPOSED,score,120465,0.846751,0.005835,0.030925
4,no_onoff_COMPOSED,score,120465,0.842992,0.005844,0.031025


=== AUC pivot ===


mode,all_COMPOSED,no_onoff_COMPOSED,ononly_COMPOSED
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
concede,0.811575,0.811575,0.811575
score,0.846751,0.842992,0.847471


=== Brier pivot ===


mode,all_COMPOSED,no_onoff_COMPOSED,ononly_COMPOSED
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
concede,0.001369,0.001369,0.001369
score,0.005835,0.005844,0.005832


=== LogLoss pivot ===


mode,all_COMPOSED,no_onoff_COMPOSED,ononly_COMPOSED
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
concede,0.009392,0.009392,0.009392
score,0.030925,0.031025,0.030894
