### Family: Gender only (Indoors + Outdoors together)
### Hierarchical LSTM:
- Season Encoder LSTM over mark-level sequences -> season embedding
- Add SeasonLabel embedding (Indoor/Outdoor) to each season embedding
- Across-Season LSTM over K season embeddings -> athlete + family representation
- Per-event heads (Linear) with per-event z-scored targets; report MAE in natural/native units
### Sliding Windows:
For each athlete & event_family, take up to K past seasons ending at season t (Indoor or Outdoor), and predict the peak in the next season with the SAME label (t+1, same SeasonLabel).

# Configuration

In [3]:
import argparse, os, re, json, random
from datetime import datetime
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
from types import SimpleNamespace


NPY_PATH_DEFAULT = r"C:\Users\danie\OneDrive\Desktop\FlightPhase\tfrrs_performances_fast.ckpt_22000.npy"
OUT_DIR_DEFAULT = "models_HierGenderFamilies"
CACHE_SUBDIR        = "cache"

SEQLEN_MARKS = 32    # marks kept per season
K_SEASONS = 4        # seasons kept in the window (history) → predict next season (same label)
HID_SEASON = 64      # hidden size of season (mark-level) LSTM
HID_ACROSS = 64      # hidden size of across-season LSTM
EMB_LABEL = 4        # embedding dim for SeasonLabel (Indoor/Outdoor)
BATCH = 128
EPOCHS = 30
LR = 1e-3
MIN_TRAIN_PER_EVENT = 60   # filter rare events inside each gender
SEED = 42

SEASONLABEL_ORDER = {"Indoors": 0, "Outdoors": 1}
SEASONLABEL_ID = {"Indoors": 0, "Outdoors": 1}
SEASONLABEL_FROM_ID = {0: "Indoors", 1: "Outdoors"}

SystemError: <class 'numpy.iinfo'> returned a result with an exception set

# Utility Functions

In [2]:
def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.manual_seed(seed)

RUN_EVENTS = {
    "60 Meters","100 Meters","200 Meters","400 Meters","800 Meters","1500 Meters",
    "Mile","3000 Meters","5000 Meters","10000 Meters","60 Hurdles","110 Hurdles","100 Hurdles",
    "400 Hurdles","3000 Steeplechase","DMR","4 x 100 Relay","4 x 400 Relay"
}
FIELD_EVENTS = {
    "Long Jump","Triple Jump","High Jump","Pole Vault",
    "Shot Put","Discus","Hammer","Javelin","Weight Throw"
}

def canonical_event(e: str) -> str:
    if e in RUN_EVENTS or e in FIELD_EVENTS: return e
    el = (e or "").strip().lower()
    alias = {
        "60m":"60 Meters","100m":"100 Meters","200m":"200 Meters","400m":"400 Meters",
        "800m":"800 Meters","1500m":"1500 Meters","mile":"Mile",
        "3k":"3000 Meters","5k":"5000 Meters","10k":"10000 Meters",
        "60h":"60 Hurdles","110h":"110 Hurdles","100h":"100 Hurdles",
        "4x1":"4 x 100 Relay","4x4":"4 x 400 Relay"
    }
    return alias.get(el, e)

def parse_time_to_seconds(s: str) -> float | None:
    if not s: return None
    sl = s.strip().lower()
    if sl in {"dnf","dq","fs","nt","ns"}: return None
    s = s.replace(" ", "")
    if ":" in s:
        parts = s.split(":")
        try:
            if len(parts)==2: m,sec = int(parts[0]), float(parts[1]); return m*60+sec
            if len(parts)==3: h,m,sec = int(parts[0]),int(parts[1]),float(parts[2]); return h*3600+m*60+sec
        except: return None
    try: return float(s)
    except: return None

def parse_distance_to_meters(s: str) -> float | None:
    if not s: return None
    sl = s.strip().lower()
    if sl in {"nm"}: return None
    if re.match(r"^\d{1,3}-\d{1,2}(\.\d+)?$", sl):  # ft-in
        ft, inch = sl.split("-")
        try:
            total_inches = int(ft)*12 + float(inch)
            return total_inches * 0.0254
        except: return None
    if sl.endswith("m"): sl = sl[:-1]
    try: return float(sl)
    except: return None

def mark_to_numeric(event: str, mark: str) -> tuple[float|None, bool]:
    if event in RUN_EVENTS:   return parse_time_to_seconds(mark), True
    if event in FIELD_EVENTS: return parse_distance_to_meters(mark), False
    t = parse_time_to_seconds(mark)
    if t is not None and (":" in (mark or "") or t < 30): return t, True
    d = parse_distance_to_meters(mark)
    return ((d, False) if d is not None else (None, True))

def safe_slug(s: str) -> str:
    return re.sub(r"[^A-Za-z0-9]+", "_", s).strip("_")

# Event-Family Mapping
### Families let Indoor/Outdoor analogs share the same timeline:
- short_sprint: 60m(Indoor) <-> 100m(Outdoor)
- short_hurdles: 60H(Indoor) <-> 110H men / 100H women (Outdoor)
- "same name" events (200, 400, TJ, etc.) map to themselves as families

In [3]:
SAME_NAME_FAMILIES = {
    "200 Meters","400 Meters","800 Meters","3000 Meters","5000 Meters","10000 Meters",
    "400 Hurdles","3000 Steeplechase","4 x 400 Relay",
    "Long Jump","Triple Jump","High Jump","Pole Vault",
    "Shot Put","Discus","Hammer","Javelin","Weight Throw"
}

def event_to_family(event: str, gender: str) -> str:
    e = canonical_event(event)
    if e in SAME_NAME_FAMILIES or e in {"DMR"}:
        return e  # family == event
    # special pairings
    if e in {"60 Meters","100 Meters"}:
        return "short_sprint"
    if e == "Mile" or e == "1500 Meters":
        return "metric_mile"
    if e == "60 Hurdles":
        return "short_hurdles"
    if e == "110 Hurdles" and gender.strip().lower().startswith("men"):
        return "short_hurdles"
    if e == "100 Hurdles" and gender.strip().lower().startswith("women"):
        return "short_hurdles"
    # else fallback
    return e

# Data Pre-Processing

In [None]:
def load_df(npy_path: str) -> pd.DataFrame:
    arr = np.load(npy_path, allow_pickle=False)
    df = pd.DataFrame.from_records(arr)
    keep = ["Division","School","Gender","SeasonLabel","SeasonYear","Event",
            "Athlete","MarkOrTime","Wind","Meet","MeetDate"]
    df = df[keep].copy()
    for c in keep: df[c] = df[c].astype("string").str.strip()
    df["Event"] = df["Event"].map(canonical_event)
    df["date"]  = pd.to_datetime(df["MeetDate"], errors="coerce")
    df = df.dropna(subset=["date","Event","Athlete","SeasonLabel","SeasonYear"])

    # numeric mark and orientation
    val_is_time = df.apply(lambda r: mark_to_numeric(r["Event"], r["MarkOrTime"]), axis=1, result_type="expand")
    df["value"]   = val_is_time[0]
    df["is_time"] = val_is_time[1]
    df = df.dropna(subset=["value"])

    # event-family column
    df["family"] = df.apply(lambda r: event_to_family(r["Event"], r["Gender"]), axis=1)

    # keys
    df["ath_key"] = (df["Athlete"].str.lower().str.strip() + "||" + df["Gender"].str.strip())
    df["fam_key"] = (df["ath_key"] + "||" + df["family"])
    df["season_key"] = df["SeasonYear"].astype("Int64").astype("string") + "-" + df["SeasonLabel"]

    return df

def agg_peak(g: pd.DataFrame) -> float:
    # family-level season peak: min for times, max for distances
    return g["value"].min() if g["is_time"].iloc[0] else g["value"].max()

def build_family_season_peaks(df: pd.DataFrame) -> pd.DataFrame:
    # one row per (fam_key, season)
    peaks = (df.groupby(["fam_key","season_key"], as_index=False)
               .apply(agg_peak).rename(columns={None:"peak_value"}))
    meta = (df.groupby(["fam_key","season_key"], as_index=False)
              .agg(SeasonYear=("SeasonYear","first"),
                   SeasonLabel=("SeasonLabel","first"),
                   Gender=("Gender","first"),
                   family=("family","first"),
                   is_time=("is_time","first")))
    peaks = peaks.merge(meta, on=["fam_key","season_key"], how="left")
    return peaks

def build_mark_sequences_by_family(df: pd.DataFrame, seqlen=SEQLEN_MARKS):
    """
    Precompute, for each (fam_key, season_key), a mark-level sequence X[L,F] and length L.
    Use ONLY rows from that family in that season (e.g., short_sprint == 60m indoor, 100m outdoor).
    """
    df = df.sort_values(["fam_key","season_key","date"]).reset_index(drop=True)
    seqs = {}
    lens = {}
    # also record the ACTUAL EVENT used for this family+season (e.g., "60 Meters" vs "100 Meters")
    evt_used = {}
    for (fk, sk), g in df.groupby(["fam_key","season_key"]):
        vals = g["value"].to_numpy()
        days = g["date"].diff().dt.days.fillna(0).clip(lower=0).to_numpy()
        wind = pd.to_numeric(g["Wind"].str.replace("+","", regex=False).str.replace("m/s","", regex=False),
                             errors="coerce").fillna(0.0).to_numpy()
        X = np.stack([vals, days, wind], axis=1).astype("float32")
        l = len(X)
        if l > seqlen:
            X = X[-seqlen:]
            l = seqlen
        else:
            pad = np.zeros((seqlen-l, X.shape[1]), dtype="float32")
            X = np.vstack([pad, X])
        seqs[(fk,sk)] = X
        lens[(fk,sk)] = l
        # pick the most frequent event string used in this family/season block (should be unique)
        evt_used[(fk,sk)] = g["Event"].value_counts().idxmax()
    return seqs, lens, evt_used

def build_windows_gender_family(df: pd.DataFrame, peaks: pd.DataFrame,
                                seqs: dict, lens: dict, evt_used: dict,
                                k_seasons=K_SEASONS):
    """
    Build sliding windows of up to K past seasons to predict NEXT season with the SAME label as the last season in window.
    Grouped by fam_key (athlete+gender+event_family). Returns:
      X[N,K,L,F], L[N,K], label_ids[N,K], y_raw[N], evt_name[N], evt_id placeholder later, years_t[N]
    """
    # sort seasons within each fam_key by (Year, label order)
    peaks = peaks.sort_values(["fam_key","SeasonYear","SeasonLabel"], key=lambda s: s.map(SEASONLABEL_ORDER) if s.name=="SeasonLabel" else s)
    # collect season list per fam_key
    seasons_by_fk = {}
    label_by_fk_sk = {}
    year_by_fk_sk = {}
    for (fk), g in peaks.groupby(["fam_key"]):
        g = g.sort_values(["SeasonYear","SeasonLabel"], key=lambda s: s.map(SEASONLABEL_ORDER) if s.name=="SeasonLabel" else s)
        seasons = list(g["season_key"])
        seasons_by_fk[fk] = seasons
        for _, r in g.iterrows():
            label_by_fk_sk[(fk, r["season_key"])] = r["SeasonLabel"]
            year_by_fk_sk[(fk, r["season_key"])]  = int(r["SeasonYear"])

    # quick lookup for y_next (same label next year)
    peak_map = {(r["fam_key"], r["season_key"]): float(r["peak_value"]) for _, r in peaks.iterrows()}

    X_list, L_list, lab_list, y_list, evt_list, years_t = [], [], [], [], [], []

    for fk, seasons in seasons_by_fk.items():
        if len(seasons) == 0: continue
        # iterate windows which have a NEXT season of same label
        for i in range(len(seasons)):
            sk_t = seasons[i]
            lab_t = label_by_fk_sk[(fk, sk_t)]
            yr_t  = year_by_fk_sk[(fk, sk_t)]
            # next same label (t+1, same SeasonLabel)
            next_key = f"{yr_t+1}-{lab_t}"
            if (fk, next_key) not in peak_map:
                continue  # cannot make a labeled sample

            # window of up to K seasons ending at i
            start = max(0, i-(k_seasons-1))
            window = seasons[start:i+1]
            # left pad to K
            pad_n = k_seasons - len(window)
            window = ["__PAD__"]*pad_n + window

            # assemble blocks
            blocks, lengths, label_ids = [], [], []
            for sk in window:
                if sk == "__PAD__":
                    blocks.append(np.zeros((SEQLEN_MARKS, 3), dtype="float32"))
                    lengths.append(0)
                    label_ids.append(0)  # dummy; will be masked by length==0
                else:
                    blocks.append(seqs.get((fk, sk), np.zeros((SEQLEN_MARKS,3), dtype="float32")))
                    lengths.append(lens.get((fk, sk), 0))
                    label_ids.append(SEASONLABEL_ID.get(label_by_fk_sk[(fk, sk)], 0))
            Xk = np.stack(blocks, axis=0)              # [K,L,F]
            Lk = np.array(lengths, dtype=np.int64)     # [K]
            Labk = np.array(label_ids, dtype=np.int64) # [K]

            # label in native units
            y_next = peak_map[(fk, next_key)]
            # event head to use = the ACTUAL EVENT name for the LAST season in window (sk_t)
            ev_here = evt_used.get((fk, sk_t), None)
            if ev_here is None:  # should not happen; but skip just in case
                continue

            X_list.append(Xk)
            L_list.append(Lk)
            lab_list.append(Labk)
            y_list.append(y_next)
            evt_list.append(ev_here)
            years_t.append(yr_t)

    if not X_list:
        return None
    X = np.stack(X_list)                      # [N,K,L,F]
    L = np.stack(L_list)                      # [N,K]
    LAB = np.stack(lab_list)                  # [N,K]
    y = np.array(y_list, dtype="float32")     # [N]
    events = np.array(evt_list)
    years_t = np.array(years_t, dtype=np.int32)
    return X, L, LAB, y, events, years_t

# DataSet

In [5]:
class HierDS(Dataset):
    def __init__(self, X, L, LAB, evt_id, y_z):
        self.X = torch.tensor(X, dtype=torch.float32)       # [N,K,L,F]
        self.L = torch.tensor(L, dtype=torch.long)          # [N,K]
        self.LAB = torch.tensor(LAB, dtype=torch.long)      # [N,K]
        self.e = torch.tensor(evt_id, dtype=torch.long)     # [N]
        self.y = torch.tensor(y_z, dtype=torch.float32)     # [N]
    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.X[i], self.L[i], self.LAB[i], self.e[i], self.y[i]

# Architecture

In [6]:
class HierGenderFamilies(nn.Module):
    def __init__(self, in_dim, hid_season, hid_across, n_events, emb_label_dim=4):
        super().__init__()
        self.season_lstm = nn.LSTM(in_dim, hid_season, batch_first=True)
        self.label_emb = nn.Embedding(num_embeddings=2, embedding_dim=emb_label_dim)  # 0=Indoor, 1=Outdoor
        self.across_lstm = nn.LSTM(hid_season + emb_label_dim, hid_across, batch_first=True)
        self.heads = nn.ModuleList([nn.Sequential(nn.LayerNorm(hid_across), nn.Linear(hid_across, 1))
                                    for _ in range(n_events)])

    def forward(self, x, l_season, lab_ids, evt_id):
        """
        x: [B,K,L,F]
        l_season: [B,K] (lengths per season)
        lab_ids: [B,K] (0=Indoor,1=Outdoor) -- for non-PAD seasons
        evt_id: [B] (which event head to use; defined by last season's event)
        """
        B, K, L, F = x.shape
        # ---- season encoder ----
        x_flat = x.view(B*K, L, F)                   # [B*K,L,F]
        l_flat = l_season.view(B*K)                  # [B*K]
        l_flat = torch.clamp(l_flat, min=1)          # pack() disallows zero; PAD will be all-zeros anyway
        packed = pack_padded_sequence(x_flat, lengths=l_flat.cpu(), batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.season_lstm(packed)       # h_n: [1, B*K, hid_season]
        season_emb = h_n[-1].view(B, K, -1)          # [B,K,hid_season]

        # ---- add SeasonLabel embedding to each season ----
        lab_emb = self.label_emb(lab_ids)            # [B,K,emb_label]
        season_plus = torch.cat([season_emb, lab_emb], dim=-1)  # [B,K,hid_season+emb_label]

        # ---- across-season encoder ----
        # effective K lengths = number of non-empty seasons
        k_len = (l_season > 0).sum(dim=1)            # [B]
        k_len = torch.clamp(k_len, min=1)
        packed_k = pack_padded_sequence(season_plus, lengths=k_len.cpu(), batch_first=True, enforce_sorted=False)
        _, (h2, _) = self.across_lstm(packed_k)      # h2: [1,B,hid_across]
        fam_emb = h2[-1]                              # [B,hid_across]

        # ---- event-specific heads ----
        preds = torch.empty(B, device=x.device, dtype=torch.float32)
        for eid in torch.unique(evt_id):
            mask = (evt_id == eid)
            preds[mask] = self.heads[int(eid)](fam_emb[mask]).squeeze(-1)
        return preds

# PreCompute and Cahce

In [1]:
import os, json, random
from datetime import datetime
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch

# Prefer fastparquet if available; never import pyarrow to avoid extension clashes
try:
    import fastparquet  # noqa: F401
    _HAS_FASTPARQUET = True
except Exception:
    _HAS_FASTPARQUET = False

CACHE_SUBDIR = "cache"

def _pkl_path(parquet_path: str) -> str:
    root, _ = os.path.splitext(parquet_path)
    return root + ".pkl"

def deflate_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize to plain dtypes (no Pandas/Arrow extension types)."""
    out = df.copy()
    import pandas.api.types as ptypes
    for c in out.columns:
        s = out[c]
        # handle pandas Period explicitly
        if getattr(s.dtype, "kind", None) is None and str(s.dtype).startswith("period["):
            out[c] = s.astype("string").astype(str)
        elif ptypes.is_datetime64_any_dtype(s):
            out[c] = pd.to_datetime(s).astype("datetime64[ns]")
        elif ptypes.is_integer_dtype(s):
            out[c] = s.astype("int64")
        elif ptypes.is_float_dtype(s):
            out[c] = s.astype("float64")
        elif ptypes.is_bool_dtype(s):
            out[c] = s.astype(bool)
        else:
            out[c] = s.astype(str)
    return out

def save_table(df: pd.DataFrame, parquet_path: str):
    df2 = deflate_df(df)
    if _HAS_FASTPARQUET:
        df2.to_parquet(parquet_path, index=False, engine="fastparquet")
    else:
        df2.to_pickle(_pkl_path(parquet_path))

def load_table(parquet_path: str) -> pd.DataFrame:
    pkl = _pkl_path(parquet_path)
    if _HAS_FASTPARQUET and os.path.exists(parquet_path):
        return pd.read_parquet(parquet_path, engine="fastparquet")
    if os.path.exists(pkl):
        return pd.read_pickle(pkl)
    raise FileNotFoundError(f"Not found: {parquet_path} or {pkl}")

def save_dict_table(d: dict, parquet_path: str, value_name: str):
    ser = pd.Series(d)
    ser.index = pd.MultiIndex.from_tuples(ser.index, names=["fam_key","season_key"])
    df = ser.to_frame(name=value_name).reset_index()
    save_table(df, parquet_path)

def load_dict_table(parquet_path: str) -> dict:
    df = load_table(parquet_path)
    val_col = df.columns[-1]
    return {(r.fam_key, r.season_key): r[val_col] for _, r in df.iterrows()}

def save_seqs_npz(seqs: dict, path: str):
    np.savez_compressed(path, **{f"{fk}§{sk}": arr for (fk, sk), arr in seqs.items()})

def load_seqs_npz(path: str) -> dict:
    data = np.load(path, allow_pickle=False)
    return {tuple(k.split("§", 1)): data[k] for k in data.files}

# ---------------- your functions using the helpers ----------------
def cache_paths(outdir: str) -> dict:
    cdir = os.path.join(outdir, CACHE_SUBDIR)
    os.makedirs(cdir, exist_ok=True)
    return {
        "dir": cdir,
        "df": os.path.join(cdir, "marks_prepared.parquet"),  # may write .pkl
        "peaks": os.path.join(cdir, "peaks_all.parquet"),
        "lens": os.path.join(cdir, "lens.parquet"),
        "evt": os.path.join(cdir, "evt_used.parquet"),
        "seqs": os.path.join(cdir, "seqs.npz"),
        "manifest": os.path.join(cdir, "manifest.json"),
    }

def save_dict_to_parquet(d: dict, path: str, value_name: str):
    # kept for compatibility
    save_dict_table(d, path, value_name)

def load_dict_from_parquet(path: str) -> dict:
    return load_dict_table(path)

def set_seed_all(seed: int):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

# ------------------ configs -----------------------
def build_defaults() -> SimpleNamespace:
    return SimpleNamespace(
        input=NPY_PATH_DEFAULT,
        outdir=OUT_DIR_DEFAULT,
        seqlen=SEQLEN_MARKS,
        kseasons=K_SEASONS,
        hid_season=HID_SEASON,
        hid_across=HID_ACROSS,
        emb_label=EMB_LABEL,
        epochs=EPOCHS,
        min_train_per_event=MIN_TRAIN_PER_EVENT,
    )

# ============================================================
# %% BLOCK 1 — PRECOMPUTE & CACHE (run this once)
# ============================================================

args = build_defaults()
set_seed_all(SEED)

os.makedirs(args.outdir, exist_ok=True)
paths = cache_paths(args.outdir)

print("🧮 Loading and preparing marks…")
df = load_df(args.input)  # your heavy step (parses times/distances, dates, etc.)
print(f"   rows in df: {len(df):,}")
save_table(df, paths["df"])

print("🏔️  Building family-season peaks…")
peaks_all = build_family_season_peaks(df)
print(f"   rows in peaks_all: {len(peaks_all):,}")
save_table(peaks_all, paths["peaks"])

print(f"📦 Building per-season sequences (seqlen={args.seqlen}) …")
seqs, lens, evt_used = build_mark_sequences_by_family(df, seqlen=args.seqlen)
print(f"   seasons with sequences: {len(seqs):,}")

# ---- save all artifacts ----
print(f"💾 Saving cache → {paths['dir']}")
save_table(df, paths["df"])
save_table(peaks_all, paths["peaks"])
save_dict_table(lens, paths["lens"], "orig_len")
save_dict_table(evt_used, paths["evt"], "event")
save_seqs_npz(seqs, paths["seqs"])
with open(paths["manifest"], "w", encoding="utf-8") as f:
    json.dump({"created_utc": datetime.utcnow().isoformat()+"Z"}, f, indent=2)

KeyboardInterrupt: 

In [None]:
def main(args: SimpleNamespace | None = None):  # <-- set to True to run this block
    args = build_defaults()
    set_seed_all()

    os.makedirs(args.outdir, exist_ok=True)
    paths = cache_paths(args.outdir)

    # ---- load cached artifacts (no recompute!) ----
    print("📦 Loading cache…")
    if not all(os.path.exists(p) for p in [paths["df"], paths["peaks"], paths["lens"], paths["evt"], paths["seqs"]]):
        raise FileNotFoundError(
            f"Cache missing. Run the PRECOMPUTE block first.\n"
            f"Expected: {paths['df']}, {paths['peaks']}, {paths['lens']}, {paths['evt']}, {paths['seqs']}"
        )
    df         = pd.read_parquet(paths["df"])
    peaks_all  = pd.read_parquet(paths["peaks"])
    lens       = load_dict_from_parquet(paths["lens"])
    evt_used   = load_dict_from_parquet(paths["evt"])
    seqs       = load_seqs_npz(paths["seqs"])

    # ---- training loop (unchanged, but using cached data) ----
    device = "cuda" if torch.cuda.is_available() else "cpu"
    genders = sorted(df["Gender"].dropna().unique())
    summary_rows = []

    for gender in genders:
        F_marks = df[df["Gender"]==gender].copy()
        F_peaks = peaks_all[peaks_all["Gender"]==gender].copy()
        if F_marks.empty or F_peaks.empty:
            continue

        built = build_windows_gender_family(F_marks, F_peaks, seqs, lens, evt_used,
                                            k_seasons=args.kseasons)
        if built is None:
            continue
        X, Lk, LAB, y_raw, events, years_t = built   # [N,K,L,F], [N,K], [N,K], [N], [N], [N]

        # ====== TIME-BASED SPLIT (no test) ======
        tr_m = years_t <= 2024
        va_m = years_t == 2025

        if not tr_m.any():
            idx = np.arange(len(X)); np.random.shuffle(idx)
            n=len(idx); a=int(0.8*n)
            tr_m = np.zeros(n,bool); va_m=np.zeros(n,bool)
            tr_m[idx[:a]]=True; va_m[idx[a:]]=True

        # Filter events by TRAIN count (per-event head)
        tr_events, tr_counts = np.unique(events[tr_m], return_counts=True)
        allowed = set(e for e,c in zip(tr_events, tr_counts) if c >= args.min_train_per_event)
        keep = np.array([e in allowed for e in events])
        X, Lk, LAB, y_raw, events, years_t = X[keep], Lk[keep], LAB[keep], y_raw[keep], events[keep], years_t[keep]
        tr_m, va_m = tr_m[keep], va_m[keep]

        if len(allowed) == 0:
            print(f"Skipping gender {gender}: no event with ≥{args.min_train_per_event} train samples")
            continue

        # event ids (per-event heads & per-event standardization)
        evt_list = sorted(list(allowed))
        evt2id = {e:i for i,e in enumerate(evt_list)}
        evt_id = np.array([evt2id[e] for e in events], dtype=np.int64)

        # per-event z-score: fit on TRAIN only
        y_z = y_raw.copy()
        mu_by_evt, sd_by_evt = {}, {}
        for e in evt_list:
            idx_tr = (evt_id==evt2id[e]) & tr_m
            mu = y_raw[idx_tr].mean() if idx_tr.any() else 0.0
            sd = y_raw[idx_tr].std()  if idx_tr.any() else 1.0
            if sd < 1e-6: sd = 1.0
            mu_by_evt[e], sd_by_evt[e] = float(mu), float(sd)
            idx_all = (evt_id==evt2id[e])
            y_z[idx_all] = (y_raw[idx_all] - mu) / sd

        # DataLoaders (train + val only)
        def pack(mask):
            return X[mask], Lk[mask], LAB[mask], evt_id[mask], y_z[mask], y_raw[mask]
        Xtr,Ltr,LABtr,Etr,ytr_z,ytr = pack(tr_m)
        Xva,Lva,LABva,Eva,yva_z,yva = pack(va_m)

        dl_tr = DataLoader(HierDS(Xtr,Ltr,LABtr,Etr,ytr_z), batch_size=BATCH, shuffle=True)
        dl_va = DataLoader(HierDS(Xva,Lva,LABva,Eva,yva_z), batch_size=256)

        # Model
        model = HierGenderFamilies(in_dim=X.shape[-1], hid_season=args.hid_season,
                                   hid_across=args.hid_across, n_events=len(evt_list),
                                   emb_label_dim=args.emb_label).to(device)
        opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
        loss_fn = nn.L1Loss()

        def run_epoch(dl, train=True):
            model.train(train); tot=0; n=0
            for xb, lb, labb, eb, yb in dl:
                xb, lb, labb, eb, yb = xb.to(device), lb.to(device), labb.to(device), eb.to(device), yb.to(device)
                with torch.set_grad_enabled(train):
                    pred = model(xb, lb, labb, eb)
                    loss = loss_fn(pred, yb)
                if train:
                    opt.zero_grad(); loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    opt.step()
                tot += loss.item()*len(xb); n+=len(xb)
            return tot/max(n,1)

        best=1e9; patience=0; best_state=None
        for ep in range(EPOCHS):
            tr = run_epoch(dl_tr, True)
            va = run_epoch(dl_va, False)
            print(f"[Hier-GenderFamilies] {gender} ep {ep:02d}  train L1(z)={tr:.3f}  val L1(z)={va:.3f}")
            if va < best - 1e-3:
                best, patience = va, 0
                best_state = {k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
            else:
                patience += 1
            if patience >= 5: break
        if best_state is not None: model.load_state_dict(best_state)

        # ==== VALIDATION METRICS IN NATIVE UNITS ====
        model.eval()
        preds_z_val, Eva_list = [], []
        with torch.no_grad():
            for xb, lb, labb, eb, yb in dl_va:
                xb, lb, labb, eb = xb.to(device), lb.to(device), labb.to(device), eb.to(device)
                preds_z_val.append(model(xb, lb, labb, eb).cpu().numpy())
                Eva_list.append(eb.cpu().numpy())
        preds_z_val = np.concatenate(preds_z_val) if preds_z_val else np.array([])
        Eva_all = np.concatenate(Eva_list) if Eva_list else np.array([], dtype=np.int64)
        yhat_val = np.empty_like(preds_z_val)
        for eid, e in enumerate(evt_list):
            idx = (Eva_all == eid)
            if idx.any():
                mu, sd = mu_by_evt[e], sd_by_evt[e]
                yhat_val[idx] = preds_z_val[idx]*sd + mu
        overall_val_mae = float(np.mean(np.abs(yhat_val - yva))) if len(yhat_val) else float("nan")

        # per-event MAE (validation)
        per_event_val = []
        for eid, e in enumerate(evt_list):
            idx = (Eva_all == eid)
            if idx.any():
                mae = float(np.mean(np.abs(yhat_val[idx] - yva[idx])))
                per_event_val.append({"event":e, "val_mae":mae, "n":int(idx.sum())})

        fam_dir = os.path.join(args.outdir, f"{safe_slug(gender)}")
        os.makedirs(fam_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(fam_dir,"model.pth"))
        meta = {
            "model_name":"Hier-GenderFamilies",
            "gender":gender,
            "events":evt_list, "event2id": {e:i for i,e in enumerate(evt_list)},
            "mu_by_event":mu_by_evt, "sd_by_event":sd_by_evt,
            "seqlen_marks":args.seqlen, "k_seasons":args.kseasons,
            "hid_season":args.hid_season, "hid_across":args.hid_across, "emb_label":args.emb_label,
            "in_dim":X.shape[-1],
            "created": datetime.utcnow().isoformat() + "Z",
            "val_L1_z": float(best),
            "val_mae_overall_native": overall_val_mae,
            "per_event_val_metrics": per_event_val,
            "test_mae_overall": None,
            "per_event_test_metrics": []
        }
        with open(os.path.join(fam_dir,"meta.json"),"w",encoding="utf-8") as f:
            json.dump(meta,f,indent=2)

        row = {"gender":gender,"overall_val_mae":overall_val_mae,"n_events":len(evt_list)}
        for pe in per_event_val:
            row[f"ValMAE_{pe['event']}"]=pe["val_mae"]
        summary_rows.append(row)

    # save summary
    if summary_rows:
        pd.DataFrame(summary_rows).to_csv(os.path.join(args.outdir,"summary_metrics.csv"), index=False)
        print(f"✅ Saved models → {args.outdir}")
    else:
        print("⚠️ No models trained (insufficient data or filters too strict).")

In [35]:
if __name__ == "__main__":
    main()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["family"] = df.apply(lambda r: event_to_family(r["Event"], r["Gender"]), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ath_key"] = (df["Athlete"].str.lower().str.strip() + "||" + df["Gender"].str.strip())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["fam_key"] = (df["ath_key"] 

KeyboardInterrupt: 