# Preprocessing and Configuration

In [None]:

import argparse, os, re, json, random, math
from datetime import datetime
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# ------------------ config ------------------
# Default hyperparameters and paths for the training script.
SEQLEN = 32              # default sequence length used when building/padding sequences
EPOCHS = 30              # default number of training epochs
BATCH = 128              # batch size of the training DataLoader
LR = 1e-3                # optimizer learning rate
HID = 64                 # default LSTM hidden size
MIN_TRAIN_SAMPLES = 60   # skip tiny tasks (minimum number of training samples)
OUT_DIR_DEFAULT = "models_ESS_LSTM"              # default output directory for saved models
NPY_PATH_DEFAULT = "tfrrs_performances_fast.npy"  # default input .npy file
SEED = 42                # seed passed to set_seed()

# ------------------ utils -------------------
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    The CUDA generators are seeded as well whenever a CUDA device is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Canonical names of events whose marks are times.
RUN_EVENTS = {
    # flat races
    "60 Meters",
    "100 Meters",
    "200 Meters",
    "400 Meters",
    "800 Meters",
    "1500 Meters",
    "Mile",
    "3000 Meters",
    "5000 Meters",
    "10000 Meters",
    # hurdles / steeplechase
    "60 Hurdles",
    "110 Hurdles",
    "400 Hurdles",
    "3000 Steeplechase",
    # relays
    "DMR",
    "4 x 100 Relay",
    "4 x 400 Relay",
}

# Canonical names of events whose marks are distances/heights.
FIELD_EVENTS = {
    # jumps
    "Long Jump",
    "Triple Jump",
    "High Jump",
    "Pole Vault",
    # throws
    "Shot Put",
    "Discus",
    "Hammer",
    "Javelin",
    "Weight Throw",
}

# Define Clean Up Functions

In [None]:
def canonical_event(e: str) -> str:
    """Map an event label to its canonical name.

    Labels already present in ``RUN_EVENTS`` or ``FIELD_EVENTS`` are returned
    untouched. Otherwise the trimmed, lower-cased label is looked up in a small
    shorthand table ("100m", "5k", ...). Labels without an alias are returned
    exactly as they were passed in.
    """
    already_canonical = e in RUN_EVENTS or e in FIELD_EVENTS
    if already_canonical:
        return e

    shorthand = {
        "60m": "60 Meters",
        "100m": "100 Meters",
        "200m": "200 Meters",
        "400m": "400 Meters",
        "800m": "800 Meters",
        "1500m": "1500 Meters",
        "mile": "Mile",
        "3k": "3000 Meters",
        "5k": "5000 Meters",
        "10k": "10000 Meters",
    }
    normalized = (e or "").strip().lower()
    if normalized in shorthand:
        return shorthand[normalized]
    return e

def parse_time_to_seconds(s: str) -> float | None:
    if not s: return None
    sl = s.strip().lower()
    if sl in {"dnf","dq","fs","nt","ns"}: return None
    s = s.replace(" ", "")
    if ":" in s:
        parts = s.split(":")
        try:
            if len(parts)==2: m,sec = int(parts[0]), float(parts[1]); return m*60+sec
            if len(parts)==3: h,m,sec = int(parts[0]),int(parts[1]),float(parts[2]); return h*3600+m*60+sec
        except: return None
    try: return float(s)
    except: return None

def parse_distance_to_meters(s: str) -> float | None:
    if not s: return None
    sl = s.strip().lower()
    if sl in {"nm"}: return None
    if re.match(r"^\d{1,3}-\d{1,2}(\.\d+)?$", sl):  # ft-in
        ft, inch = sl.split("-")
        try:
            total_inches = int(ft)*12 + float(inch)
            return total_inches * 0.0254
        except: return None
    if sl.endswith("m"): sl = sl[:-1]
    try: return float(sl)
    except: return None

def mark_to_numeric(event: str, mark: str) -> tuple[float|None, bool]:
    """Convert a raw mark to ``(value, is_time)``.

    Known running events are parsed as times and known field events as
    distances. For any other event the mark is first tried as a time and is
    accepted as such when it contains a colon or is below 30; otherwise it is
    tried as a distance. When nothing parses the result is ``(None, True)``.
    """
    if event in RUN_EVENTS:
        return parse_time_to_seconds(mark), True
    if event in FIELD_EVENTS:
        return parse_distance_to_meters(mark), False

    # Unknown event: guess the unit from the mark itself.
    seconds = parse_time_to_seconds(mark)
    if seconds is not None:
        looks_like_time = ":" in (mark or "") or seconds < 30
        if looks_like_time:
            return seconds, True

    meters = parse_distance_to_meters(mark)
    if meters is None:
        return None, True
    return meters, False

def load_df(npy_path: str) -> pd.DataFrame:
    """Load the performance records from ``npy_path`` into a cleaned DataFrame.

    The array is read with pickling disabled, restricted to the columns used
    downstream, and every kept column is converted to a stripped string. Event
    names are canonicalised, ``MeetDate`` is parsed into ``date`` (unparseable
    dates become missing), and rows missing date/event/athlete/season fields
    are dropped. Marks are converted with ``mark_to_numeric`` into ``value`` and
    ``is_time``; rows whose mark could not be parsed are dropped. Finally the
    grouping columns ``key`` (athlete||event||gender) and ``season_key``
    (year-label) are added.
    """
    columns = [
        "Division", "School", "Gender", "SeasonLabel", "SeasonYear", "Event",
        "Athlete", "MarkOrTime", "Wind", "Meet", "MeetDate",
    ]
    records = np.load(npy_path, allow_pickle=False)
    frame = pd.DataFrame.from_records(records)[columns].copy()
    for name in columns:
        frame[name] = frame[name].astype("string").str.strip()

    frame["Event"] = frame["Event"].map(canonical_event)
    frame["date"] = pd.to_datetime(frame["MeetDate"], errors="coerce")
    required = ["date", "Event", "Athlete", "SeasonLabel", "SeasonYear"]
    frame = frame.dropna(subset=required)

    # Row-wise parse; column 0 holds the numeric value, column 1 the is_time flag.
    parsed = frame.apply(
        lambda row: mark_to_numeric(row["Event"], row["MarkOrTime"]),
        axis=1,
        result_type="expand",
    )
    frame["value"] = parsed[0]
    frame["is_time"] = parsed[1]
    frame = frame.dropna(subset=["value"])

    athlete_part = frame["Athlete"].str.lower().str.strip()
    event_part = frame["Event"].str.lower().str.strip()
    frame["key"] = athlete_part + "||" + event_part + "||" + frame["Gender"]

    year_part = frame["SeasonYear"].astype("Int64").astype("string")
    frame["season_key"] = year_part + "-" + frame["SeasonLabel"]
    return frame

# Next-Season Peak per Athlete + Event

In [None]:
def agg_peak(g: pd.DataFrame) -> float:
    """Return the best mark of a group.

    The ``is_time`` flag of the first row decides the direction: the minimum
    ``value`` when it is truthy, the maximum otherwise.
    """
    marks = g["value"]
    lower_is_better = g["is_time"].iloc[0]
    if lower_is_better:
        return marks.min()
    return marks.max()

def build_targets(df: pd.DataFrame) -> pd.DataFrame:
    """Pair each athlete-event season peak with the peak of the following year.

    For every ``(key, season_key)`` group the peak is the minimum ``value``
    when the group's first ``is_time`` flag is truthy and the maximum
    otherwise. The "next season" of ``<year>-<label>`` is ``<year+1>-<label>``
    for the same ``key``.

    Args:
        df: Frame with the columns ``key``, ``season_key``, ``value``,
            ``is_time``, ``SeasonYear``, ``SeasonLabel``, ``Event`` and
            ``Gender``.

    Returns:
        One row per season that has a following season, with the columns
        ``key, season_key, peak_value, SeasonYear, SeasonLabel, Event, Gender,
        is_time, y_next`` (``y_next`` being the next season's peak).
    """
    group_cols = ["key", "season_key"]

    # One named aggregation instead of groupby.apply: no dependence on the
    # implicit, unnamed result column and no per-group Python call.
    peaks = df.groupby(group_cols, as_index=False).agg(
        lowest=("value", "min"),
        highest=("value", "max"),
        SeasonYear=("SeasonYear", "first"),
        SeasonLabel=("SeasonLabel", "first"),
        Event=("Event", "first"),
        Gender=("Gender", "first"),
        is_time=("is_time", "first"),
    )
    lower_is_better = peaks["is_time"].astype(bool).to_numpy()
    peaks["peak_value"] = np.where(
        lower_is_better,
        peaks["lowest"].to_numpy(dtype="float64"),
        peaks["highest"].to_numpy(dtype="float64"),
    )
    peaks = peaks[
        ["key", "season_key", "peak_value",
         "SeasonYear", "SeasonLabel", "Event", "Gender", "is_time"]
    ].copy()

    # Same "<year>-<label>" layout as season_key, shifted by one year.
    peaks["next_season_key"] = [
        f"{int(year) + 1}-{label}"
        for year, label in zip(peaks["SeasonYear"], peaks["SeasonLabel"])
    ]

    following = peaks[["key", "season_key", "peak_value"]].rename(
        columns={"season_key": "next_season_key", "peak_value": "y_next"}
    )
    out = peaks.merge(following, on=["key", "next_season_key"], how="left")
    out = out.drop(columns=["next_season_key"])
    # Seasons without a following season have no target and are removed.
    return out.dropna(subset=["y_next"]).reset_index(drop=True)

# Build Sequences From Raw Marks Up to the End of the Season

In [None]:
def build_sequences(df: pd.DataFrame, seqlen=SEQLEN):
    """Build one feature sequence per ``(key, season_key)`` group.

    Rows are ordered by ``key`` then ``date``. Each row of a sequence holds
    three float32 features: the mark ``value``, the number of days since the
    previous row of the group (0 for the first row, never negative), and the
    wind reading (``+`` and ``m/s`` stripped; unparseable or missing -> 0.0).
    Sequences longer than ``seqlen`` keep only their last ``seqlen`` rows.

    Returns:
        dict mapping ``(key, season_key)`` to an array of shape (rows, 3).
    """
    ordered = df.sort_values(["key", "date"]).reset_index(drop=True)

    def _features(group):
        wind_text = group["Wind"].str.replace("+", "", regex=False)
        wind_text = wind_text.str.replace("m/s", "", regex=False)
        wind = pd.to_numeric(wind_text, errors="coerce").fillna(0.0).to_numpy()

        gap_days = group["date"].diff().dt.days.fillna(0).clip(lower=0).to_numpy()
        marks = group["value"].to_numpy()

        feats = np.stack([marks, gap_days, wind], axis=1).astype("float32")
        if len(feats) > seqlen:
            return feats[-seqlen:]
        return feats

    return {
        (athlete_key, season): _features(group)
        for (athlete_key, season), group in ordered.groupby(["key", "season_key"])
    }

def pad_seq(X, L=SEQLEN):
    """Return ``X`` as a sequence of exactly ``L`` rows.

    Shorter inputs are left-padded with float32 zero rows so the real data
    sits at the end of the sequence. Inputs that already have ``L`` rows are
    returned as-is, and longer inputs are cut down to their last ``L`` rows so
    that results can always be stacked into one array.

    Args:
        X: 2-D array of shape (rows, features).
        L: Target number of rows.
    """
    n_rows = len(X)
    if n_rows == L:
        return X
    if n_rows > L:
        # Keep the most recent rows; X[n_rows - L:] also behaves for L == 0.
        return X[n_rows - L:]
    pad = np.zeros((L - n_rows, X.shape[1]), dtype="float32")
    return np.vstack([pad, X])

class FamDS(Dataset):
    """In-memory dataset of (sequence, event id, standardized target) triples."""

    def __init__(self, X, evt_ids, y_z):
        # Everything is converted to tensors once, up front.
        self.X = torch.tensor(X, dtype=torch.float32)
        self.evt = torch.tensor(evt_ids, dtype=torch.long)
        self.y = torch.tensor(y_z, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return tuple(field[i] for field in (self.X, self.evt, self.y))

# Architecture

In [None]:
class SharedEncoderPerEventHeads(nn.Module):
    """One LSTM encoder shared by all events, with a small output head per event.

    Each head is ``LayerNorm -> Linear(hid, 1)`` applied to the LSTM output at
    the last time step.
    """

    def __init__(self, in_dim, hid, n_events):
        super().__init__()
        self.lstm = nn.LSTM(in_dim, hid, batch_first=True)
        per_event = []
        for _ in range(n_events):
            per_event.append(nn.Sequential(nn.LayerNorm(hid), nn.Linear(hid, 1)))
        self.heads = nn.ModuleList(per_event)

    def forward(self, x, evt_id):
        """Return one prediction per batch row, using the head given by ``evt_id``."""
        hidden_seq, _ = self.lstm(x)
        final_step = hidden_seq[:, -1, :]
        preds = torch.empty(x.size(0), device=x.device, dtype=torch.float32)
        # Process the batch one event at a time so each head sees only its rows.
        for head_idx in torch.unique(evt_id).tolist():
            rows = evt_id == head_idx
            preds[rows] = self.heads[head_idx](final_step[rows]).squeeze(-1)
        return preds

def safe_slug(s: str) -> str:
    """Turn ``s`` into a filesystem-friendly slug.

    Every run of characters other than ASCII letters and digits becomes a
    single underscore; leading and trailing underscores are removed.
    """
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", s)
    return collapsed.strip("_")

# Training All Families

In [None]:
def main():
    """Train one multi-head LSTM per (Gender, SeasonLabel) family and save it.

    Steps:
      1. Parse CLI options, load the data, build next-season targets and
         per-season input sequences.
      2. For every family, align sequences with targets, split by season year
         (<=2023 train, 2024 validation, >=2025 test; random 70/15/15 split if
         there is no training year), and keep only events with at least
         ``--min-train-per-event`` training samples.
      3. Standardize targets per event with training statistics, train with
         L1 loss and early stopping (patience 5), and evaluate the test split
         in native units.
      4. Write ``model.pth`` and ``meta.json`` per family and a
         ``summary_metrics.csv`` for all trained families into the output
         directory.
    """
    # Only `datetime` is imported at file level; timezone is needed for an
    # aware "now" (datetime.utcnow() is deprecated).
    from datetime import timezone

    set_seed(SEED)
    ap = argparse.ArgumentParser()
    ap.add_argument("-i","--input", default=NPY_PATH_DEFAULT)
    ap.add_argument("-o","--outdir", default=OUT_DIR_DEFAULT)
    ap.add_argument("--min-train-per-event", type=int, default=MIN_TRAIN_SAMPLES)
    ap.add_argument("--epochs", type=int, default=EPOCHS)
    ap.add_argument("--hid", type=int, default=HID)
    ap.add_argument("--seqlen", type=int, default=SEQLEN)
    # parse_known_args tolerates the extra arguments a notebook kernel puts in
    # sys.argv (e.g. "-f <connection file>") instead of exiting.
    args, unknown = ap.parse_known_args()
    if unknown:
        print(f"Ignoring unrecognized arguments: {unknown}")

    os.makedirs(args.outdir, exist_ok=True)
    df = load_df(args.input)
    targets_all = build_targets(df)
    seqs_all = build_sequences(df, args.seqlen)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    families = (targets_all[["Gender","SeasonLabel"]].drop_duplicates()
                .sort_values(["Gender","SeasonLabel"]))

    family_metrics = []
    for _, fam in families.iterrows():
        gender, season = fam["Gender"], fam["SeasonLabel"]
        F = targets_all[(targets_all["Gender"]==gender) & (targets_all["SeasonLabel"]==season)].copy()
        if F.empty:
            continue

        # Build aligned dataset: one padded sequence per target row.
        Xs, ys, years, events = [], [], [], []
        for _, r in F.iterrows():
            seq = seqs_all.get((r["key"], r["season_key"]))
            if seq is None:
                continue
            Xs.append(pad_seq(seq, args.seqlen))
            ys.append(float(r["y_next"]))
            years.append(int(r["SeasonYear"]))
            events.append(r["Event"])
        if not Xs:
            continue

        X = np.stack(Xs)
        y = np.array(ys, dtype="float32")
        years = np.array(years)
        events = np.array(events)

        # time-based split (fallback randomized if empty)
        train_m = years <= 2023
        val_m = years == 2024
        test_m = years >= 2025
        if not train_m.any():
            idx = np.arange(len(X))
            np.random.shuffle(idx)
            n = len(idx)
            a = int(0.7*n)
            b = int(0.85*n)
            train_m = np.zeros(n, bool)
            val_m = np.zeros(n, bool)
            test_m = np.zeros(n, bool)
            train_m[idx[:a]] = True
            val_m[idx[a:b]] = True
            test_m[idx[b:]] = True

        # include only events with enough TRAIN samples
        tr_events, tr_counts = np.unique(events[train_m], return_counts=True)
        allowed = {e for e, c in zip(tr_events, tr_counts) if c >= args.min_train_per_event}
        if not allowed:
            print(f"Skipping family {gender} | {season} (no event has ≥{args.min_train_per_event} train samples)")
            continue

        keep = np.array([e in allowed for e in events])
        X, y, years, events = X[keep], y[keep], years[keep], events[keep]
        train_m, val_m, test_m = train_m[keep], val_m[keep], test_m[keep]

        evt_list = sorted(allowed)
        evt2id = {e: i for i, e in enumerate(evt_list)}
        evt_id = np.array([evt2id[e] for e in events], dtype=np.int64)

        # per-event z-score (fit on TRAIN, applied to every split)
        y_z = y.copy()
        mu_by_evt, sd_by_evt = {}, {}
        for e, eid in evt2id.items():
            in_event = (evt_id == eid)
            fit_rows = in_event & train_m
            mu = y[fit_rows].mean() if fit_rows.any() else 0.0
            sd = y[fit_rows].std() if fit_rows.any() else 1.0
            if sd < 1e-6:
                sd = 1.0  # constant targets: avoid dividing by ~0
            mu_by_evt[e], sd_by_evt[e] = float(mu), float(sd)
            y_z[in_event] = (y[in_event] - mu) / sd

        # datasets/loaders (module-level FamDS; no per-iteration redefinition)
        train_dl = DataLoader(FamDS(X[train_m], evt_id[train_m], y_z[train_m]), batch_size=BATCH, shuffle=True)
        val_dl   = DataLoader(FamDS(X[val_m], evt_id[val_m], y_z[val_m]), batch_size=256)
        test_dl  = DataLoader(FamDS(X[test_m], evt_id[test_m], y_z[test_m]), batch_size=256)
        has_val = bool(val_m.any())

        # model (module-level SharedEncoderPerEventHeads)
        model = SharedEncoderPerEventHeads(in_dim=X.shape[-1], hid=args.hid, n_events=len(evt_list)).to(device)
        opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
        loss_fn = nn.L1Loss()

        def run_epoch(dl, train=True):
            """Run one pass over ``dl``; return the sample-weighted mean loss (NaN if empty)."""
            model.train(train)
            tot = 0.0
            n = 0
            for xb, eb, yb in dl:
                xb, eb, yb = xb.to(device), eb.to(device), yb.to(device)
                with torch.set_grad_enabled(train):
                    pred = model(xb, eb)
                    loss = loss_fn(pred, yb)
                if train:
                    opt.zero_grad()
                    loss.backward()
                    opt.step()
                tot += loss.item()*len(xb)
                n += len(xb)
            return tot/n if n else float("nan")

        best = 1e9
        patience = 0
        best_state = None
        for ep in range(args.epochs):  # honour --epochs
            tr = run_epoch(train_dl, True)
            va = run_epoch(val_dl, False)
            print(f"[FSMH-LSTM] {gender} | {season} ep {ep:02d}  train L1(z)={tr:.3f}  val L1(z)={va:.3f}")
            # With no validation samples there is nothing to select on, so the
            # training loss drives checkpointing/early stopping instead of a
            # meaningless constant.
            monitored = va if has_val else tr
            if monitored < best - 1e-3:
                best, patience = monitored, 0
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            else:
                patience += 1
            if patience >= 5:
                break
        if best_state is not None:
            model.load_state_dict(best_state)

        # evaluate on TEST in native units
        model.eval()
        preds_z = []
        with torch.no_grad():
            for xb, eb, _ in test_dl:
                xb, eb = xb.to(device), eb.to(device)
                preds_z.append(model(xb, eb).cpu().numpy())
        preds_z = np.concatenate(preds_z) if preds_z else np.array([])
        y_test = y[test_m]
        e_test = evt_id[test_m]

        # de-standardize per event and collect per-event MAE
        yhat = np.empty_like(preds_z)
        per_event = []
        for e, eid in evt2id.items():
            rows = (e_test == eid)
            if not rows.any():
                continue
            yhat[rows] = preds_z[rows]*sd_by_evt[e] + mu_by_evt[e]
            mae = float(np.mean(np.abs(yhat[rows] - y_test[rows])))
            per_event.append({"event": e, "test_mae": mae, "n": int(rows.sum())})

        overall_mae = float(np.mean(np.abs(yhat - y_test))) if len(yhat) else float("nan")

        # save family model
        fam_dir = os.path.join(args.outdir, f"{safe_slug(gender)}__{safe_slug(season)}")
        os.makedirs(fam_dir, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(fam_dir,"model.pth"))
        # Same "YYYY-MM-DDTHH:MM:SS.ffffffZ" layout the naive UTC timestamp produced.
        created = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()+"Z"
        meta = {
            "model_name":"FSMH-LSTM",
            "gender":gender,"season":season,
            "events":evt_list,
            "event2id":evt2id,
            "mu_by_event":mu_by_evt,
            "sd_by_event":sd_by_evt,
            "hid":args.hid,"seqlen":args.seqlen,"in_dim":int(X.shape[-1]),
            "created":created,
            # null when the family had no validation samples
            "val_L1_z":float(best) if has_val else None,
            "test_mae_overall":overall_mae,
            "per_event_metrics":per_event
        }
        with open(os.path.join(fam_dir,"meta.json"),"w",encoding="utf-8") as f:
            json.dump(meta, f, indent=2)

        # accumulate for global summary
        row = {"gender":gender,"season":season,"overall_test_mae":overall_mae,"n_events":len(evt_list)}
        for pe in per_event:
            row[f"MAE_{pe['event']}"] = pe["test_mae"]
        family_metrics.append(row)

    # write summary CSV
    if family_metrics:
        pd.DataFrame(family_metrics).to_csv(os.path.join(args.outdir,"summary_metrics.csv"), index=False)
        print(f"✅ Saved FSMH-LSTM families → {args.outdir}")
    else:
        print("⚠️ No families trained (insufficient data with current thresholds).")

In [None]:
# Run the training pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()