# Preprocessing and configuration

In [None]:

import argparse, os, re, json, random, math
from datetime import datetime
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# ------------------ config ------------------
# Module-level defaults. SEQLEN, EPOCHS, HID, MIN_TRAIN_SAMPLES, OUT_DIR_DEFAULT
# and NPY_PATH_DEFAULT are the defaults of the matching command-line options in
# main(); BATCH, LR and SEED are used directly and have no command-line override.

# Number of time steps per input sequence (default of --seqlen and of pad_seq).
SEQLEN = 32
# Maximum number of training epochs per task (default of --epochs).
EPOCHS = 30
# Mini-batch size of the training DataLoader.
BATCH = 128
# Learning rate passed to the AdamW optimizer.
LR = 1e-3
# LSTM hidden size (default of --hid).
HID = 64
# Minimum number of training samples a task needs (default of --min-train).
MIN_TRAIN_SAMPLES = 60   # skip tiny tasks
# Directory that receives per-task model folders and the summary CSV (default of --outdir).
OUT_DIR_DEFAULT = "models_ESS_LSTM"
# Default input .npy file (default of --input).
# NOTE(review): absolute, user-specific Windows path — pass --input on any other machine.
NPY_PATH_DEFAULT = r"C:\Users\danie\OneDrive\Desktop\FlightPhase\FlightPhase\scripts\tfrrs_performances.npy"
# Seed handed to set_seed() at the start of main().
SEED = 42

# ------------------ utils -------------------
def set_seed(seed=42):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    The CUDA generators are seeded as well when CUDA is available.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Canonical names of events whose marks mark_to_numeric() parses as times
# (via parse_time_to_seconds) and flags with is_time=True.
RUN_EVENTS = {
    "60 Meters","100 Meters","200 Meters","400 Meters","800 Meters","1500 Meters",
    "Mile","3000 Meters","5000 Meters","10000 Meters","60 Hurdles","110 Hurdles",
    "400 Hurdles","3000 Steeplechase","DMR","4 x 100 Relay","4 x 400 Relay"
}
# Canonical names of events whose marks mark_to_numeric() parses as distances
# (via parse_distance_to_meters) and flags with is_time=False.
FIELD_EVENTS = {
    "Long Jump","Triple Jump","High Jump","Pole Vault",
    "Shot Put","Discus","Hammer","Javelin","Weight Throw"
}

# Define Clean Up Functions

In [None]:
def canonical_event(e: str) -> str:
    """Map an event label to its canonical name.

    Names already present in RUN_EVENTS or FIELD_EVENTS are returned as-is.
    Otherwise the label is stripped and lower-cased and looked up in a small
    shorthand table ("100m", "5k", ...); labels without an alias are returned
    unchanged.
    """
    for known in (RUN_EVENTS, FIELD_EVENTS):
        if e in known:
            return e

    shorthand = {
        "60m": "60 Meters",
        "100m": "100 Meters",
        "200m": "200 Meters",
        "400m": "400 Meters",
        "800m": "800 Meters",
        "1500m": "1500 Meters",
        "mile": "Mile",
        "3k": "3000 Meters",
        "5k": "5000 Meters",
        "10k": "10000 Meters",
    }
    lookup = (e or "").strip().lower()
    if lookup in shorthand:
        return shorthand[lookup]
    return e

def parse_time_to_seconds(s: str) -> float | None:
    if not s: return None
    sl = s.strip().lower()
    if sl in {"dnf","dq","fs","nt","ns"}: return None
    s = s.replace(" ", "")
    if ":" in s:
        parts = s.split(":")
        try:
            if len(parts)==2: m,sec = int(parts[0]), float(parts[1]); return m*60+sec
            if len(parts)==3: h,m,sec = int(parts[0]),int(parts[1]),float(parts[2]); return h*3600+m*60+sec
        except: return None
    try: return float(s)
    except: return None

def parse_distance_to_meters(s: str) -> float | None:
    if not s: return None
    sl = s.strip().lower()
    if sl in {"nm"}: return None
    if re.match(r"^\d{1,3}-\d{1,2}(\.\d+)?$", sl):  # ft-in
        ft, inch = sl.split("-")
        try:
            total_inches = int(ft)*12 + float(inch)
            return total_inches * 0.0254
        except: return None
    if sl.endswith("m"): sl = sl[:-1]
    try: return float(sl)
    except: return None

def mark_to_numeric(event: str, mark: str) -> tuple[float|None, bool]:
    """Convert a raw mark to ``(value, is_time)``.

    Events in RUN_EVENTS are parsed as times and events in FIELD_EVENTS as
    distances. For any other event the mark is treated as a time when it
    parses as one and either contains a colon or is below 30; otherwise it is
    tried as a distance. An unparseable mark yields ``(None, True)``.
    """
    if event in RUN_EVENTS:
        return parse_time_to_seconds(mark), True
    if event in FIELD_EVENTS:
        return parse_distance_to_meters(mark), False

    # Unknown event: guess the unit from the shape of the mark.
    seconds = parse_time_to_seconds(mark)
    if seconds is not None:
        has_colon = ":" in (mark or "")
        if has_colon or seconds < 30:
            return seconds, True

    meters = parse_distance_to_meters(mark)
    if meters is None:
        return None, True
    return meters, False

def load_df(npy_path: str) -> pd.DataFrame:
    """Load the performances array and return a cleaned DataFrame.

    The array at ``npy_path`` is loaded without pickling and converted with
    ``DataFrame.from_records``. Only the columns listed in ``keep`` are
    retained, cast to stripped strings. The following columns are added:

    * ``date``       - ``MeetDate`` parsed as a datetime (unparseable -> dropped)
    * ``value``      - numeric mark from ``mark_to_numeric`` (unparseable -> dropped)
    * ``is_time``    - flag returned by ``mark_to_numeric``
    * ``key``        - lower-cased athlete + "||" + lower-cased event + "||" + gender
    * ``season_key`` - "<SeasonYear>-<SeasonLabel>"

    Raises:
        KeyError: if one of the required columns is missing from the array.
    """
    arr = np.load(npy_path, allow_pickle=False)
    df = pd.DataFrame.from_records(arr)
    keep = ["Division","School","Gender","SeasonLabel","SeasonYear","Event",
            "Athlete","MarkOrTime","Wind","Meet","MeetDate"]
    df = df[keep].copy()
    for c in keep:
        df[c] = df[c].astype("string").str.strip()
    # Missing events stay missing (and are dropped below) instead of being
    # handed to canonical_event, which truth-tests its argument.
    df["Event"] = df["Event"].map(lambda ev: ev if pd.isna(ev) else canonical_event(ev))
    df["date"] = pd.to_datetime(df["MeetDate"], errors="coerce")
    df = df.dropna(subset=["date","Event","Athlete","SeasonLabel","SeasonYear"])

    # A plain loop instead of a row-wise DataFrame.apply(result_type="expand"):
    # it also works on an empty frame (apply returns a copy of the frame there,
    # without columns 0/1) and avoids building a Series per row.
    parsed = [mark_to_numeric(ev, None if pd.isna(mk) else mk)
              for ev, mk in zip(df["Event"], df["MarkOrTime"])]
    df["value"] = pd.Series([p[0] for p in parsed], index=df.index, dtype="float64")
    df["is_time"] = pd.Series([p[1] for p in parsed], index=df.index, dtype="bool")
    df = df.dropna(subset=["value"])

    df["key"] = df["Athlete"].str.lower().str.strip() + "||" + df["Event"].str.lower().str.strip() + "||" + df["Gender"]
    df["season_key"] = df["SeasonYear"].astype("Int64").astype("string") + "-" + df["SeasonLabel"]
    return df

# Next-Season Peak per Athlete + Event

In [None]:
def agg_peak(g: pd.DataFrame) -> float:
    """Return the best mark of a group.

    The group's first ``is_time`` flag decides the direction: the minimum of
    ``value`` when it is truthy, the maximum otherwise.
    """
    marks = g["value"]
    lower_is_better = g["is_time"].iloc[0]
    if lower_is_better:
        return marks.min()
    return marks.max()

def build_targets(df: pd.DataFrame) -> pd.DataFrame:
    """Pair every (key, season) peak with the same key's peak one year later.

    For each ``(key, season_key)`` group the peak is the minimum ``value``
    when the group's first ``is_time`` flag is true and the maximum otherwise.
    The target ``y_next`` is the peak of the season with the same
    ``SeasonLabel`` and ``SeasonYear + 1``; rows without such a season are
    dropped.

    Returns:
        A DataFrame with the columns ``key``, ``season_key``, ``peak_value``,
        ``SeasonYear``, ``SeasonLabel``, ``Event``, ``Gender``, ``is_time``
        and ``y_next`` (empty, with those columns, when ``df`` is empty).
    """
    base_cols = ["key", "season_key", "peak_value", "SeasonYear",
                 "SeasonLabel", "Event", "Gender", "is_time"]
    if df.empty:
        return pd.DataFrame(columns=base_cols + ["y_next"])

    # One named aggregation yields both candidate peaks and the metadata. This
    # avoids groupby.apply, whose scalar result lands in a column literally
    # named None (and whose layout differs for empty input).
    per_season = (df.groupby(["key", "season_key"], as_index=False)
                    .agg(SeasonYear=("SeasonYear", "first"),
                         SeasonLabel=("SeasonLabel", "first"),
                         Event=("Event", "first"),
                         Gender=("Gender", "first"),
                         is_time=("is_time", "first"),
                         best_low=("value", "min"),
                         best_high=("value", "max")))
    # Times: lower is better; distances/heights: higher is better.
    per_season["peak_value"] = np.where(per_season["is_time"].astype(bool),
                                        per_season["best_low"],
                                        per_season["best_high"])
    peaks = per_season[base_cols].copy()

    # Same label, following year; mirrors the "<year>-<label>" season_key format.
    peaks["next_season_key"] = pd.Series(
        [f"{int(year) + 1}-{label}"
         for year, label in zip(peaks["SeasonYear"], peaks["SeasonLabel"])],
        index=peaks.index, dtype="object")

    following = (peaks[["key", "season_key", "peak_value"]]
                 .rename(columns={"season_key": "next_season_key",
                                  "peak_value": "y_next"}))
    out = (peaks.merge(following, on=["key", "next_season_key"], how="left")
                .drop(columns=["next_season_key"]))
    return out.dropna(subset=["y_next"]).reset_index(drop=True)

# Build Sequences from Raw Marks Up to the End of the Season

In [None]:
def build_sequences(df: pd.DataFrame, seqlen=SEQLEN):
    """Build one feature sequence per ``(key, season_key)`` group.

    Rows are ordered by key and date. Each step holds three float32 features:
    the mark value, the days since the previous mark in the group (0 for the
    first mark, never negative), and the wind reading (0.0 when it is not
    numeric). Groups longer than ``seqlen`` keep only their last ``seqlen``
    steps.

    Returns:
        A dict mapping ``(key, season_key)`` to an array of shape (steps, 3).
    """
    ordered = df.sort_values(["key", "date"]).reset_index(drop=True)

    def make_seq(g):
        marks = g["value"].to_numpy()
        gap_days = g["date"].diff().dt.days.fillna(0).clip(lower=0).to_numpy()
        # Strip the sign prefix and unit so the wind reading parses as a number.
        wind_text = g["Wind"].str.replace("+", "", regex=False)
        wind_text = wind_text.str.replace("m/s", "", regex=False)
        wind = pd.to_numeric(wind_text, errors="coerce").fillna(0.0).to_numpy()
        feats = np.stack([marks, gap_days, wind], axis=1).astype("float32")
        if len(feats) > seqlen:
            return feats[-seqlen:]
        return feats

    return {pair: make_seq(group)
            for pair, group in ordered.groupby(["key", "season_key"])}

def pad_seq(X, L=SEQLEN):
    """Return ``X`` with exactly ``L`` rows.

    Shorter sequences are left-padded with float32 zero rows, so the real
    steps stay at the end. Longer sequences keep only their last ``L`` rows,
    matching the truncation in ``build_sequences``; previously they were
    returned unchanged, which left the result longer than ``L``.

    ``X`` is expected to be a 2-D array (its second dimension sets the pad width).
    """
    n_rows = len(X)
    if n_rows == L:
        return X
    if n_rows > L:
        # Index from the front so that L == 0 yields an empty slice.
        return X[n_rows - L:]
    pad = np.zeros((L - n_rows, X.shape[1]), dtype="float32")
    return np.vstack([pad, X])

class SeqDS(Dataset):
    """Dataset of (sequence, target) pairs held as float32 tensors."""

    def __init__(self, X, y):
        # Both inputs are copied into float32 tensors up front.
        self.X, self.y = (torch.tensor(data, dtype=torch.float32) for data in (X, y))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        features = self.X[i]
        target = self.y[i]
        return features, target

# Architecture

In [None]:
class LSTMReg(nn.Module):
    """LSTM regressor: the last time step's output feeds a LayerNorm + Linear head.

    ``forward`` takes a batch-first input and returns one scalar per sequence.
    """

    def __init__(self, in_dim=3, hid=64, layers=1, dropout=0.1):
        super().__init__()
        # Dropout is only passed to the LSTM when there is more than one layer.
        inter_layer_dropout = dropout if layers > 1 else 0.0
        self.lstm = nn.LSTM(
            in_dim,
            hid,
            num_layers=layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        norm = nn.LayerNorm(hid)
        project = nn.Linear(hid, 1)
        self.head = nn.Sequential(norm, project)

    def forward(self, x):
        step_outputs, _ = self.lstm(x)
        final_step = step_outputs[:, -1, :]
        prediction = self.head(final_step)
        return prediction.squeeze(-1)

def safe_slug(s: str) -> str:
    """Turn ``s`` into a slug of ASCII letters and digits.

    Every run of other characters becomes a single underscore; the result
    has no leading or trailing underscore.
    """
    alnum_runs = re.findall(r"[A-Za-z0-9]+", s)
    return "_".join(alnum_runs)

# Training All Tasks

In [None]:
def _mean_loss(model, loader, loss_fn, device):
    """Return the sample-weighted mean loss of ``model`` over ``loader`` (NaN if empty)."""
    model.eval()
    total, count = 0.0, 0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            total += loss_fn(model(xb), yb).item() * len(xb)
            count += len(xb)
    return total / count if count else float("nan")


def _split_masks(years, min_train):
    """Return (train, val, test) boolean masks, or None when the task is too small.

    The split is by season year (<= 2023 / == 2024 / >= 2025). When that
    leaves fewer than ``min_train`` training samples, a random split is used
    instead; its three parts never overlap.
    """
    train_m, val_m, test_m = years <= 2023, years == 2024, years >= 2025
    if train_m.sum() >= min_train:
        return train_m, val_m, test_m

    n = len(years)
    idx = np.arange(n)
    np.random.shuffle(idx)
    a = max(min_train, int(0.7 * n))
    if a >= n:
        return None
    b = int(0.85 * n)
    if b < a:
        # min_train pushed the train cut past the 85% mark; slicing idx[b:]
        # would then put training samples into the test set. Split what is
        # left after the training part evenly instead.
        b = a + (n - a) // 2
    train_m = np.zeros(n, bool)
    val_m = np.zeros(n, bool)
    test_m = np.zeros(n, bool)
    train_m[idx[:a]] = True
    val_m[idx[a:b]] = True
    test_m[idx[b:]] = True
    return train_m, val_m, test_m


def main(argv=None):
    """Train one LSTM regressor per (event, gender, season label) task.

    Loads the performances, builds next-season targets and per-season input
    sequences, then for every task splits the samples, trains with early
    stopping (patience 5 on the validation MAE), evaluates on the test split
    and writes ``model.pth`` plus ``meta.json`` into a per-task folder. A
    ``summary_metrics.csv`` with all task metadata is written at the end.

    Args:
        argv: Command-line arguments; None means ``sys.argv[1:]``.
            Unrecognized arguments are reported and ignored, so the function
            can also be called from a Jupyter kernel, whose own launch
            arguments would otherwise abort parsing.
    """
    # Local import: only ``datetime`` itself is imported at module level.
    from datetime import timezone

    set_seed(SEED)
    ap = argparse.ArgumentParser()
    ap.add_argument("-i","--input", default=NPY_PATH_DEFAULT)
    ap.add_argument("-o","--outdir", default=OUT_DIR_DEFAULT)
    ap.add_argument("--min-train", type=int, default=MIN_TRAIN_SAMPLES)
    ap.add_argument("--epochs", type=int, default=EPOCHS)
    ap.add_argument("--hid", type=int, default=HID)
    ap.add_argument("--seqlen", type=int, default=SEQLEN)
    args, unknown = ap.parse_known_args(argv)
    if unknown:
        print(f"Ignoring unrecognized arguments: {unknown}")

    os.makedirs(args.outdir, exist_ok=True)
    df = load_df(args.input)
    targets_all = build_targets(df)
    seqs_all = build_sequences(df, args.seqlen)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    metrics = []
    tasks = (targets_all[["Event","Gender","SeasonLabel"]]
             .drop_duplicates().sort_values(["Gender","SeasonLabel","Event"]))
    for event, gender, season in tasks.itertuples(index=False, name=None):
        tmask = ((targets_all["Event"]==event) &
                 (targets_all["Gender"]==gender) &
                 (targets_all["SeasonLabel"]==season))
        T = targets_all[tmask]
        if T.empty:
            continue

        # Collect the padded input sequence, target and season year of every sample.
        X_list, y_list, years = [], [], []
        for key, season_key, y_next, year in zip(T["key"], T["season_key"],
                                                 T["y_next"], T["SeasonYear"]):
            seq = seqs_all.get((key, season_key))
            if seq is None:
                continue
            X_list.append(pad_seq(seq, args.seqlen))
            y_list.append(float(y_next))
            years.append(int(year))
        if not X_list:
            continue

        X = np.stack(X_list)
        y = np.array(y_list, dtype="float32")
        years = np.array(years)

        masks = _split_masks(years, args.min_train)
        if masks is None or not masks[0].any():
            print(f"Skipping {event} | {gender} | {season} "
                  f"(only {len(y)} samples, min-train {args.min_train})")
            continue
        train_m, val_m, test_m = masks

        ds_tr = SeqDS(X[train_m], y[train_m])
        ds_va = SeqDS(X[val_m], y[val_m])
        ds_te = SeqDS(X[test_m], y[test_m])

        dl_tr = DataLoader(ds_tr, batch_size=BATCH, shuffle=True)
        dl_va = DataLoader(ds_va, batch_size=256)
        dl_te = DataLoader(ds_te, batch_size=256)

        model = LSTMReg(in_dim=X.shape[-1], hid=args.hid).to(device)
        opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
        loss_fn = nn.L1Loss()

        # Without validation samples a "validation loss" of 0 would look like
        # a perfect first epoch and freeze the weights there, so early
        # stopping monitors the training loss instead and val_mae stays NaN.
        has_val = len(ds_va) > 0
        best_monitored = float("inf")
        best_val = float("nan")
        best_state = None
        patience = 0
        for ep in range(args.epochs):
            # train
            model.train()
            tot, n = 0.0, 0
            for xb, yb in dl_tr:
                xb, yb = xb.to(device), yb.to(device)
                loss = loss_fn(model(xb), yb)
                opt.zero_grad()
                loss.backward()
                opt.step()
                tot += loss.item() * len(xb)
                n += len(xb)
            tr = tot / max(n, 1)
            # val
            va = _mean_loss(model, dl_va, loss_fn, device)
            print(f"[ESS-LSTM] {gender} | {season} | {event}  ep {ep:02d}  train={tr:.3f}  val={va:.3f}")

            monitored = va if has_val else tr
            if monitored < best_monitored - 1e-3:
                best_monitored, best_val, patience = monitored, va, 0
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            else:
                patience += 1
            if patience >= 5:
                break
        if best_state is not None:
            model.load_state_dict(best_state)

        # test
        te = _mean_loss(model, dl_te, loss_fn, device)

        # save weights + metadata
        subdir = os.path.join(args.outdir, f"{safe_slug(gender)}__{safe_slug(season)}__{safe_slug(event)}")
        os.makedirs(subdir, exist_ok=True)
        weight_path = os.path.join(subdir, "model.pth")
        torch.save(model.state_dict(), weight_path)
        meta = {
            "model_name":"ESS-LSTM",
            "event":event,"gender":gender,"season":season,
            "seqlen":args.seqlen,"hid":args.hid,"in_dim":X.shape[-1],
            "train_n":int(len(ds_tr)),"val_n":int(len(ds_va)),"test_n":int(len(ds_te)),
            "val_mae":float(best_val),"test_mae":float(te),
            # Same "<naive ISO timestamp>Z" format as before, without the
            # deprecated datetime.utcnow().
            "created":datetime.now(timezone.utc).replace(tzinfo=None).isoformat()+"Z"
        }
        with open(os.path.join(subdir,"meta.json"),"w",encoding="utf-8") as f:
            json.dump(meta, f, indent=2)
        metrics.append(meta)

    # write summary CSV
    if metrics:
        pd.DataFrame(metrics).to_csv(os.path.join(args.outdir,"summary_metrics.csv"), index=False)
        print(f"✅ Saved {len(metrics)} ESS-LSTM models → {args.outdir}")
    else:
        print("⚠️ No tasks trained (insufficient data with current thresholds).")

In [None]:
# Entry point: run the training pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()