In [None]:
# dataset_build_player_value.py
# Portable version using pathlib (Mac / Linux / Windows safe)


import numpy as np
import pandas as pd
from pathlib import Path

# ----------------------------
# Project paths (portable, notebook-safe)
# ----------------------------
# Resolve the directory this code lives in: the script's folder when run as a
# file, the current working directory when run interactively (no __file__).
try:
    _anchor_dir = Path(__file__).resolve().parent
except NameError:
    # Running in Jupyter / interactive
    _anchor_dir = Path.cwd()

# The project root is one level above the anchor directory in both cases.
PROJECT_ROOT = _anchor_dir.parent

# Input and output folders; the output folder is created on first use.
DATA_DIR = PROJECT_ROOT.joinpath("Data")
OUT_DIR = PROJECT_ROOT.joinpath("Data_Processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Raw input tables read by the loading step.
PLAYERS_CSV = DATA_DIR.joinpath("players.csv")
VALUATIONS_CSV = DATA_DIR.joinpath("player_valuations.csv")
EVENTS_CSV = DATA_DIR.joinpath("game_events.csv")

# ----------------------------
# Config
# ----------------------------
# Length of each RNN input sequence: the most recent prior rows of the
# per-game event table; shorter histories are zero-padded at the front.
SEQ_LEN_T = 20
# Number of most recent prior per-game rows aggregated (mean / sum) into the
# tabular dataset.
NN_WINDOW_W = 20
# A valuation is skipped when fewer than this many per-game rows precede it.
MIN_PRIOR_GAMES = 3
# Optional cap on the number of samples (random subsample with a fixed seed).
MAX_SAMPLES = None  # set None to keep all
# When True the saved RNN target `y` is log1p(market value); otherwise the raw
# market value. The tabular CSV always carries both y_raw and y_log.
USE_LOG_TARGET = True

# ----------------------------
# Helpers
# ----------------------------
def safe_to_datetime(s):
    """Parse *s* into pandas datetimes without raising on bad input.

    Entries that cannot be parsed become NaT (``errors="coerce"``); no UTC
    conversion is requested (``utc=False``).
    """
    parse_options = {"errors": "coerce", "utc": False}
    return pd.to_datetime(s, **parse_options)

def compute_age_years(dob, ref_date):
    """Return the age in years at *ref_date* for someone born on *dob*.

    The age is the whole-day difference divided by 365.25. NaN is returned
    when either date is missing.
    """
    if pd.notna(dob) and pd.notna(ref_date):
        elapsed = ref_date - dob
        return elapsed.days / 365.25
    return np.nan

def standardize_position(pos):
    """Map a free-text position label onto a coarse position group.

    Returns "UNK" for missing values, one of "GK" / "DEF" / "MID" / "ATT"
    when the upper-cased label contains a known marker (checked in that
    order), and otherwise the first ten characters of the upper-cased label.
    """
    if pd.isna(pos):
        return "UNK"

    label = str(pos).upper()
    if label == "GK" or "GOAL" in label:
        return "GK"

    # Order matters: the first group with a matching marker wins.
    marker_table = (
        ("DEF", ("DEF",)),
        ("MID", ("MID",)),
        ("ATT", ("ATT", "FORW", "WING", "STRIK")),
    )
    for group, markers in marker_table:
        if any(marker in label for marker in markers):
            return group

    return label[:10]

def standardize_foot(foot):
    """Collapse a preferred-foot label into "R", "L", "B" or "UNK".

    Missing values and labels that match none of the known patterns give
    "UNK". Matching is case-insensitive: "right" / "left" must be a prefix,
    "both" may appear anywhere in the label.
    """
    if pd.isna(foot):
        return "UNK"

    text = str(foot).lower()
    for prefix, code in (("right", "R"), ("left", "L")):
        if text.startswith(prefix):
            return code

    return "B" if "both" in text else "UNK"

# Helper: flag rows whose domestic competition is one of the Big-5 leagues
def make_big5_flag(val_df):
    """
    Add an ``is_big5_league`` column (float32, 1.0 / 0.0) to *val_df*.

    A row is flagged when its ``player_club_domestic_competition_id`` is one
    of the Big-5 league codes (England, Spain, Italy, Germany, France):
      GB1 (Premier League), ES1 (LaLiga), IT1 (Serie A), DE1 (Bundesliga), FR1 (Ligue 1)

    The comparison is case-insensitive and missing ids count as "not Big-5".
    The frame is modified in place and also returned. Extend the code list
    below if the dataset uses different competition ids.
    """
    top_league_codes = ["GB1", "ES1", "IT1", "DE1", "FR1"]

    raw_codes = val_df["player_club_domestic_competition_id"]
    normalized_codes = raw_codes.fillna("").astype(str).str.upper()
    in_big5 = normalized_codes.isin(top_league_codes)

    val_df["is_big5_league"] = in_big5.astype(np.float32)
    return val_df

# ----------------------------
# Load data
# ----------------------------
print("Loading CSVs...")
players = pd.read_csv(PLAYERS_CSV)
valuations = pd.read_csv(VALUATIONS_CSV)
events = pd.read_csv(EVENTS_CSV, low_memory=False)

# Parse the date column of each table; unparseable entries become NaT.
for _frame, _date_col in (
    (players, "date_of_birth"),
    (valuations, "date"),
    (events, "date"),
):
    _frame[_date_col] = safe_to_datetime(_frame[_date_col])

# Keep only valuations with a player, a date and a numeric market value,
# ordered chronologically within each player.
valuations = (
    valuations
    .dropna(subset=["player_id", "date", "market_value_in_eur"])
    .assign(
        market_value_in_eur=lambda frame: pd.to_numeric(
            frame["market_value_in_eur"], errors="coerce"
        )
    )
    .dropna(subset=["market_value_in_eur"])
    .sort_values(["player_id", "date"])
    .reset_index(drop=True)
)

# ----------------------------
# Static player features
# ----------------------------
# Static attributes taken from players.csv: height plus normalized foot and
# position-group labels.
players_static = players[
    ["player_id", "height_in_cm", "foot", "position"]
].copy()

players_static["height_in_cm"] = pd.to_numeric(
    players_static["height_in_cm"], errors="coerce"
)
players_static["foot"] = players_static["foot"].apply(standardize_foot)
players_static["pos_group"] = players_static["position"].apply(standardize_position)

# One date of birth per player. De-duplicated like players_static_num below:
# a repeated player_id here would multiply valuation rows (and therefore
# targets) when this table is left-merged onto the valuations.
players_dob = players[["player_id", "date_of_birth"]].drop_duplicates("player_id")

# One-hot encode the categorical labels (columns are prefixed foot_* / pos_*).
static_ohe = pd.get_dummies(
    players_static[["foot", "pos_group"]].fillna("UNK"),
    prefix=["foot", "pos"],
)

# Numeric static table: player_id, height and the one-hot columns, aligned by
# row position and reduced to one row per player.
players_static_num = pd.concat(
    [
        players_static[["player_id", "height_in_cm"]].reset_index(drop=True),
        static_ohe.reset_index(drop=True),
    ],
    axis=1,
).drop_duplicates("player_id")

# ----------------------------
# Event-based per-game features
# ----------------------------
print("Building per-game event features...")

# Work on a copy of the events that have both a date and a game id.
ev = events.dropna(subset=["date", "game_id"]).copy()
ev["game_id"] = pd.to_numeric(ev["game_id"], errors="coerce").astype("Int64")
ev["minute"] = pd.to_numeric(ev["minute"], errors="coerce")

# Boolean row masks per event kind; card colour is read from the free-text
# description (case-insensitive).
desc = ev["description"].fillna("")
event_type = ev["type"]
is_card = event_type.eq("Cards")

is_goal = event_type.eq("Goals")
is_sub = event_type.eq("Substitutions")
is_yellow = is_card & desc.str.contains("Yellow card", case=False)
is_red = is_card & desc.str.contains("Red card", case=False)

def count_events(df, col="player_id", name="count"):
    """Count the rows of *df* per (player, game) pair.

    *col* names the column holding the player id for this event kind; rows
    missing either that id or ``game_id`` are ignored. The result has the
    columns ``player_id``, ``game_id`` and *name* (the count).
    """
    pair_cols = [col, "game_id"]
    complete_pairs = df[pair_cols].dropna()
    occurrences = complete_pairs.groupby(pair_cols).size()
    counted = occurrences.rename(name).reset_index()
    return counted.rename(columns={col: "player_id"})

# Per-(player, game) counts for every event kind. Assists and substitutions
# read the player from a different column than the event's main player_id.
goals = count_events(ev[is_goal], "player_id", "goals")
assists = count_events(ev[is_goal], "player_assist_id", "assists")
yellow = count_events(ev[is_yellow], "player_id", "yellow_cards")
red = count_events(ev[is_red], "player_id", "red_cards")
sub_in = count_events(ev[is_sub], "player_in_id", "sub_in")
sub_out = count_events(ev[is_sub], "player_id", "sub_out")
event_tables = [goals, assists, yellow, red, sub_in, sub_out]

# Date of each game: the earliest event date recorded for it.
game_dates = ev.groupby("game_id")["date"].min().rename("game_date").reset_index()

# Every (player, game) pair that shows up in at least one count table.
key_cols = ["player_id", "game_id"]
pairs = pd.concat(event_tables, axis=0)[key_cols].drop_duplicates()

# One row per pair: game date plus all counts, with 0 where a player had no
# event of that kind in the game.
per_game = pairs.merge(game_dates, on="game_id", how="left")
for event_table in event_tables:
    per_game = per_game.merge(event_table, on=key_cols, how="left")

per_game = (
    per_game
    .fillna(0)
    .sort_values(["player_id", "game_date"])
    .reset_index(drop=True)
)

GAME_FEATURES = [
    "goals",
    "assists",
    "yellow_cards",
    "red_cards",
    "sub_in",
    "sub_out",
]

# ----------------------------
# Build NN + RNN datasets
# ----------------------------
print("Building NN and RNN datasets...")

# Attach each player's date of birth to every valuation row.
val = valuations.merge(players_dob, on="player_id", how="left")

# Age at the valuation date, in years (whole days / 365.25). Vectorized
# datetime arithmetic gives the same values as a row-wise apply, NaN where
# either date is missing, and still yields a Series when the frame is empty.
val["age_years"] = (val["date"] - val["date_of_birth"]).dt.days / 365.25

# Attach static numeric features and the Big-5 league indicator.
val = val.merge(players_static_num, on="player_id", how="left")
val = make_big5_flag(val)

# Targets: raw market value and its log1p transform, both float32.
val["y_raw"] = pd.to_numeric(val["market_value_in_eur"], errors="coerce").astype(np.float32)
val["y_log"] = np.log1p(val["y_raw"])

# Static feature columns: height, age, league flag and every one-hot
# foot_* / pos_* column present after the merge.
static_cols = ["height_in_cm", "age_years", "is_big5_league"] + [
    c for c in val.columns if c.startswith(("foot_", "pos_"))
]


# Accumulators: RNN sequences, static vectors, targets, (player, date) keys
# and the rows of the tabular dataset. All five stay index-aligned.
X_seq, X_static, y_out, meta_rows, nn_rows = [], [], [], [], []

# Per-player views of the per-game table and of the valuation table.
pgroups = {pid: g for pid, g in per_game.groupby("player_id")}
vgroups = {pid: g for pid, g in val.groupby("player_id")}

for pid, vg in vgroups.items():
    gg = pgroups.get(pid)
    if gg is None:
        # No per-game rows for this player, so no features can be built.
        continue

    g_dates = gg["game_date"].to_numpy()
    g_feats = gg[GAME_FEATURES].to_numpy(dtype=np.float32)

    # For each valuation, the number of per-game rows dated strictly before
    # the valuation date (per_game is sorted by date within each player).
    idxs = np.searchsorted(g_dates, vg["date"].to_numpy(), side="left")

    for i, n_before in enumerate(idxs):
        if n_before < MIN_PRIOR_GAMES:
            continue

        # Materialize the valuation row once; every field below reads from it.
        row = vg.iloc[i]
        static_row = row[static_cols]
        valuation_date = row["date"]

        # Most recent SEQ_LEN_T prior rows, zero-padded at the front so every
        # sequence has the same length.
        seq = g_feats[max(0, n_before - SEQ_LEN_T):n_before]
        n_missing = SEQ_LEN_T - seq.shape[0]
        if n_missing > 0:
            padding = np.zeros((n_missing, seq.shape[1]), dtype=np.float32)
            seq = np.vstack([padding, seq])

        # -------- TARGET (aligned with valuation row i) --------
        target_raw = float(row["y_raw"])
        target_log = float(row["y_log"])

        X_seq.append(seq)
        X_static.append(static_row.to_numpy(dtype=np.float32))
        y_out.append(target_log if USE_LOG_TARGET else target_raw)
        meta_rows.append((pid, valuation_date))

        # Tabular row: static features plus mean / sum of each game feature
        # over the most recent NN_WINDOW_W prior rows.
        win = g_feats[max(0, n_before - NN_WINDOW_W):n_before]
        nn_rows.append({
            "player_id": pid,
            "valuation_date": valuation_date,
            "y_raw": target_raw,
            "y_log": target_log,
            **static_row.to_dict(),
            **{f"mean_{f}": win[:, j].mean() for j, f in enumerate(GAME_FEATURES)},
            **{f"sum_{f}": win[:, j].sum() for j, f in enumerate(GAME_FEATURES)},
        })


# Freeze the accumulated lists into float32 arrays.
X_seq, X_static, y_out = (
    np.asarray(collected, dtype=np.float32)
    for collected in (X_seq, X_static, y_out)
)

tabular_df = pd.DataFrame(nn_rows)
meta = pd.DataFrame(meta_rows, columns=["player_id", "valuation_date"])

# Optional downsampling: draw MAX_SAMPLES rows without replacement (fixed
# seed) and apply the same selection to every output so they stay aligned.
n_samples = len(tabular_df)
if MAX_SAMPLES and n_samples > MAX_SAMPLES:
    rng = np.random.default_rng(0)
    idx = rng.choice(n_samples, MAX_SAMPLES, replace=False)
    tabular_df, meta = tabular_df.iloc[idx], meta.iloc[idx]
    X_seq, X_static, y_out = X_seq[idx], X_static[idx], y_out[idx]

# ----------------------------
# Save outputs (CSV + NPZ)
# ----------------------------
# Destination files inside the output directory.
tabular_path = OUT_DIR / "nn_tabular_dataset.csv"
meta_path = OUT_DIR / "meta.csv"
rnn_path = OUT_DIR / "rnn_dataset.npz"

# Tabular features and sample keys as CSV; sequences, static vectors and
# targets together in one compressed NPZ archive.
tabular_df.to_csv(tabular_path, index=False)
meta.to_csv(meta_path, index=False)
np.savez_compressed(rnn_path, X_seq=X_seq, X_static=X_static, y=y_out)

# Report what was written and the resulting shapes.
print("Saved datasets to:", OUT_DIR)
for saved_path in (tabular_path, meta_path, rnn_path):
    print(" -", saved_path)
print("Tabular shape:", tabular_df.shape)
print("RNN X_seq:", X_seq.shape, "X_static:", X_static.shape)


Loading CSVs...
Building per-game event features...
Building NN and RNN datasets...
Saved datasets to: c:\Users\ofurn\Dokumenter\Github\FYSSTK3155\PROJECT 3\Code\processed_player_value
 - c:\Users\ofurn\Dokumenter\Github\FYSSTK3155\PROJECT 3\Code\processed_player_value\nn_tabular_dataset.csv
 - c:\Users\ofurn\Dokumenter\Github\FYSSTK3155\PROJECT 3\Code\processed_player_value\meta.csv
 - c:\Users\ofurn\Dokumenter\Github\FYSSTK3155\PROJECT 3\Code\processed_player_value\rnn_dataset.npz
Tabular shape: (278558, 28)
RNN X_seq: (278558, 20, 6) X_static: (278558, 12)
