# In [None]:
import pandas as pd
from pathlib import Path
import re

# ----------------------------------------------------------
# Project locations (every path below hangs off base_dir)
# ----------------------------------------------------------
base_dir = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Directory that holds the dom_master_{year}.csv files.
dom_dir = base_dir.joinpath("data", "processed")

# Input CSVs read by this script.
skill_path = base_dir.joinpath("data", "scraper", "skill_draftees_2000_2025.csv")
birthdays_path = dom_dir.joinpath("skill_draftee_birthdays.csv")

# Destination CSV written at the end of the script.
out_path = dom_dir.joinpath("drafted_dom_master_with_birthdays.csv")
# ----------------------------------------------------------

# Logical field name -> actual column header in the dom_master files.
# Edit the right-hand values if your files use different headers.
DOM_COLS = dict(
    year="year",           # season year column
    player="player_name",  # or "player" depending on your schema
    DOM="DOM",
    DOM_plus="DOM+",
    PDOM="PDOM",
    PDOM_plus="PDOM+",
    RDOM="RDOM",
    RDOM_plus="RDOM+",
)

def normalize_name(name: str) -> str:
    """Return a lowercase, punctuation-free key used to match player names.

    The same normalization is applied to every dataset so that spelling
    variants such as "A.J. Brown" / "AJ Brown" or "D'Andre" / "D’Andre"
    produce the same key.

    Args:
        name: Raw player name. Any non-string value (``None``, NaN, numbers)
            is treated as a missing name.

    Returns:
        The normalized key, or ``""`` when ``name`` is not a string.
    """
    if not isinstance(name, str):
        return ""
    key = name.lower()
    removable = (
        # A UTF-8 curly apostrophe that was mis-decoded as cp1252 arrives as
        # this three-character sequence; strip it as a unit before the
        # single characters.
        "\u00e2\u20ac\u2122",
        ".",
        "'",
        "\u2019",  # right single quotation mark (typographic apostrophe)
        "\u2018",  # left single quotation mark
        "`",
        "-",
        ",",
    )
    for token in removable:
        key = key.replace(token, "")
    # Collapse whitespace runs, then trim: removing punctuation can leave
    # leading/trailing blanks behind (e.g. "Chase ." -> "chase ").
    return re.sub(r"\s+", " ", key).strip()

# -------------------------------
# 1) Load skill draftees
# -------------------------------
skill = pd.read_csv(skill_path)
if "player_name" not in skill.columns:
    raise ValueError("Expected 'player_name' in skill_draftees_2000_2025.csv")

# Pass the raw values straight to normalize_name. Casting with astype(str)
# first would turn a missing name (NaN) into the literal text "nan", which
# then behaves like a real player key in the merges further down.
# normalize_name returns "" for any non-string value instead.
skill["merge_key"] = skill["player_name"].map(normalize_name)

print("Skill draftees shape:", skill.shape)

# -------------------------------
# 2) Load & combine dom_master_{year}.csv files
# -------------------------------
dom_files = sorted(dom_dir.glob("dom_master_*.csv"))
if len(dom_files) == 0:
    raise FileNotFoundError(f"No dom_master_*.csv found in {dom_dir}")

year_col = DOM_COLS["year"]
dom_frames = []
for csv_path in dom_files:
    season_df = pd.read_csv(csv_path)
    # Remember which file each row came from.
    season_df["source_file"] = csv_path.name

    # Fall back to the four-digit year embedded in the file name when the
    # file itself carries no year column.
    if year_col not in season_df.columns:
        year_match = re.search(r"dom_master_(\d{4})", csv_path.name)
        if year_match is not None:
            season_df[year_col] = int(year_match.group(1))

    dom_frames.append(season_df)

# Stack every file into one frame with a fresh 0..n-1 index.
dom_all = pd.concat(dom_frames, ignore_index=True)
print("Combined dom_master shape:", dom_all.shape)

# -------------------------------
# 3) Sanity: ensure required columns exist
# -------------------------------
# Every header named in DOM_COLS must be present in the combined frame.
available_cols = set(dom_all.columns)
missing = [col for col in DOM_COLS.values() if col not in available_cols]
if len(missing) > 0:
    raise ValueError(f"These expected DOM columns are missing from dom_master files: {missing}")

# -------------------------------
# 4) Normalize names & sort seasons
# -------------------------------
# Pass the raw values straight to normalize_name: astype(str) would turn a
# missing name (NaN) into the literal text "nan", and normalize_name already
# maps non-string values to "".
dom_all["merge_key"] = dom_all[DOM_COLS["player"]].map(normalize_name)

# Rows without a usable name cannot be matched to a draftee; left in, they
# would be pooled into one bogus "player" and numbered as its seasons.
dom_all = dom_all[dom_all["merge_key"] != ""].copy()

# Order each player's rows by year, then number them 1, 2, 3, ...
dom_all = dom_all.sort_values(["merge_key", DOM_COLS["year"]])
dom_all["season_idx"] = dom_all.groupby("merge_key").cumcount() + 1

# keep only first 4 seasons
dom_all = dom_all[dom_all["season_idx"] <= 4].copy()

# -------------------------------
# 5) Collapse per-player seasons into wide format
# -------------------------------
def flatten_player(group: pd.DataFrame, cols=None, max_seasons: int = 4) -> pd.Series:
    """Collapse one player's season rows into a single wide record.

    The returned Series always has the same index, in this order:
    ``player_name``, then for each season ``i`` in ``1..max_seasons``:
    ``Year{i}, DOM{i}, DOM+{i}, PDOM{i}, PDOM+{i}, RDOM{i}, RDOM+{i}``.
    Seasons the player does not have are filled with NaN. Keeping the index
    identical for every player matters: ``groupby(...).apply`` only assembles
    a wide DataFrame when all returned Series share the same index.

    Args:
        group: Rows for a single player. Must contain a ``season_idx`` column
            (1-based season number) plus the columns named in ``cols``.
        cols: Mapping from logical field name (``"year"``, ``"player"``,
            ``"DOM"``, ``"DOM_plus"``, ``"PDOM"``, ``"PDOM_plus"``,
            ``"RDOM"``, ``"RDOM_plus"``) to the actual column header.
            Defaults to the module-level ``DOM_COLS``.
        max_seasons: Number of season slots in the output. Rows whose
            ``season_idx`` falls outside ``1..max_seasons`` are ignored.

    Returns:
        A Series holding the wide record described above.

    Raises:
        IndexError: If ``group`` has no rows.
    """
    cols = DOM_COLS if cols is None else cols

    # (output label prefix, key into ``cols``) for each per-season metric.
    metrics = (
        ("DOM", "DOM"),
        ("DOM+", "DOM_plus"),
        ("PDOM", "PDOM"),
        ("PDOM+", "PDOM_plus"),
        ("RDOM", "RDOM"),
        ("RDOM+", "RDOM_plus"),
    )

    # Use the "nicest" player_name (first occurrence)
    row = {"player_name": group.iloc[0][cols["player"]]}

    # Pre-fill every season slot so the schema does not depend on how many
    # seasons this particular player has.
    absent = float("nan")
    for idx in range(1, max_seasons + 1):
        row[f"Year{idx}"] = absent
        for label, _ in metrics:
            row[f"{label}{idx}"] = absent

    for _, season in group.iterrows():
        idx = int(season["season_idx"])
        if idx < 1 or idx > max_seasons:
            continue
        row[f"Year{idx}"] = season[cols["year"]]
        for label, field in metrics:
            row[f"{label}{idx}"] = season[cols[field]]

    return pd.Series(row)

# Build one record per player and let the DataFrame constructor align them.
# groupby(...).apply(flatten_player) only yields a wide frame when every
# returned Series has an identical index; otherwise pandas stacks the results
# into one long Series. Building the frame from per-player dicts gives one row
# per player regardless, with absent fields filled as NaN.
wide_rows = {
    key: flatten_player(player_rows).to_dict()
    for key, player_rows in dom_all.groupby("merge_key")
}
dom_wide = pd.DataFrame(
    list(wide_rows.values()),
    index=pd.Index(list(wide_rows), name="merge_key"),
).reset_index()
print("dom_wide shape:", dom_wide.shape)

# -------------------------------
# 6) Merge skill draftees with dom_wide (keep drafted players only)
# -------------------------------
# Left join keeps every draftee row; columns that clash with skill's own
# are taken from dom_wide with a "_dom" suffix.
merged = pd.merge(
    skill,
    dom_wide,
    how="left",
    on="merge_key",
    suffixes=("", "_dom"),
)

# -------------------------------
# 7) Attach birth_day from the birthdays file (birthdays_path)
# -------------------------------
bd = pd.read_csv(birthdays_path)
# Name the file that was actually read in any error raised below.
bd_file = birthdays_path.name

# find name column in birthdays file
bd_name_col = next((c for c in ("player_name", "player") if c in bd.columns), None)
if bd_name_col is None:
    raise ValueError(f"Could not find a player-name column in {bd_file}")

# find birth column (first candidate present wins)
birth_candidates = ["birth_day", "birthday", "birth_date", "birthdate"]
birth_col = next((c for c in birth_candidates if c in bd.columns), None)
if birth_col is None:
    raise ValueError(f"Could not find a birth column in {bd_file}; looked for {birth_candidates}")

# Pass raw values to normalize_name: astype(str) would turn a missing name
# (NaN) into the literal key "nan"; normalize_name maps it to "" instead.
bd["merge_key"] = bd[bd_name_col].map(normalize_name)

# Keep one birthday per key. Rows with no usable name or no birthday add
# nothing to a left join, and dropping them first stops an empty duplicate
# from shadowing a later row that does carry the birthday.
bd_small = bd.loc[bd["merge_key"] != "", ["merge_key", birth_col]]
bd_small = bd_small.dropna(subset=[birth_col])
bd_small = bd_small.drop_duplicates("merge_key").rename(columns={birth_col: "birth_day"})

merged = merged.merge(bd_small, on="merge_key", how="left")

# -------------------------------
# 8) Final column ordering
# -------------------------------
# Per-season fields, repeated for seasons 1 through 4 in this order.
season_fields = ("Year", "DOM", "DOM+", "PDOM", "PDOM+", "RDOM", "RDOM+")
desired_cols = (
    ["player_name"]
    + [f"{field}{season}" for season in range(1, 5) for field in season_fields]
    + ["birth_day"]
)

# Later seasons may have no column at all, so keep only what is present.
final_cols = [col for col in desired_cols if col in merged.columns]
final = merged.loc[:, final_cols].copy()

# -------------------------------
# 9) Save
# -------------------------------
# Create the output directory if it does not exist yet, then write the CSV
# without the index column.
output_dir = out_path.parent
output_dir.mkdir(parents=True, exist_ok=True)
final.to_csv(out_path, index=False)

print("Final shape:", final.shape)
print("Saved to:", out_path)
# Trailing expression: shows a preview when run as a notebook cell.
final.head()
