In [None]:
import pandas as pd

# --- 1. Read the DOM + birthdays master and the drafted skill-player list ---
dom_path = "/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/processed/players_dom_with_birthdays.csv"
draft_path = "/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/scraper/skill_draftees_2000_2025.csv"

dom, draft = pd.read_csv(dom_path), pd.read_csv(draft_path)

print("DOM shape:", dom.shape)
print("Drafted skill list shape:", draft.shape)

# --- 2. Pick the DOM column that holds the player name ---
# Candidates are tried in order of preference: "player_name" first, then "player".
dom_name_col = next(
    (candidate for candidate in ("player_name", "player") if candidate in dom.columns),
    None,
)
if dom_name_col is None:
    raise ValueError("Could not find a player name column in DOM file (expected 'player_name' or 'player').")

draft_name_col = "player_name"  # column name produced by the scrape cell

# --- 3. Build the merge key on both frames ---
# The key is the name cast to str with surrounding whitespace stripped; a
# heavier normalizer can be substituted here later.
dom["merge_key"] = dom[dom_name_col].astype(str).str.strip()
draft["merge_key"] = draft[draft_name_col].astype(str).str.strip()

# --- 4. Select the draft metadata to attach (add more columns here if needed) ---
draft_subset = draft[[
    "merge_key",
    "pos",
    "draft_year",
    "team",
    "round",
    "pick_overall",
    "pick_in_round",
    "pfr_player_url",
]]

# The join key is the player name alone, so two draft rows with the same name
# would both match the same DOM row and the left join would emit that DOM row
# twice. Keep one draft row per name (the first in file order) and report the
# dropped rows so the ambiguous names can be reviewed by hand.
dup_mask = draft_subset["merge_key"].duplicated(keep="first")
if dup_mask.any():
    print(
        f"[WARN] {int(dup_mask.sum())} draft rows share a name with an earlier row; "
        "keeping the first match per name:"
    )
    print(sorted(draft_subset.loc[dup_mask, "merge_key"].unique()))
draft_subset = draft_subset.loc[~dup_mask]

# --- 5. Left-join: keep ALL DOM players, attach draft info where available ---
# validate="many_to_one" makes pandas raise if the draft side ever has repeated
# keys again, instead of silently multiplying DOM rows.
merged = dom.merge(
    draft_subset,
    on="merge_key",
    how="left",
    suffixes=("", "_draft"),
    validate="many_to_one",
)

# --- 6. Flag drafted players ---
# True where the merged frame's draft_year is non-null, False elsewhere. The
# astype(bool) cast pins the plain bool dtype explicitly.
merged["drafted"] = merged["draft_year"].notna().astype(bool)

# --- 7. Write the new master file ---
out_path = "players_dom_master_with_draft_flag.csv"
merged.to_csv(out_path, index=False)

print("New master saved to:", out_path)
print("Drafted value counts:")
drafted_counts = merged["drafted"].value_counts(dropna=False)
print(drafted_counts)


In [None]:
import time
from typing import Dict, Tuple, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

# ==========================================================
# ✏️ EDIT THESE PATHS IF NEEDED
# NOTE: only in_path / out_path are bound here. `draft_path` (read by the merge
# cell via pd.read_csv(draft_path)) is deliberately left untouched so that this
# cell cannot repoint it at the output file.
in_path = "/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/scraper/skill_draftees_2000_2025.csv"
out_path = "/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/scraper/skill_draftees_2000_2025_with_round.csv"
# ==========================================================

# Root of the site and the per-year draft page URL; {year} is filled in with
# str.format by fetch_draft_html.
BASE_URL = "https://www.pro-football-reference.com"
DRAFT_URL_TEMPLATE = BASE_URL + "/years/{year}/draft.htm"


def norm_name(name: str) -> str:
    """
    Produce a loose matching key for a player name.

    The name is lowercased, leading/trailing whitespace is trimmed, and every
    period, hyphen, straight apostrophe (') and curly apostrophe (’) is
    deleted. Any value that is not a ``str`` maps to the empty string.
    """
    if not isinstance(name, str):
        return ""
    # Deletion-only translation table: each listed character maps to nothing.
    drop_punctuation = str.maketrans("", "", ".'-’")
    return name.lower().strip().translate(drop_punctuation)


def fetch_draft_html(year: int) -> str:
    """
    Download the draft page for ``year`` and return the response body as text.

    The URL is DRAFT_URL_TEMPLATE with ``year`` substituted. The request sends
    a browser-like User-Agent header and uses a 30 second timeout;
    ``raise_for_status`` turns an error status into an exception.
    """
    response = requests.get(
        DRAFT_URL_TEMPLATE.format(year=year),
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=30,
    )
    response.raise_for_status()
    return response.text


def build_round_map_for_year(year: int) -> Dict[Tuple[int, str], Optional[int]]:
    """
    Scrape one draft class and map each listed player to their draft round.

    Args:
        year: Draft year. Used to fetch the page and as the first element of
            every key in the returned dict.

    Returns:
        Dict keyed by ``(year, norm_name(player name))``. The value is the
        round as an int, or None when the row has no round cell or its text is
        not all digits. The dict is empty when the page has no
        ``<table id="drafts">`` or that table has no ``<tbody>``.
        If two rows normalize to the same name, the later row wins.

    Raises:
        Whatever ``fetch_draft_html`` raises (for example the error produced
        by ``raise_for_status`` on a failed request).
    """
    html = fetch_draft_html(year)
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table", id="drafts")
    round_map: Dict[Tuple[int, str], Optional[int]] = {}

    if table is None:
        print(f"[WARN] No draft table for {year}")
        return round_map

    tbody = table.find("tbody")
    if tbody is None:
        return round_map

    for tr in tbody.find_all("tr"):
        # Skip header rows embedded in tbody
        if "thead" in (tr.get("class") or []):
            continue

        player_td = tr.find("td", {"data-stat": "player"})
        if player_td is None:
            continue

        player_name = player_td.get_text(strip=True)
        if not player_name:
            continue

        # NOTE(review): PFR draft pages appear to publish the round in the
        # row-header cell (<th data-stat="draft_round">) rather than in a
        # <td data-stat="round"> -- confirm against the live markup. The
        # original selector is tried first, so pages that do use it behave
        # exactly as before; the fallback only adds matches.
        round_cell = tr.find("td", {"data-stat": "round"})
        if round_cell is None:
            round_cell = tr.find(["th", "td"], {"data-stat": "draft_round"})

        round_val: Optional[int] = None
        if round_cell is not None:
            txt = round_cell.get_text(strip=True)
            if txt.isdigit():
                round_val = int(txt)

        round_map[(year, norm_name(player_name))] = round_val

    return round_map


# 1) Read the draftee CSV that is going to be enriched
df = pd.read_csv(in_path)
print("Loaded:", in_path, "shape:", df.shape)

# Both columns are needed below: draft_year selects the pages to scrape and
# player_name is the second half of the lookup key.
for required_col in ("draft_year", "player_name"):
    if required_col not in df.columns:
        raise ValueError(f"Expected a '{required_col}' column in your skill_draftees_2000_2025 CSV.")

# 2) Scrape every distinct draft year once and pool the per-year results into
#    one (year, normalized name) -> round dictionary
years = sorted(map(int, df["draft_year"].dropna().unique()))
print("Draft years in file:", years)

global_round_map: Dict[Tuple[int, str], Optional[int]] = {}

for y in years:
    print(f"Scraping draft round data for {y}...")
    year_map = build_round_map_for_year(y)
    print(f"  -> Found {len(year_map)} player entries for {y}")
    global_round_map.update(year_map)
    # Pause between requests so the site is not hammered.
    time.sleep(1.0)

# 3) Apply the mapping to your DataFrame
def lookup_round(row) -> Optional[int]:
    """
    Return the scraped draft round for one DataFrame row.

    Args:
        row: A row exposing "draft_year" and "player_name" by key.

    Returns:
        The value stored in ``global_round_map`` under
        ``(int(draft_year), norm_name(player_name))``, or None when the key is
        absent or the row has no draft year.
    """
    draft_year = row["draft_year"]
    # A missing draft year cannot be converted with int() and can never match
    # a scraped (year, name) key, so treat it as "no round found".
    if pd.isna(draft_year):
        return None
    return global_round_map.get((int(draft_year), norm_name(row["player_name"])), None)

# Look up the round for every row and store it in a new column.
round_values = df.apply(lookup_round, axis=1)
df["round_from_pfr"] = round_values

# 4) Write the enriched CSV
df.to_csv(out_path, index=False)
print("Saved enriched file with 'round_from_pfr' column to:", out_path)

# Preview the first rows of the columns involved in the lookup.
preview_columns = ["player_name", "draft_year", "round_from_pfr"]
print(df[preview_columns].head())
