### Running `birthdays_only.py`

This cell runs the `birthdays_only.py` script as a subprocess.  
It takes the input CSV (`skill_draftees_2000_2025.csv`) and generates  
an output CSV (`skill_draftee_birthdays.csv`).  

The command is built automatically, executed, and the script’s  
stdout and stderr are printed for easy debugging.


In [None]:
import subprocess
import sys
from pathlib import Path

# Path to your project root
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Path to the script
SCRIPT = ROOT / "src" / "scrapers" / "birthdays_only.py"

# Input and output file paths
INPUT_CSV = ROOT / "data" / "scraper" / "skill_draftees_2000_2025.csv"
OUTPUT_CSV = ROOT / "data" / "scraper" / "skill_draftee_birthdays.csv"

# Build the CLI command. sys.executable is the interpreter executing this
# cell, so the script runs in the same Python environment.
cmd = [
    sys.executable,
    str(SCRIPT),
    "--in", str(INPUT_CSV),
    "--out", str(OUTPUT_CSV),
]

print("Running:", " ".join(cmd))

# Capture both streams as text so they can be echoed below.
result = subprocess.run(cmd, capture_output=True, text=True)

print("\n--- STDOUT ---")
print(result.stdout)

print("\n--- STDERR ---")
print(result.stderr)

# Surface a failed run instead of reporting success unconditionally:
# raises subprocess.CalledProcessError when the exit status is non-zero.
# The captured output has already been printed above for debugging.
result.check_returncode()

print("\nFinished.")


### Extracting Players With Missing Birthdays

This cell loads the processed birthday file  
(`skill_draftee_birthdays.csv`) and keeps only the players who still  
do not have a `birth_date` value.

It displays the total count of missing birthdays, shows a preview, and  
saves the filtered list to:

`data/scrapers/skill_draftees_missing_birthdays.csv`

This file is then used for the Wikipedia fallback scraper.


In [None]:
import subprocess
import sys
from pathlib import Path
import pandas as pd

# Path to your project root
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Load the birthday output file.
# The scraper cell writes skill_draftee_birthdays.csv to data/scraper/, and the
# merge cell reads it from there, so this cell uses the same location.
birthday_path = ROOT / "data" / "scraper" / "skill_draftee_birthdays.csv"
df = pd.read_csv(birthday_path)

# Keep only the rows whose birth_date is missing (NaN after CSV parsing)
missing = df[df["birth_date"].isna()].copy()

print(f"Total players missing birthdays: {len(missing)}")
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(missing.head())

# The Wikipedia fallback cell reads this file from data/scraper/, so write it there.
missing_path = ROOT / "data" / "scraper" / "skill_draftees_missing_birthdays.csv"
missing_path.parent.mkdir(parents=True, exist_ok=True)
missing.to_csv(missing_path, index=False)

print("Saved missing list to:", missing_path)


### Scraping Missing Player Birthdays From Wikipedia

This cell loads the list of players who still have missing birthdays
after the primary scraping step. For each player, it:

1. Searches Wikipedia using the API
2. Checks the top matching pages
3. Looks for a `<span class="bday">YYYY-MM-DD</span>` field
4. Records the birthday and the source URL if found

A small delay is added between requests to avoid overloading Wikipedia.

Results are saved to:

`data/scraper/skill_draftees_missing_birthdays_wiki.csv`

The output includes:
- wiki_birth_date (if found)
- wiki_source_url


In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import time

# ====== CONFIG ======
# Project root; every path below is resolved relative to it.
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Input: players still lacking a birthday.
missing_csv = ROOT.joinpath("data", "scraper", "skill_draftees_missing_birthdays.csv")
# Output: the same rows plus the Wikipedia lookup columns.
out_csv = ROOT.joinpath("data", "scraper", "skill_draftees_missing_birthdays_wiki.csv")

# Endpoint used for the search query.
WIKI_API = "https://en.wikipedia.org/w/api.php"

# Headers sent with every request made by this cell.
REQUEST_HEADERS = {
    "User-Agent": "ChaseDynastyScraper/1.0 (contact: youremail@example.com)",
    "Accept-Language": "en-US,en;q=0.9",
}

# ====== FUNCTION TO FETCH ONE PLAYER'S BIRTHDAY FROM WIKIPEDIA ======
def fetch_wikipedia_birthday(player_name: str, max_hits: int = 5):
    """
    Search Wikipedia for a player and try to extract a YYYY-MM-DD birthday.

    The search API is asked for up to `max_hits` results; each candidate page
    is fetched in order and the first one containing a
    <span class="bday">YYYY-MM-DD</span> element wins.

    Args:
        player_name: Free-text name passed to the Wikipedia search API.
        max_hits: Maximum number of search results to inspect.

    Returns:
        (birth_date_iso, page_url) when a candidate page has a bday span;
        (None, first_hit_url) when there were hits but none had a birthday;
        (None, None) when the search failed or returned no results.
    """
    # Local import keeps the function self-contained within this cell.
    from urllib.parse import quote

    def _page_url(title):
        # Percent-encode characters such as "?", "#" and "%" that would
        # otherwise be parsed as URL syntax instead of part of the title.
        # Characters that are legal in a URL path are left readable.
        return "https://en.wikipedia.org/wiki/" + quote(
            title.replace(" ", "_"), safe="/:@!$&'()*+,;="
        )

    # Step 1: search for up to max_hits pages
    params = {
        "action": "query",
        "list": "search",
        "format": "json",
        "srsearch": player_name,
        "srlimit": max_hits,
    }
    try:
        r = requests.get(WIKI_API, params=params, headers=REQUEST_HEADERS, timeout=15)
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP failure or a non-JSON body: best-effort "no result".
        # Other exceptions (e.g. a missing HTML parser) are setup problems
        # and are deliberately left to propagate.
        return None, None

    if not isinstance(data, dict):
        return None, None

    search_results = data.get("query", {}).get("search", [])
    titles = [hit["title"] for hit in search_results if "title" in hit]
    if not titles:
        return None, None

    # Step 2: iterate each hit and look for a .bday span
    for title in titles:
        page_url = _page_url(title)

        try:
            page = requests.get(page_url, headers=REQUEST_HEADERS, timeout=15)
        except requests.RequestException:
            # One unreachable page must not discard the remaining candidates.
            continue
        if page.status_code != 200:
            continue

        soup = BeautifulSoup(page.text, "lxml")

        bday_span = soup.find("span", {"class": "bday"})
        if bday_span:
            return bday_span.text.strip(), page_url

    # No candidate page had a .bday span: hand back the top hit's URL so the
    # caller still has a page to inspect manually.
    return None, _page_url(titles[0])

# ====== LOAD MISSING LIST ======
df = pd.read_csv(missing_csv)
print(f"Players with missing birthdays: {len(df)}")

# Start both result columns empty; values from an earlier run are discarded.
for result_col in ("wiki_birth_date", "wiki_source_url"):
    df[result_col] = None

# ====== SCRAPE WIKIPEDIA FOR EACH PLAYER ======
# Only the name is needed per row, so walk the name column directly.
for idx, name in df["player_name"].items():
    print(f"[{idx+1}/{len(df)}] {name}...", end=" ")

    birth_date, src_url = fetch_wikipedia_birthday(name, max_hits=5)
    df.at[idx, "wiki_birth_date"] = birth_date
    df.at[idx, "wiki_source_url"] = src_url

    print(f"FOUND: {birth_date}" if birth_date else "not found")

    # Pause between players so requests to Wikipedia stay spaced out.
    time.sleep(0.5)

# ====== SAVE RESULT ======
out_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_csv, index=False)

print("\nDone.")
print("Saved to:", out_csv)
still_missing = df["wiki_birth_date"].isna().sum()
print("Still missing:", still_missing)
df.head()


### Merging Primary Birthdays With Wikipedia Fallback

This cell combines the original birthday results  
(`skill_draftee_birthdays.csv`) with the Wikipedia-enriched fallback  
(`skill_draftees_missing_birthdays_wiki.csv`).

Steps performed:
1. Load both CSV files
2. Normalize player names
3. Merge them on `player_name`
4. For each player:
   - Keep the original `birth_date` if it exists
   - Otherwise use the `wiki_birth_date`
5. Do the same for the source URL
6. Drop the temporary wiki columns
7. Save the final unified dataset to:

`data/processed/skill_draftee_birthdays_master.csv`

The output includes exactly one birthday and one source URL per player.


In [5]:
import pandas as pd
from pathlib import Path

# ====== CONFIG ======
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Primary birthday results (output of birthdays_only.py)
csv1 = ROOT.joinpath("data", "scraper", "skill_draftee_birthdays.csv")

# Missing-birthday list enriched with the Wikipedia lookup columns
csv2 = ROOT.joinpath("data", "scraper", "skill_draftees_missing_birthdays_wiki.csv")

# Destination of the unified master file
out_csv = ROOT.joinpath("data", "processed", "skill_draftee_birthdays_master.csv")

# ====== LOAD FILES ======
df1 = pd.read_csv(csv1)  # primary birthdays
df2 = pd.read_csv(csv2)  # Wikipedia fallback

# ====== NORMALIZE PLAYER NAME ======
def clean_name(x):
    """Return `x` as a whitespace-stripped string; missing values (per `pd.isna`) become ""."""
    return "" if pd.isna(x) else str(x).strip()

df1["player_name"] = df1["player_name"].apply(clean_name)
df2["player_name"] = df2["player_name"].apply(clean_name)

# ====== MERGE ON player_name (outer to keep everything) ======
# The merge key is the name alone, so a name that appears more than once in
# BOTH files would be multiplied (2 x 2 = 4 rows). Keep a single Wikipedia row
# per name, preferring one that actually carries a birthday; the stable sort
# preserves the original order among rows of equal priority.
wiki = (
    df2[["player_name", "wiki_birth_date", "wiki_source_url"]]
    .sort_values("wiki_birth_date", key=lambda s: s.isna(), kind="stable")
    .drop_duplicates(subset="player_name", keep="first")
)

merged = df1.merge(wiki, on="player_name", how="outer")

# ====== COLLAPSE INTO SINGLE COLUMNS ======
# For the birthday and then the source URL: keep the existing value when
# present, otherwise fall back to the Wikipedia value.
for primary, fallback in (
    ("birth_date", "wiki_birth_date"),
    ("birthdate_source_url", "wiki_source_url"),
):
    if primary not in merged.columns or fallback not in merged.columns:
        raise ValueError(f"Expected columns '{primary}' and '{fallback}' not found.")
    merged[primary] = merged[primary].combine_first(merged[fallback])

# (Optional) drop the wiki-specific helper columns now that we've merged them
merged = merged.drop(columns=["wiki_birth_date", "wiki_source_url"])

# ====== SAVE RESULT ======
# Create the output directory first, as the other saving cells do.
out_csv.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(out_csv, index=False)

print("Done.")
print("Saved merged file to:", out_csv)
print("Total rows:", len(merged))
print("Missing final birthdays:", merged["birth_date"].isna().sum())
merged.head()


Done.
Saved merged file to: /Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty/data/processed/skill_draftee_birthdays_master.csv
Total rows: 2122
Missing final birthdays: 9


Unnamed: 0,player_name,birth_date,birthdate_source_url
0,A.J. Brown,1997-06-30,https://www.pro-football-reference.com/players...
1,A.J. Derby,1991-09-20,https://www.pro-football-reference.com/players...
2,A.J. Feeley,1977-05-16,https://www.pro-football-reference.com/players...
3,A.J. Green,1988-07-31,https://www.pro-football-reference.com/players...
4,A.J. Jenkins,1989-09-30,https://www.pro-football-reference.com/players...


In [None]:
import pandas as pd
from pathlib import Path

# ====== CONFIG ======
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Inputs: the unified birthday master file and the DOM table.
birth_csv = ROOT.joinpath("data", "processed", "skill_draftee_birthdays_master.csv")
dom_csv = ROOT.joinpath("data", "processed", "players_dom_numbered_with_years.csv")

# Output: the DOM table joined onto the birthdays.
out_csv = ROOT.joinpath(
    "data", "processed", "players_dom_numbered_with_years_and_birthdays.csv"
)

# ====== LOAD FILES ======
df_birth = pd.read_csv(birth_csv)
df_dom = pd.read_csv(dom_csv)

# ====== NORMALIZE PLAYER NAME ======
def clean_name(x):
    """Return `x` as a whitespace-stripped string; missing values (per `pd.isna`) become ""."""
    return "" if pd.isna(x) else str(x).strip()

# Normalise the join key on both sides before merging.
df_birth["player_name"] = df_birth["player_name"].apply(clean_name)
df_dom["player_name"] = df_dom["player_name"].apply(clean_name)

# ====== MERGE ======
# Birthdays are the left/base table, so every birthday row is kept and DOM
# columns are attached wherever the name matches.
merged = pd.merge(df_birth, df_dom, how="left", on="player_name")

# ====== REORDER COLUMNS ======
# Identity columns first, everything else in its existing order.
front_cols = ["player_name", "birth_date", "birthdate_source_url"]
rest_cols = []
for col in merged.columns:
    if col not in front_cols:
        rest_cols.append(col)

merged = merged[front_cols + rest_cols]

# ====== SAVE ======
out_csv.parent.mkdir(parents=True, exist_ok=True)
merged.to_csv(out_csv, index=False)

print("Saved merged file to:", out_csv)
print("Rows:", len(merged))
print("Columns:", len(merged.columns))
missing_birthdays = merged["birth_date"].isna().sum()
print("Missing birthdays in final (sanity check):", missing_birthdays)
merged.head()


In [None]:
import pandas as pd
from pathlib import Path
from difflib import SequenceMatcher

# ====== CONFIG ======
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Birthday+DOM merge output, and the DOM table it was built from.
merged_path = ROOT.joinpath(
    "data", "processed", "players_dom_numbered_with_years_and_birthdays.csv"
)
dom_path = ROOT.joinpath("data", "processed", "players_dom_numbered_with_years.csv")

# ====== LOAD FILES ======
merged = pd.read_csv(merged_path)
dom = pd.read_csv(dom_path)

# Row counts of both inputs, for a quick sanity check.
print("Merged rows:", len(merged))
print("DOM rows:", len(dom))

# ====== NORMALIZE NAMES ======
def clean_name(x):
    """Return `x` as a whitespace-stripped string; missing values (per `pd.isna`) become ""."""
    return "" if pd.isna(x) else str(x).strip()

merged["player_name_clean"] = merged["player_name"].apply(clean_name)
dom["player_name_clean"]    = dom["player_name"].apply(clean_name)

# ====== IDENTIFY "EMPTY DOM" RECORDS IN MERGED ======
# Heuristic: if ALL DOM-related columns are NaN for a row, treat it as an empty DOM record.
# Adjust this tuple if your column names differ. "DOM" also matches "DOM+",
# "PDOM" matches "PDOM+", and so on, so the "+" variants need no separate entry.
dom_like_cols = [
    c for c in merged.columns if c.startswith(("Year", "DOM", "PDOM", "RDOM"))
]

print("Number of DOM-related columns considered:", len(dom_like_cols))

# With zero columns selected, `.all(axis=1)` is True for every row, which would
# silently flag every player as "empty". Fail loudly instead.
if not dom_like_cols:
    raise ValueError(
        "No DOM-related columns found in the merged file; "
        "adjust the column prefixes to match your column names."
    )

mask_empty_dom = merged[dom_like_cols].isna().all(axis=1)
empty_dom = merged[mask_empty_dom].copy()

print("Players with completely empty DOM records in merged:", len(empty_dom))

# ====== BUILD CANDIDATE NAME SET FROM ORIGINAL DOM TABLE ======
# clean_name() has already turned missing names into "", so drop blanks here
# rather than relying on dropna().
dom_names = [n for n in dom["player_name_clean"].unique() if n]

def best_match(name, candidates):
    """
    Find the candidate most similar to `name` by SequenceMatcher ratio.

    Returns (candidate, score) with score in [0, 1]. The first candidate
    reaching the highest score wins ties. Returns (None, 0.0) when there are
    no candidates or none scores above zero.
    """
    top_candidate, top_score = None, 0.0
    for candidate in candidates:
        score = SequenceMatcher(a=name, b=candidate).ratio()
        if score <= top_score:
            # Not strictly better than the current leader: keep the earlier one.
            continue
        top_candidate, top_score = candidate, score
    return top_candidate, top_score

# ====== FUZZY MATCH EMPTY-DOM PLAYERS AGAINST DOM TABLE ======
rows = []
THRESHOLD = 0.50  # similarity cutoff in [0, 1]; raise toward 0.75–0.9 for stricter matching

for _, row in empty_dom.iterrows():
    name = row["player_name_clean"]
    if not name:
        # Blank names cannot be matched meaningfully.
        continue
    match, score = best_match(name, dom_names)
    if score >= THRESHOLD:
        rows.append({
            "merged_player_name": row["player_name"],
            "merged_player_name_clean": name,
            "matched_dom_name": match,
            "similarity": score
        })

# Declare the columns explicitly so that an empty result still has a
# "similarity" column; otherwise sort_values raises KeyError when nothing matched.
matches_df = pd.DataFrame(
    rows,
    columns=[
        "merged_player_name",
        "merged_player_name_clean",
        "matched_dom_name",
        "similarity",
    ],
).sort_values("similarity", ascending=False)

print("\nPotential near-matches for empty DOM players (similarity >= {:.2f}):".format(THRESHOLD))
print("Total candidates:", len(matches_df))

# Show top 30 for inspection. Printed explicitly because only the last
# expression of a cell is displayed automatically.
print(matches_df.head(30))

# ====== SAVE RESULTS TO CSV ======
out_csv = ROOT / "data" / "processed" / "potential_dom_name_matches.csv"
out_csv.parent.mkdir(parents=True, exist_ok=True)
matches_df.to_csv(out_csv, index=False)

print("\nCSV saved to:", out_csv)
matches_df.head()



In [None]:
import pandas as pd
from pathlib import Path

# ====== CONFIG ======
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

in_csv  = ROOT / "data" / "processed" / "players_dom_numbered_with_years_and_birthdays.csv"
out_csv = ROOT / "data" / "processed" / "players_missing_dom_metrics.csv"

# ====== LOAD FILE ======
df = pd.read_csv(in_csv)
print("Loaded:", len(df))

# ====== IDENTIFY DOM/PDOM/RDOM COLUMNS ======
# str.startswith accepts a tuple of prefixes. "DOM" also matches "DOM+",
# "PDOM" matches "PDOM+", etc., so the "+" variants need no separate entry.
dom_cols = [c for c in df.columns if c.startswith(("DOM", "PDOM", "RDOM"))]

print("DOM-related columns found:", len(dom_cols))

# With zero metric columns, `.all(axis=1)` is True for every row and the whole
# file would be exported as "missing". Fail loudly instead.
if not dom_cols:
    raise ValueError(
        "No DOM/PDOM/RDOM columns found in the input file; "
        "check the column prefixes against the CSV header."
    )

# ====== FIND ROWS WHERE *ALL* METRIC COLUMNS ARE EMPTY ======
mask_missing = df[dom_cols].isna().all(axis=1)

missing_df = df[mask_missing].copy()
print("Players missing ALL DOM/PDOM/RDOM metrics:", len(missing_df))

# ====== SAVE TO CSV ======
out_csv.parent.mkdir(parents=True, exist_ok=True)
missing_df.to_csv(out_csv, index=False)

print("Saved →", out_csv)

missing_df.head()


In [None]:
import subprocess
import sys
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# ====== PATH CONFIG ======
ROOT = Path("/Users/chasesiegel/Desktop/Comp_Sci/Capstone/Dynasty")

# Stage 1: run birthdays_only.py
SCRIPT = ROOT / "src" / "scrapers" / "birthdays_only.py"
INPUT_CSV = ROOT / "data" / "scraper" / "skill_draftees_2000_2025.csv"
OUTPUT_CSV = ROOT / "data" / "scraper" / "skill_draftee_birthdays.csv"

print("=== Stage 1: Running birthdays_only.py ===")
# sys.executable is the interpreter executing this cell, so the script runs
# in the same Python environment.
cmd = [
    sys.executable,
    str(SCRIPT),
    "--in", str(INPUT_CSV),
    "--out", str(OUTPUT_CSV),
]
print("Running:", " ".join(cmd))

# Capture both streams as text so they can be echoed below.
result = subprocess.run(cmd, capture_output=True, text=True)

print("\n--- STDOUT ---")
print(result.stdout)

print("\n--- STDERR ---")
print(result.stderr)

# Stop here if the script failed. Otherwise the next stage would read a stale
# or missing OUTPUT_CSV and then overwrite it. Raises
# subprocess.CalledProcessError on a non-zero exit status; the captured output
# has already been printed above for debugging.
result.check_returncode()

print("\nFinished Stage 1.")

# ====== Stage 2: Wikipedia fallback for missing birthdays ======

print("\n=== Stage 2: Wikipedia fallback for missing birthdays ===")

# Read back the file that Stage 1 wrote.
df_master = pd.read_csv(OUTPUT_CSV)
print(f"Loaded master birthdays file with {len(df_master)} rows from {OUTPUT_CSV}")

# Pick the first known spelling of the birthday column that is present.
birthday_col = next(
    (
        cand
        for cand in ["birth_date", "birthday", "birthdate", "BirthDate", "Birthday"]
        if cand in df_master.columns
    ),
    None,
)
if birthday_col is None:
    raise ValueError(
        "Could not find a birthday column in OUTPUT_CSV. "
        "Expected one of: birth_date, birthday, birthdate, BirthDate, Birthday."
    )

# Same approach for the player-name column.
name_col = next(
    (
        cand
        for cand in ["player_name", "Player", "player", "name", "Name"]
        if cand in df_master.columns
    ),
    None,
)
if name_col is None:
    raise ValueError("Could not find a player name column (expected something like 'player_name').")

# A birthday counts as missing when it is NaN or blank after stripping whitespace.
is_nan = df_master[birthday_col].isna()
is_blank = df_master[birthday_col].astype(str).str.strip() == ""
mask_missing = is_nan | is_blank
df_missing = df_master.loc[mask_missing].copy()

print(f"Players with missing birthdays after Stage 1: {len(df_missing)}")

if df_missing.empty:
    print("No missing birthdays to fill from Wikipedia. Skipping Stage 2.")
else:
    # Imported here so this block brings its own dependency into scope.
    from urllib.parse import quote

    # ====== WIKIPEDIA CONFIG ======
    WIKI_API = "https://en.wikipedia.org/w/api.php"
    REQUEST_HEADERS = {
        "User-Agent": "ChaseDynastyScraper/1.0 (contact: youremail@example.com)",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def fetch_wikipedia_birthday(player_name: str, max_hits: int = 5):
        """
        Search Wikipedia for a player and try to extract a YYYY-MM-DD birthday.

        The search API is asked for up to `max_hits` results; each candidate
        page is fetched in order and the first one containing a
        <span class="bday">YYYY-MM-DD</span> element wins.

        Args:
            player_name: Free-text name passed to the Wikipedia search API.
            max_hits: Maximum number of search results to inspect.

        Returns:
            (birth_date_iso, page_url) when a candidate page has a bday span;
            (None, first_hit_url) when there were hits but none had a birthday;
            (None, None) when the search failed or returned no results.
        """

        def _page_url(title):
            # Percent-encode characters such as "?", "#" and "%" that would
            # otherwise be parsed as URL syntax instead of part of the title.
            # Characters that are legal in a URL path are left readable.
            return "https://en.wikipedia.org/wiki/" + quote(
                title.replace(" ", "_"), safe="/:@!$&'()*+,;="
            )

        # Step 1: search for up to max_hits pages
        params = {
            "action": "query",
            "list": "search",
            "format": "json",
            "srsearch": player_name,
            "srlimit": max_hits,
        }
        try:
            r = requests.get(WIKI_API, params=params, headers=REQUEST_HEADERS, timeout=15)
            r.raise_for_status()
            data = r.json()
        except (requests.RequestException, ValueError):
            # Network/HTTP failure or a non-JSON body: best-effort "no result".
            # Other exceptions (e.g. a missing HTML parser) are setup problems
            # and are deliberately left to propagate.
            return None, None

        if not isinstance(data, dict):
            return None, None

        search_results = data.get("query", {}).get("search", [])
        titles = [hit["title"] for hit in search_results if "title" in hit]
        if not titles:
            return None, None

        # Step 2: iterate each hit and look for a .bday span
        for title in titles:
            page_url = _page_url(title)

            try:
                page = requests.get(page_url, headers=REQUEST_HEADERS, timeout=15)
            except requests.RequestException:
                # One unreachable page must not discard the remaining candidates.
                continue
            if page.status_code != 200:
                continue

            soup = BeautifulSoup(page.text, "lxml")

            bday_span = soup.find("span", {"class": "bday"})
            if bday_span:
                return bday_span.text.strip(), page_url

        # No candidate page had a .bday span: hand back the top hit's URL so
        # there is still a page to inspect manually.
        return None, _page_url(titles[0])

    # Prepare new columns on the missing subset
    df_missing["wiki_birth_date"] = None
    df_missing["wiki_source_url"] = None

    print("\nScraping Wikipedia for missing players...\n")

    # Progress is counted over the missing subset. df_missing keeps the master
    # frame's index labels, so those labels are not a usable position counter.
    total_missing = len(df_missing)
    for position, (idx, row) in enumerate(df_missing.iterrows(), start=1):
        name = row[name_col]
        print(f"[{position}/{total_missing}] {name}...", end=" ")

        birth_date, src_url = fetch_wikipedia_birthday(str(name), max_hits=5)
        df_missing.at[idx, "wiki_birth_date"] = birth_date
        df_missing.at[idx, "wiki_source_url"] = src_url

        if birth_date:
            print(f"FOUND: {birth_date}")
        else:
            print("not found")

        # Gentle rate limiting so we don't hammer Wikipedia
        time.sleep(0.5)

    # Write sources into the source-URL column that already exists, if any.
    # The merge cells of this notebook expect `birthdate_source_url`, so that
    # name is preferred and is also used when no such column exists yet.
    source_col = next(
        (
            c
            for c in ("birthdate_source_url", "birthday_source_url")
            if c in df_master.columns
        ),
        "birthdate_source_url",
    )

    # Merge Wikipedia birthdays back into the master dataframe.
    # df_missing shares index labels with df_master, so `idx` addresses the
    # same row in both. Only rows with a non-blank wiki birthday are touched.
    for idx, row in df_missing.iterrows():
        wiki_bd = row["wiki_birth_date"]
        if pd.isna(wiki_bd) or str(wiki_bd).strip() == "":
            continue
        df_master.at[idx, birthday_col] = wiki_bd
        # Create the source column lazily, only once a birthday is actually filled.
        if source_col not in df_master.columns:
            df_master[source_col] = None
        df_master.at[idx, source_col] = row["wiki_source_url"]

    # Save debug CSV of missing list + wiki results
    wiki_debug_csv = ROOT / "data" / "processed" / "skill_draftees_missing_birthdays_wiki.csv"
    wiki_debug_csv.parent.mkdir(parents=True, exist_ok=True)
    df_missing.to_csv(wiki_debug_csv, index=False)
    print("\nSaved Wikipedia debug results to:", wiki_debug_csv)

    # Show how many are still missing after Wikipedia
    mask_still_missing = df_master[birthday_col].isna() | (df_master[birthday_col].astype(str).str.strip() == "")
    print("Still missing birthdays after Wikipedia fallback:", mask_still_missing.sum())

# Save final enriched birthdays CSV (overwriting OUTPUT_CSV)
df_master.to_csv(OUTPUT_CSV, index=False)
print("\n=== All done ===")
print("Final combined birthdays (with Wikipedia fallback) saved to:", OUTPUT_CSV)

df_master.head()
