# In [None]:  (Jupyter cell marker left over from the notebook export)
import os
import re
import glob
import pandas as pd
from bs4 import BeautifulSoup

# ==== EDIT THESE ====
# INPUT_DIR is scanned (non-recursively) for *.txt files; each filename is
# expected to contain the game date as YYYY-MM-DD.
INPUT_DIR  = "/users/cooperfoster/desktop/w_hoops/schedule_links"       # folder containing YYYY-MM-DD.txt files
# Path of the CSV written once all input files have been parsed.
OUTPUT_CSV = "/users/cooperfoster/desktop/w_hoops/game_links.csv"
# Site root prepended to each boxscore href to build the absolute URL.
BASE = "https://www.sports-reference.com"
# ====================

# Captures the path segment that follows /cbb/schools/ in a team link (the school id).
sid_re = re.compile(r"/cbb/schools/([^/]+)/")
# Captures the first YYYY-MM-DD shaped substring; applied to input filenames.
date_re = re.compile(r"(\d{4}-\d{2}-\d{2})")

def extract_sid(a_tag):
    """Return the school id embedded in an anchor's href, or None.

    The id is whatever ``sid_re`` captures, i.e. the path segment directly
    after ``/cbb/schools/``.

    Args:
        a_tag: An anchor element exposing ``.get("href")``, or a falsy value
            (e.g. None) when the team row has no link.

    Returns:
        The captured school id string, or None when ``a_tag`` is falsy, the
        href is missing/empty, or the href does not match the pattern.
    """
    link = a_tag.get("href") if a_tag else None
    if link:
        found = sid_re.search(link)
        if found:
            return found.group(1)
    return None

# Whole-word matchers for the description fallback. A plain substring test is
# not safe here: "tournament" contains "men", so any description mentioning a
# tournament would be classified as a men's game.
_women_re = re.compile(r"\bwomen(?:['\u2019]?s)?\b")
_men_re = re.compile(r"\bmen(?:['\u2019]?s)?\b")


def infer_gender(game_div):
    """Classify a game summary element as "women" or "men".

    The element's own CSS classes are checked first (``gender-f`` /
    ``gender-m``). If neither is present, the text of the first ``td.desc``
    cell inside the element is searched for the standalone words
    "women"/"women's" or "men"/"men's" (case-insensitive).

    Args:
        game_div: An element exposing ``.get("class", default)`` and
            ``.select_one(css_selector)``, such as a BeautifulSoup tag.

    Returns:
        "women", "men", or None when neither source identifies the gender.
    """
    # Primary signal: gender-f / gender-m on the div itself.
    classes = set(game_div.get("class", []))
    if "gender-f" in classes:
        return "women"
    if "gender-m" in classes:
        return "men"

    # Fallback: description row text ("Women's ..." / "Men's ...").
    desc = game_div.select_one("td.desc")
    if desc is not None:
        text = desc.get_text(" ", strip=True).lower()
        if _women_re.search(text):
            return "women"
        if _men_re.search(text):
            return "men"
    return None

# One dict per game summary found, accumulated across every input file.
rows = []

# Sorted so the files are processed in filename order.
txt_paths = sorted(glob.glob(os.path.join(INPUT_DIR, "*.txt")))
for txt_path in txt_paths:
    # The game date is taken from the filename (e.g. 2025-01-24.txt);
    # it stays None when the filename carries no date.
    date_match = date_re.search(os.path.basename(txt_path))
    if date_match:
        game_date = date_match.group(1)
    else:
        game_date = None

    # Undecodable bytes are dropped rather than aborting the whole run.
    with open(txt_path, "r", encoding="utf-8", errors="ignore") as handle:
        html_fragment = handle.read().strip()

    # Empty or whitespace-only file: nothing to parse.
    if not html_fragment:
        continue

    # Supply an explicit root in case the file holds only an HTML fragment.
    soup = BeautifulSoup(f"<html><body>{html_fragment}</body></html>", "lxml")

    for summary in soup.select("div.game_summary"):
        winner_row = summary.select_one("tr.winner")
        loser_row = summary.select_one("tr.loser")
        # A team row may be absent, or present without a link.
        winner_link = winner_row.select_one("a") if winner_row else None
        loser_link = loser_row.select_one("a") if loser_row else None

        # The boxscore anchor lives in td.gamelink; take the first one in
        # the summary, whichever team row it appears on.
        box_link = summary.select_one("td.gamelink a")
        box_href = box_link.get("href") if box_link else None

        # Rows are kept even when a team has no school link, so a sid may be None.
        record = {
            "game_date": game_date,
            "gender": infer_gender(summary),          # "men" / "women" / None
            "winner_sid": extract_sid(winner_link),
            "loser_sid": extract_sid(loser_link),
            "boxscore_href": box_href,                # relative
            "boxscore_url": (BASE + box_href) if box_href else None,  # absolute
        }
        rows.append(record)

# Declare the columns explicitly so the frame has the expected schema even when
# no games were parsed. Without this, an empty `rows` list yields a frame with
# no columns and the filter below raises KeyError on "boxscore_href".
df = pd.DataFrame(
    rows,
    columns=[
        "game_date",
        "gender",
        "winner_sid",
        "loser_sid",
        "boxscore_href",
        "boxscore_url",
    ],
)

# Basic cleanup: drop rows with no boxscore link (should be rare, but safe)
df = df[df["boxscore_href"].notna()].reset_index(drop=True)

df.to_csv(OUTPUT_CSV, index=False)
print(f"Parsed {len(txt_paths)} files -> {len(df)} games")
print(f"Saved: {OUTPUT_CSV}")
# Notebook display of the first rows; has no effect when run as a plain script.
df.head()