In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# 1. The URL you want to scrape
url = "https://www.espn.com/mens-college-basketball/scoreboard/_/date/20250208/group/50"

# 2. Send a GET request with a browser-like user agent
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    )
}
# A timeout keeps a stalled connection from hanging the kernel indefinitely;
# raise_for_status() stops the cell on 4xx/5xx instead of parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

# 3. Parse the HTML
soup = BeautifulSoup(response.text, "html.parser")

In [12]:
import os
import re
import json
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
import pandas as pd

# ---------------- Config ----------------
# Crawl window as YYYYMMDD strings; both endpoints are inclusive.
START_DATE, END_DATE = "20241101", "20250313"

GROUP = 50  # 50 = NCAA Division I
OUT_DIR = "box-score-ids"  # directory that receives the per-day CSV files
OVERWRITE = False  # True to overwrite existing daily files
PAUSE_SECONDS = 3.4  # delay after each processed day; be polite (optional)

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/119.0.0.0 Safari/537.36",
        ]
    )
}

def _scoreboard_event_to_row(event: dict) -> dict:
    """Flatten one entry of the scoreboard ``events`` list into a game row.

    Missing *or null* nested values (``competitions``, ``competitors``,
    ``team``) produce ``None`` team names instead of raising.
    """
    # `.get(key, default)` only covers a missing key; `or` also covers an
    # explicit JSON null.
    competitions = event.get("competitions") or [{}]
    competition = competitions[0] or {}
    competitors = competition.get("competitors") or []

    def team_name(side: str):
        entry = next(
            (c for c in competitors if c and c.get("homeAway") == side),
            None,
        ) or {}
        return (entry.get("team") or {}).get("displayName")

    return {
        "game_id": event.get("id"),
        "home_team": team_name("home"),
        "away_team": team_name("away"),
    }


def fetch_games_api(date_yyyymmdd: str, group: int = GROUP) -> pd.DataFrame:
    """Preferred: ESPN public JSON API (no HTML parsing).

    Args:
        date_yyyymmdd: Day to query, formatted ``YYYYMMDD``.
        group: Value sent as the ``groups`` query parameter.

    Returns:
        DataFrame with columns ``game_id``, ``home_team``, ``away_team``
        (one row per event); an empty, column-less DataFrame when the
        response contains no events.

    Raises:
        requests.HTTPError: On a 4xx/5xx response (via ``raise_for_status``).
        requests.RequestException: On connection errors or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): ESPN's scoreboard endpoint is commonly seen under
    # /apis/site/v2/...; confirm this /apis/v2/ path actually returns events,
    # since get_games_for_date() falls back to HTML whenever this call fails.
    url = (
        "https://site.api.espn.com/apis/v2/sports/basketball/mens-college-basketball/"
        f"scoreboard?dates={date_yyyymmdd}&groups={group}"
    )
    r = requests.get(url, headers=HEADERS, timeout=30)
    r.raise_for_status()
    data = r.json()

    events = data.get("events") or []
    return pd.DataFrame([_scoreboard_event_to_row(e) for e in events])

def fetch_scoreboard_html(date_yyyymmdd: str, group: int = GROUP) -> str:
    """Fallback path: download the scoreboard page for one date/group.

    Returns the response body as text. A 4xx/5xx status raises through
    ``raise_for_status``; the request uses a 30-second timeout.
    """
    page_url = f"https://www.espn.com/mens-college-basketball/scoreboard/_/date/{date_yyyymmdd}/group/{group}"
    resp = requests.get(page_url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return resp.text

def parse_games_from_html(html: str) -> pd.DataFrame:
    """
    Parse game blocks from server-rendered DOM.
    <section class="Scoreboard" id="<game_id>">…</section>

    Args:
        html: Scoreboard page markup.

    Returns:
        DataFrame with columns ``game_id``, ``home_team``, ``away_team``.
        Team names are ``None`` when the expected name element is absent,
        and always ``None`` for rows recovered through the anchor fallback.
        An empty, column-less DataFrame is returned when no game is found.
    """
    soup = BeautifulSoup(html, "html.parser")
    games = []

    # Primary: section blocks with game id
    for sec in soup.select("section.Scoreboard[id]"):
        gid = sec.get("id")
        away = sec.select_one(".ScoreboardScoreCell__Item--away .ScoreCell__TeamName--shortDisplayName")
        home = sec.select_one(".ScoreboardScoreCell__Item--home .ScoreCell__TeamName--shortDisplayName")
        games.append({
            "game_id": gid,
            "home_team": home.get_text(strip=True) if home else None,
            "away_team": away.get_text(strip=True) if away else None,
        })

    # Secondary: backup to anchor pattern if nothing found
    if not games:
        # Several anchors can point at the same game; keep only the first
        # occurrence of each id so a game is not written more than once.
        seen_ids = set()
        for a in soup.find_all("a", href=True):
            m = re.search(r"/mens-college-basketball/game/_/gameId/(\d+)", a["href"])
            if not m:
                continue
            gid = m.group(1)
            if gid in seen_ids:
                continue
            seen_ids.add(gid)
            games.append({"game_id": gid, "home_team": None, "away_team": None})

    return pd.DataFrame(games)

def get_games_for_date(date_yyyymmdd: str, group: int = GROUP) -> pd.DataFrame:
    """Try API first; if empty/error, fall back to HTML.

    This function is best-effort and does not raise: a failure in either
    path is reported on stdout rather than swallowed silently, so a broken
    endpoint can be told apart from a day that really has no games.

    Args:
        date_yyyymmdd: Day to query, formatted ``YYYYMMDD``.
        group: Group id forwarded to both fetchers.

    Returns:
        DataFrame of games with a leading ``date`` column when any game was
        found; otherwise an empty DataFrame.
    """
    # API path
    try:
        df_api = fetch_games_api(date_yyyymmdd, group=group)
        if not df_api.empty:
            df_api.insert(0, "date", date_yyyymmdd)
            return df_api
    except Exception as exc:
        print(f"⚠️  {date_yyyymmdd}: API fetch failed ({exc!r}); falling back to HTML")

    # HTML fallback
    try:
        html = fetch_scoreboard_html(date_yyyymmdd, group=group)
        df_html = parse_games_from_html(html)
        if not df_html.empty:
            df_html.insert(0, "date", date_yyyymmdd)
        return df_html
    except Exception as exc:
        print(f"⚠️  {date_yyyymmdd}: HTML fallback failed ({exc!r})")
        return pd.DataFrame()

def daterange(start_yyyymmdd: str, end_yyyymmdd: str):
    """Yield every calendar day from start to end (both inclusive) as ``YYYYMMDD``.

    Nothing is yielded when the end date precedes the start date.
    """
    fmt = "%Y%m%d"
    first_day = datetime.strptime(start_yyyymmdd, fmt)
    last_day = datetime.strptime(end_yyyymmdd, fmt)
    # Both values sit at midnight, so the difference is a whole number of days.
    span_days = (last_day - first_day).days
    for offset in range(span_days + 1):
        yield (first_day + timedelta(days=offset)).strftime(fmt)

def main():
    """Crawl every day in START_DATE..END_DATE and write one CSV per day with games.

    For each day, an existing output file is skipped unless OVERWRITE is
    true. Otherwise the day's games are fetched and, when non-empty, written
    to ``OUT_DIR/espn_ncaam_box_scores_<day>.csv``. Per-day errors are
    printed and do not stop the run. After each processed (non-skipped) day
    the loop sleeps for PAUSE_SECONDS.

    Raises:
        ValueError: If PAUSE_SECONDS is negative. This is checked once up
            front so a bad setting cannot silently disable the delay.
    """
    # Local import: `time` is not imported in the visible notebook cells.
    import time

    pause = PAUSE_SECONDS or 0
    if pause < 0:
        raise ValueError(f"PAUSE_SECONDS must be >= 0, got {PAUSE_SECONDS!r}")

    os.makedirs(OUT_DIR, exist_ok=True)
    total_days = 0
    total_games = 0

    for day in daterange(START_DATE, END_DATE):
        out_path = os.path.join(OUT_DIR, f"espn_ncaam_box_scores_{day}.csv")
        if os.path.exists(out_path) and not OVERWRITE:
            print(f"⏭️  {day}: exists, skipping (set OVERWRITE=True to redo)")
            continue

        try:
            df = get_games_for_date(day, group=GROUP)
            if df is None or df.empty:
                # No file is written, so this day is queried again on the next run.
                print(f"— {day}: no games")
            else:
                df.to_csv(out_path, index=False)
                total_days += 1
                total_games += len(df)
                print(f"✅ {day}: saved {len(df)} games → {out_path}")
        except Exception as e:
            print(f"❌ {day}: ERROR {e}")

        # be polite to the server
        if pause:
            time.sleep(pause)

    print(f"\nDone. Wrote {total_games} games across {total_days} days into '{OUT_DIR}/'.")

In [13]:
# Run the crawl defined in main(): one CSV per day with games, from START_DATE
# through END_DATE, written under OUT_DIR (existing files are skipped unless OVERWRITE).
main()

— 20241101: no games
— 20241102: no games
— 20241103: no games
✅ 20241104: saved 100 games → box-score-ids/espn_ncaam_box_scores_20241104.csv
✅ 20241105: saved 11 games → box-score-ids/espn_ncaam_box_scores_20241105.csv
✅ 20241106: saved 37 games → box-score-ids/espn_ncaam_box_scores_20241106.csv
✅ 20241107: saved 42 games → box-score-ids/espn_ncaam_box_scores_20241107.csv
✅ 20241108: saved 80 games → box-score-ids/espn_ncaam_box_scores_20241108.csv
✅ 20241109: saved 64 games → box-score-ids/espn_ncaam_box_scores_20241109.csv
✅ 20241110: saved 29 games → box-score-ids/espn_ncaam_box_scores_20241110.csv
✅ 20241111: saved 42 games → box-score-ids/espn_ncaam_box_scores_20241111.csv
✅ 20241112: saved 79 games → box-score-ids/espn_ncaam_box_scores_20241112.csv
✅ 20241113: saved 55 games → box-score-ids/espn_ncaam_box_scores_20241113.csv
✅ 20241114: saved 26 games → box-score-ids/espn_ncaam_box_scores_20241114.csv
✅ 20241115: saved 50 games → box-score-ids/espn_ncaam_box_scores_20241115.csv
