In [1]:
# ---------- Daily Barttorvik Ratings ----------

import asyncio
import os
from datetime import datetime, timedelta
from typing import List
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PWTimeout

# URL template for the barttorvik.com T-Rank page. The {year}, {begin} and
# {end} placeholders are filled in with str.format(); every other query
# parameter is fixed.
BASE_URL = (
    "https://barttorvik.com/trank.php"
    "?year={year}&sort=&hteam=&t2value=&conlimit=All&state=All"
    "&begin={begin}&end={end}&top=0&revquad=0&quad=5&venue=All&type=All&mingames=0#"
)

# ---------- HTML fetch ----------

async def goto_and_get_html(page, url: str, table_selector: str = "table", timeout_ms: int = 30000) -> str:
    """Navigate ``page`` to ``url`` and return the rendered HTML.

    Args:
        page: Playwright page object used for navigation.
        url: Address to load.
        table_selector: CSS selector that signals the data table is present.
        timeout_ms: Navigation timeout passed to ``page.goto``.

    Returns:
        The page's HTML after the table appeared, or the best available HTML
        if it never did (the caller is expected to detect an empty table).

    Raises:
        playwright TimeoutError: if navigation times out, or if neither the
        table nor a network-idle state is reached on the first attempt.
    """
    await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
    try:
        await page.wait_for_selector(table_selector, timeout=20000)
    except PWTimeout:
        await page.wait_for_load_state("networkidle", timeout=10000)

    # If we're still on the browser-verification page, poll for up to ~6 seconds.
    challenge_markers = ("Verifying your browser", "js_test_submitted")
    saw_challenge = False
    html = await page.content()
    for _ in range(6):
        if not any(marker in html for marker in challenge_markers):
            break
        saw_challenge = True
        await asyncio.sleep(1)
        html = await page.content()

    if saw_challenge:
        # The selector wait above may have matched the interstitial (or timed
        # out on it), so give the real page a chance to render its table.
        try:
            await page.wait_for_selector(table_selector, timeout=20000)
        except PWTimeout:
            # Best effort: return whatever is there; an absent table is
            # reported by the caller as "no data".
            pass
        html = await page.content()

    return html

# ---------- Table parsing ----------

def parse_first_table(html: str) -> List[List[str]]:
    """Extract the cell text of the first ``<table>`` found in ``html``.

    Returns one list of stripped cell strings per ``<tr>`` (both ``<th>`` and
    ``<td>`` cells are included). Rows without any cells are dropped. An empty
    list is returned when the document contains no table.
    """
    first_table = BeautifulSoup(html, "html.parser").select_one("table")
    if first_table is None:
        return []

    row_texts = (
        [cell.get_text(strip=True) for cell in tr.select("th, td")]
        for tr in first_table.select("tr")
    )
    return [cells for cells in row_texts if cells]


def rows_to_dataframe(rows: List[List[str]]) -> pd.DataFrame:
    """Convert raw scraped rows into a DataFrame.

    The first row is skipped, the second row supplies the column names, and
    the remaining rows are data. Data rows are truncated or right-padded with
    empty strings so that every row has exactly as many cells as the header.

    Rows in the body that are an exact copy of either of the two leading
    header rows are dropped, so a table that repeats its headers part-way
    through does not produce bogus data rows.

    Returns an empty DataFrame when there are fewer than three rows.
    """
    if len(rows) < 3:
        return pd.DataFrame()

    group_header, columns = rows[0], rows[1]  # second row = headers
    width = len(columns)

    data = [r for r in rows[2:] if r != columns and r != group_header]
    norm = [r[:width] + [""] * (width - len(r)) for r in data]
    return pd.DataFrame(norm, columns=columns)

# ---------- Orchestrator ----------

async def scrape_barttorvik_daily(
    year: int = 2021,
    begin: str = "20201101",
    end: str = "20210313",
    output_dir: str = "daily_csvs",
    master_csv: str = "barttorvik_2021_all.csv",
    table_selector: str = "table",
    headless: bool = True,
    pause_sec: float = 3.8
):
    """Scrape one T-Rank table per day and save the results as CSV files.

    For every date from ``begin`` to ``end`` (both inclusive, ``YYYYMMDD``)
    the page for the window ``begin``..<that date> is loaded, its first table
    is parsed, a ``Date`` column is prepended, and the result is written to
    ``<output_dir>/barttorvik_<date>.csv`` and appended to ``master_csv``.

    Args:
        year: Value for the ``year`` query parameter.
        begin: First date of the range; also the fixed ``begin`` of every request.
        end: Last date of the range.
        output_dir: Folder for the per-day CSV files (created if missing).
        master_csv: Combined CSV. NOTE: if this file already exists, rows are
            appended to it without a header, so re-running the same range
            duplicates rows.
        table_selector: CSS selector used to wait for the table.
        headless: Whether to launch the browser headless.
        pause_sec: Delay between consecutive page loads.

    Per-day failures are printed and do not stop the run. Returns ``None``.
    """
    os.makedirs(output_dir, exist_ok=True)
    start_dt = datetime.strptime(begin, "%Y%m%d")
    final_dt = datetime.strptime(end, "%Y%m%d")

    first_write = not os.path.exists(master_csv)
    total_rows = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(channel="chrome", headless=headless)
        try:
            context = await browser.new_context(
                user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/119.0.0.0 Safari/537.36"),
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = await context.new_page()

            dt = start_dt
            while dt <= final_dt:
                end_str = dt.strftime("%Y%m%d")
                url = BASE_URL.format(year=year, begin=begin, end=end_str)

                try:
                    html = await goto_and_get_html(page, url, table_selector=table_selector)
                    rows = parse_first_table(html)
                    df = rows_to_dataframe(rows)

                    if not df.empty:
                        df.insert(0, "Date", end_str)

                        # write individual daily file
                        daily_path = os.path.join(output_dir, f"barttorvik_{end_str}.csv")
                        df.to_csv(daily_path, index=False)
                        print(f"✔️  {end_str}: saved {len(df)} rows to {daily_path}")

                        # append to master CSV (header only on the very first write)
                        if first_write:
                            df.to_csv(master_csv, index=False)
                            first_write = False
                        else:
                            df.to_csv(master_csv, mode="a", header=False, index=False)

                        total_rows += len(df)
                    else:
                        print(f"⚠️  {end_str}: no data (empty table)")

                except Exception as e:
                    # Keep going: one bad day should not abort the whole range.
                    print(f"❌ {end_str}: ERROR {e}")

                dt += timedelta(days=1)
                # Pause only between requests, not after the final one.
                if dt <= final_dt:
                    await asyncio.sleep(pause_sec)
        finally:
            # Close the browser even if the run is interrupted or cancelled.
            await browser.close()

    print(f"\n✅ Done! {total_rows} total rows saved across days.")

# ---------- Run ----------
# Top-level `await` works in Jupyter or another async environment only.
# Loads one page per day for 20251103 through 20251106 (inclusive) and writes
# the per-day files to daily_csvs_2026/ plus the combined barttorvik_2026_all.csv.
await scrape_barttorvik_daily(
    year=2026,
    begin="20251103",
    end="20251106",
    output_dir="daily_csvs_2026",
    master_csv="barttorvik_2026_all.csv"
)

✔️  20251103: saved 225 rows to daily_csvs_2026/barttorvik_20251103.csv
✔️  20251104: saved 267 rows to daily_csvs_2026/barttorvik_20251104.csv
✔️  20251105: saved 284 rows to daily_csvs_2026/barttorvik_20251105.csv
✔️  20251106: saved 284 rows to daily_csvs_2026/barttorvik_20251106.csv

✅ Done! 1060 total rows saved across days.


In [2]:
# ---------- Game Ids (dated subfolders) ----------

import os
import re
import json
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# ---------------- Config ----------------
# Dates are YYYYMMDD strings; both ends of the range are processed.
START_DATE = "20251103"  # inclusive
END_DATE   = "20251106"  # inclusive
GROUP = 50               # 50 = NCAA Division I
OUT_DIR = "daily-box-score-ids"  # root folder; one dated subfolder per day
OVERWRITE = False        # True to overwrite existing daily files
PAUSE_SECONDS = 3.4      # be polite (optional)

# Browser-like User-Agent sent with every HTTP request made in this cell.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    )
}

def fetch_games_api(date_yyyymmdd: str, group: int = GROUP) -> pd.DataFrame:
    """Preferred: ESPN public JSON API (no HTML parsing).

    Requests the scoreboard for one date/group and returns a DataFrame with
    the columns ``game_id``, ``home_team`` and ``away_team`` (one row per
    event). The DataFrame is empty when the response lists no events.

    The endpoint lives under ``apis/site/v2`` (the same prefix the summary
    endpoint in this notebook uses); the previous ``apis/v2`` path did not
    match it.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = (
        "https://site.api.espn.com/apis/site/v2/sports/basketball/"
        "mens-college-basketball/scoreboard"
    )
    r = requests.get(
        url,
        params={"dates": date_yyyymmdd, "groups": group},
        headers=HEADERS,
        timeout=30,
    )
    r.raise_for_status()
    data = r.json()

    rows = []
    # `or []` also covers keys that are present but null.
    for e in data.get("events") or []:
        comp = (e.get("competitions") or [{}])[0]
        comps = comp.get("competitors") or []
        home = next((c for c in comps if c.get("homeAway") == "home"), {})
        away = next((c for c in comps if c.get("homeAway") == "away"), {})
        rows.append({
            "game_id": e.get("id"),
            "home_team": (home.get("team") or {}).get("displayName"),
            "away_team": (away.get("team") or {}).get("displayName"),
        })
    return pd.DataFrame(rows)

def fetch_scoreboard_html(date_yyyymmdd: str, group: int = GROUP) -> str:
    """Fallback: download the ESPN scoreboard page for the date/group.

    Returns the response body as text; raises ``requests.HTTPError`` on a
    non-2xx status.
    """
    scoreboard_url = (
        "https://www.espn.com/mens-college-basketball/scoreboard"
        f"/_/date/{date_yyyymmdd}/group/{group}"
    )
    response = requests.get(scoreboard_url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response.text

def parse_games_from_html(html: str) -> pd.DataFrame:
    """
    Parse game blocks from server-rendered DOM.
    <section class="Scoreboard" id="<game_id>">…</section>

    Returns a DataFrame with ``game_id``, ``home_team`` and ``away_team``.
    When no such sections exist, game ids are recovered from links matching
    ``/mens-college-basketball/game/_/gameId/<digits>``; team names are then
    ``None`` and each id is reported once, in first-seen order, even if the
    page links to the same game several times.
    """
    soup = BeautifulSoup(html, "html.parser")
    games = []

    # Primary: section blocks with game id
    for sec in soup.select("section.Scoreboard[id]"):
        gid = sec.get("id")
        away = sec.select_one(".ScoreboardScoreCell__Item--away .ScoreCell__TeamName--shortDisplayName")
        home = sec.select_one(".ScoreboardScoreCell__Item--home .ScoreCell__TeamName--shortDisplayName")
        games.append({
            "game_id": gid,
            "home_team": home.get_text(strip=True) if home else None,
            "away_team": away.get_text(strip=True) if away else None,
        })

    # Secondary: backup to anchor pattern if nothing found
    if not games:
        seen_ids = set()
        for a in soup.find_all("a", href=True):
            m = re.search(r"/mens-college-basketball/game/_/gameId/(\d+)", a["href"])
            if not m:
                continue
            gid = m.group(1)
            if gid in seen_ids:
                # Same game linked more than once; keep a single row.
                continue
            seen_ids.add(gid)
            games.append({"game_id": gid, "home_team": None, "away_team": None})

    return pd.DataFrame(games)

def get_games_for_date(date_yyyymmdd: str, group: int = GROUP) -> pd.DataFrame:
    """Try API first; if empty/error, fall back to HTML.

    Returns a DataFrame whose first column is ``date`` followed by the game
    columns, or an empty DataFrame when neither source yields any game.
    Failures of either source are reported with a printed warning instead of
    being silently discarded, so a network problem is distinguishable from a
    day that really has no games. No exception is propagated to the caller.
    """
    try:
        df_api = fetch_games_api(date_yyyymmdd, group=group)
        if not df_api.empty:
            df_api.insert(0, "date", date_yyyymmdd)
            return df_api
    except Exception as e:
        # Best effort: the HTML scoreboard below is the fallback.
        print(f"⚠️  {date_yyyymmdd}: API lookup failed ({e}); falling back to HTML scoreboard")

    try:
        html = fetch_scoreboard_html(date_yyyymmdd, group=group)
        df_html = parse_games_from_html(html)
        if not df_html.empty:
            df_html.insert(0, "date", date_yyyymmdd)
        return df_html
    except Exception as e:
        print(f"⚠️  {date_yyyymmdd}: HTML fallback failed ({e})")
        return pd.DataFrame()

def daterange(start_yyyymmdd: str, end_yyyymmdd: str):
    """Yield every date from start to end (both inclusive) as ``YYYYMMDD`` strings.

    Yields nothing when the end date precedes the start date.
    """
    fmt = "%Y%m%d"
    first_day = datetime.strptime(start_yyyymmdd, fmt)
    last_day = datetime.strptime(end_yyyymmdd, fmt)
    span_days = (last_day - first_day).days
    for offset in range(span_days + 1):
        yield (first_day + timedelta(days=offset)).strftime(fmt)

def main():
    """Collect game ids for every day in START_DATE..END_DATE.

    For each day a folder ``OUT_DIR/<YYYYMMDD>/`` is created. Days that
    already have a ``game_ids.csv`` are skipped unless OVERWRITE is set.
    Otherwise the games are fetched and written to ``game_ids.csv`` plus a
    one-id-per-line ``game_ids.txt``. Errors are printed per day and do not
    stop the loop; a summary line is printed at the end.
    """
    days_written = 0
    games_written = 0

    for day in daterange(START_DATE, END_DATE):
        # Dated subfolder, e.g. daily-box-score-ids/20251103/
        day_dir = os.path.join(OUT_DIR, day)
        os.makedirs(day_dir, exist_ok=True)

        csv_path = os.path.join(day_dir, "game_ids.csv")
        txt_path = os.path.join(day_dir, "game_ids.txt")

        already_saved = os.path.exists(csv_path) and not OVERWRITE
        if already_saved:
            print(f"⏭️  {day}: {csv_path} exists, skipping (set OVERWRITE=True to redo)")
            continue

        try:
            games = get_games_for_date(day, group=GROUP)
            if games is not None and not games.empty:
                # Fix the column set and order before writing.
                games = games[["date", "game_id", "home_team", "away_team"]]
                games.to_csv(csv_path, index=False)

                # Plain-text companion: one game id per line.
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.writelines(f"{gid}\n" for gid in games["game_id"].astype(str))

                days_written += 1
                games_written += len(games)
                print(f"✅ {day}: saved {len(games)} games → {csv_path} and {txt_path}")
            else:
                print(f"— {day}: no games")
        except Exception as e:
            print(f"❌ {day}: ERROR {e}")

        # Throttle between days that actually hit the network.
        if PAUSE_SECONDS:
            try:
                time.sleep(PAUSE_SECONDS)
            except Exception:
                pass

    print(f"\nDone. Wrote {games_written} games across {days_written} days into '{OUT_DIR}/<YYYYMMDD>/' folders.")

# Run the collection when this code is executed directly (the cell output
# below shows that it does run inside the notebook).
if __name__ == "__main__":
    main()

⏭️  20251103: daily-box-score-ids/20251103/game_ids.csv exists, skipping (set OVERWRITE=True to redo)
⏭️  20251104: daily-box-score-ids/20251104/game_ids.csv exists, skipping (set OVERWRITE=True to redo)
⏭️  20251105: daily-box-score-ids/20251105/game_ids.csv exists, skipping (set OVERWRITE=True to redo)
✅ 20251106: saved 44 games → daily-box-score-ids/20251106/game_ids.csv and daily-box-score-ids/20251106/game_ids.txt

Done. Wrote 44 games across 1 days into 'daily-box-score-ids/<YYYYMMDD>/' folders.


In [5]:
# ---------- Game Box Scores ----------

import os
import csv
import requests
from typing import Any, Dict, List, Tuple, Optional

# Per-game summary endpoint; {gid} is replaced with the game id via str.format().
SUMMARY_URL = "https://site.api.espn.com/apis/site/v2/sports/basketball/mens-college-basketball/summary?event={gid}"
# Headers sent with every summary request.
UA_HEADERS = {"User-Agent": "Mozilla/5.0"}

# ----------------------------
# Fetch
# ----------------------------
def _get_summary(game_id: str) -> Dict[str, Any]:
    """Download and decode the summary JSON for one game.

    Raises ``requests.HTTPError`` on a non-2xx response.
    """
    summary_url = SUMMARY_URL.format(gid=game_id)
    response = requests.get(summary_url, headers=UA_HEADERS, timeout=20)
    response.raise_for_status()
    payload = response.json()
    return payload

# ----------------------------
# Helpers
# ----------------------------
def _first_comp(summary: Dict[str, Any]) -> Dict[str, Any]:
    return (summary.get("header", {}).get("competitions") or [{}])[0]

def _competitors(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Return the competitor list of the first competition, or ``[]`` if absent."""
    found = _first_comp(summary).get("competitors")
    return found if found else []

def _home_comp(comp_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    return next((c for c in comp_list if c.get("homeAway") == "home"), comp_list[0] if comp_list else {})

def _away_comp(comp_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    return next((c for c in comp_list if c.get("homeAway") == "away"), comp_list[1] if len(comp_list) > 1 else {})

def _team_name(team_obj: Dict[str, Any]) -> str:
    return team_obj.get("displayName") or team_obj.get("name") or team_obj.get("location") or ""

def _parse_value_to_int(val: Any) -> Optional[int]:
    if val is None:
        return None
    try:
        return int(val)
    except Exception:
        try:
            # sometimes "42" or "42.0"
            return int(str(val).split(".")[0])
        except Exception:
            return None

def _extract_period_num(item: Dict[str, Any]) -> Optional[int]:
    """Read the period number from ``item["period"]``.

    Accepts both the flat form ``{"period": 1}`` and the nested form
    ``{"period": {"number": 1}}``.
    """
    period = item.get("period")
    raw_number = period.get("number") if isinstance(period, dict) else period
    return _parse_value_to_int(raw_number)

def _extract_score_from_item(item: Dict[str, Any]) -> Optional[int]:
    """Return the score stored in ``item`` as an int, or ``None``.

    The keys ``value``, ``displayValue`` and ``score`` are tried in that
    order. A key whose content cannot be parsed (for example a null
    ``value``) no longer ends the search: the next key is tried, so
    ``{"value": None, "displayValue": "42"}`` yields 42.
    """
    # common shapes: {"value": 42} or {"displayValue": "42"} or {"score": 42}
    for k in ("value", "displayValue", "score"):
        if k in item:
            parsed = _parse_value_to_int(item[k])
            if parsed is not None:
                return parsed
    return None

def _half_scores(competitor: Dict[str, Any]) -> Tuple[Optional[int], Optional[int]]:
    """
    Return ``(first_half, second_half)`` scores for one competitor.

    The per-period list is read from 'linescores', or 'scoreByPeriod' when the
    former is missing/empty. Entries that are not dicts are ignored. If any
    entry is labelled period 1 or 2, those labelled values are returned (either
    may be None). Otherwise the first two parseable scores, in list order, are
    used. ``(None, None)`` is returned when nothing usable is found.
    """
    raw_lines = competitor.get("linescores") or competitor.get("scoreByPeriod") or []
    if not (isinstance(raw_lines, list) and raw_lines):
        return (None, None)

    numbered: Dict[int, int] = {}
    in_order: List[int] = []

    for entry in raw_lines:
        if not isinstance(entry, dict):
            continue
        score = _extract_score_from_item(entry)
        if score is None:
            # Without a score the entry contributes to neither lookup.
            continue
        in_order.append(score)
        period = _extract_period_num(entry)
        if period is not None:
            numbered[period] = score

    # Explicit period numbers win over positional order.
    first_half, second_half = numbered.get(1), numbered.get(2)
    if not (first_half is None and second_half is None):
        return first_half, second_half

    if len(in_order) >= 2:
        return in_order[0], in_order[1]

    return (None, None)

def _write_csv(path: str, rows: List[Dict[str, Any]], header: List[str] = None):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    if not rows:
        if header:
            with open(path, "w", newline="", encoding="utf-8") as f:
                csv.DictWriter(f, fieldnames=header).writeheader()
        else:
            open(path, "w").close()
        return
    cols = header or list({k for r in rows for k in r.keys()})
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=cols)
        w.writeheader()
        w.writerows(rows)

# ----------------------------
# Parsers (summary-only)
# ----------------------------
def parse_game_info(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """Build the single game-info row for one game summary.

    The row holds the game id, the date and time split out of the ISO
    timestamp, the neutral-site flag, team names, first/second-half scores and
    final scores for the home and away competitors.

    The timestamp is taken from the first competition (falling back to the
    header). It is read through ``_first_comp`` so that an empty or null
    ``competitions`` list gives blank date fields instead of raising.

    Returns a one-element list so the result can be passed to a CSV writer.
    """
    header = summary.get("header", {}) or {}
    comp = _first_comp(summary)
    comps = _competitors(summary)

    home = _home_comp(comps)
    away = _away_comp(comps)

    # date/time (UTC ISO): "<date>T<time>"; with no "T" the whole value is the date
    dt_iso = comp.get("date") or header.get("date") or ""
    date_utc, _, time_utc = dt_iso.partition("T")

    # final scores
    home_score = _parse_value_to_int(home.get("score"))
    away_score = _parse_value_to_int(away.get("score"))

    # halves
    home_1h, home_2h = _half_scores(home)
    away_1h, away_2h = _half_scores(away)

    row = {
        "game_id": game_id,
        "date_utc": date_utc,
        "time_utc": time_utc,
        "neutral_site": bool(comp.get("neutralSite")),
        "home_team": _team_name((home.get("team") or {})),
        "away_team": _team_name((away.get("team") or {})),
        "home_1h": home_1h,
        "away_1h": away_1h,
        "home_2h": home_2h,
        "away_2h": away_2h,
        "home_score": home_score,
        "away_score": away_score,
    }
    return [row]

def parse_team_stats(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """Flatten ``summary["boxscore"]["teams"]`` into one row per team.

    Each row carries the team identity fields plus one column per statistic,
    keyed by the statistic's ``name`` (or ``label`` when there is no name) and
    holding its ``displayValue``.
    """
    boxscore = summary.get("boxscore", {}) or {}
    team_rows: List[Dict[str, Any]] = []

    for entry in boxscore.get("teams") or []:
        team = entry.get("team", {}) or {}
        record = {
            "game_id": game_id,
            "team_id": team.get("id"),
            "team": _team_name(team),
            "abbreviation": team.get("abbreviation"),
            "homeAway": entry.get("homeAway"),
            "displayOrder": entry.get("displayOrder"),
        }
        for stat in entry.get("statistics") or []:
            stat_key = stat.get("name") or stat.get("label")
            if not stat_key:
                continue
            record[stat_key] = stat.get("displayValue")
        team_rows.append(record)

    return team_rows

def parse_player_stats(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """Flatten ``summary["boxscore"]["players"]`` into one row per athlete entry.

    Every row holds the team and athlete identity fields, the starter /
    didNotPlay / ejected flags, and one column per stat obtained by pairing
    the statistics block's ``keys`` with the athlete's ``stats`` values.
    """
    boxscore = summary.get("boxscore", {}) or {}
    player_rows: List[Dict[str, Any]] = []

    for block in boxscore.get("players") or []:
        team = block.get("team", {}) or {}
        team_fields = {
            "game_id": game_id,
            "team_id": team.get("id"),
            "team": _team_name(team),
            "abbreviation": team.get("abbreviation"),
        }

        for pack in block.get("statistics") or []:
            stat_keys = pack.get("keys") or []
            for entry in pack.get("athletes") or []:
                person = entry.get("athlete", {}) or {}
                position = person.get("position") or {}

                record = dict(team_fields)
                record["athlete_id"] = person.get("id")
                record["athlete_name"] = person.get("displayName")
                record["jersey"] = person.get("jersey")
                record["position"] = position.get("abbreviation") or position.get("displayName")
                record["starter"] = entry.get("starter")
                record["didNotPlay"] = entry.get("didNotPlay")
                record["ejected"] = entry.get("ejected")

                # Pair stat names with values; extras on either side are ignored.
                record.update(zip(stat_keys, entry.get("stats") or []))
                player_rows.append(record)

    return player_rows

def parse_officials(summary: Dict[str, Any], game_id: str) -> List[Dict[str, Any]]:
    """Return one ``{"game_id", "official_name"}`` row per listed official.

    The officials list is looked up, in order, at ``summary["officials"]``,
    ``summary["gameInfo"]["officials"]`` and on the first competition. A
    ``gameInfo`` key that is present but null is treated like a missing one.
    The name is ``fullName``, or ``displayName`` when there is no full name.
    """
    officials = (
        summary.get("officials")
        or (summary.get("gameInfo") or {}).get("officials")
        or _first_comp(summary).get("officials")
        or []
    )
    return [
        {"game_id": game_id, "official_name": off.get("fullName") or off.get("displayName")}
        for off in officials
    ]

# ----------------------------
# Public: single-game saver
# ----------------------------
def save_single_game(game_id: str, outdir: str = "data/boxscores", season: str = "2026") -> Dict[str, str]:
    """
    Fetch one game's summary and write FOUR CSVs from it:
      1) game-info-{season}/{gid}_game_info.csv      (fixed columns incl. 1H/2H)
      2) team-stats-{season}/{gid}_team_stats.csv
      3) player-stats-{season}/{gid}_player_stats.csv
      4) officials-{season}/{gid}_officials.csv      (names only)

    Args:
        game_id: Game identifier used in the summary request and file names.
        outdir: Root folder that receives the four per-category subfolders.
        season: Suffix of the subfolder names. Defaults to "2026", which
            reproduces the previously hard-coded folder names.

    Returns:
        Mapping of ``game_info_csv`` / ``team_stats_csv`` /
        ``player_stats_csv`` / ``officials_csv`` to the written file paths.

    Raises:
        requests.HTTPError: if the summary request returns a non-2xx status.
    """
    summary = _get_summary(game_id)

    game_info_rows  = parse_game_info(summary, game_id)
    team_stats_rows = parse_team_stats(summary, game_id)
    player_rows     = parse_player_stats(summary, game_id)
    officials_rows  = parse_officials(summary, game_id)

    # Folder and file are joined as separate components so the path uses the
    # platform's separator throughout.
    game_info_path    = os.path.join(outdir, f"game-info-{season}", f"{game_id}_game_info.csv")
    team_stats_path   = os.path.join(outdir, f"team-stats-{season}", f"{game_id}_team_stats.csv")
    player_stats_path = os.path.join(outdir, f"player-stats-{season}", f"{game_id}_player_stats.csv")
    officials_path    = os.path.join(outdir, f"officials-{season}", f"{game_id}_officials.csv")

    _write_csv(
        game_info_path,
        game_info_rows,
        header=[
            "game_id","date_utc","time_utc","neutral_site",
            "home_team","away_team",
            "home_1h","away_1h","home_2h","away_2h",
            "home_score","away_score",
        ],
    )
    # Team/player columns depend on the stats present, so no fixed header.
    _write_csv(team_stats_path, team_stats_rows)
    _write_csv(player_stats_path, player_rows)
    _write_csv(officials_path, officials_rows, header=["game_id","official_name"])

    return {
        "game_info_csv": game_info_path,
        "team_stats_csv": team_stats_path,
        "player_stats_csv": player_stats_path,
        "officials_csv": officials_path,
    }


import glob
import pandas as pd

# Collect the game ids saved for one day and download each game's box score.
csv_files = sorted(glob.glob("daily-box-score-ids/20251106/*.csv"))
if not csv_files:
    # pd.concat on an empty sequence fails with an unhelpful message.
    raise FileNotFoundError("No game-id CSV files found in daily-box-score-ids/20251106/")
combined_df = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

import time
# Each game is fetched once even if its id appears in several rows/files.
game_ids = list(combined_df['game_id'].drop_duplicates())

failed_ids = []
for count, gid in enumerate(game_ids, start=1):
    print(count)
    try:
        save_single_game(gid)
    except Exception as e:
        # Record the failure and continue so one bad game does not abort the batch.
        failed_ids.append(gid)
        print(f"❌ {gid}: ERROR {e}")
    time.sleep(0.5)

if failed_ids:
    print(f"Failed game ids ({len(failed_ids)}): {failed_ids}")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
