In [None]:
# pip install playwright bs4
# python -m playwright install
# One-time setup. The `playwright` package must already be importable in this
# kernel's environment; the call below only downloads the Chromium browser build
# (not every browser, unlike the bare `playwright install` shown above).
import sys, subprocess
# sys.executable makes the command run under the same interpreter as the kernel.
# check_call raises subprocess.CalledProcessError if the command exits non-zero.
subprocess.check_call([sys.executable, "-m", "playwright", "install", "chromium"])

## 2021: 20201101 - 20210313

In [None]:
import asyncio, time
from typing import List
from playwright.async_api import async_playwright, TimeoutError as PWTimeout
from bs4 import BeautifulSoup

async def get_final_html(
    url: str,
    *,
    table_selector: str = "table",
    timeout_ms: int = 30000,
    headless: bool = True,
    selector_timeout_ms: int = 20000,
    idle_timeout_ms: int = 10000,
    max_verify_polls: int = 6,
) -> str:
    """Load `url` in a fresh Chromium instance and return the rendered HTML.

    Args:
        url: Page to open.
        table_selector: CSS selector that is awaited after navigation.
        timeout_ms: Navigation timeout passed to `page.goto`.
        headless: Whether Chromium is launched without a visible window.
        selector_timeout_ms: How long to wait for `table_selector` to appear.
        idle_timeout_ms: How long the fallback "networkidle" wait may take
            when the selector never shows up.
        max_verify_polls: Maximum number of one-second polls while the page
            still contains the browser-verification markers.

    Returns:
        The page's HTML as a string. If the verification markers are still
        present after all polls, the last HTML that was fetched is returned.

    Raises:
        playwright TimeoutError: if navigation times out, or if the selector
            wait AND the fallback network-idle wait both time out.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=headless)
        try:
            context = await browser.new_context(
                user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/119.0.0.0 Safari/537.36"),
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)

            # Prefer: wait for the real table to show up
            try:
                await page.wait_for_selector(table_selector, timeout=selector_timeout_ms)
            except PWTimeout:
                # Fallback: give network a chance to go idle, then re-check
                await page.wait_for_load_state("networkidle", timeout=idle_timeout_ms)

            # If we still landed on the verification page, poll briefly.
            # The HTML is re-fetched only after a sleep, so a page that is
            # already past verification costs a single content() round-trip.
            html = await page.content()
            for _ in range(max_verify_polls):
                if ("Verifying your browser" not in html
                        and "js_test_submitted" not in html):
                    break
                await asyncio.sleep(1)
                html = await page.content()

            return html
        finally:
            # Always release the browser, even when navigation or a wait raises.
            await browser.close()


def parse_first_table(html: str, cell_separator: str = "") -> List[List[str]]:
    """Extract the text of every cell in the first `<table>` found in `html`.

    Args:
        html: HTML document to parse.
        cell_separator: String placed between the text fragments found inside
            a single cell. The default empty string reproduces the historical
            behavior, where text from nested elements is concatenated with
            nothing in between (the notebook output shows values such as
            `7–00–0`, presumably two nested strings fused together). Pass
            e.g. `"|"` to keep such fragments separable.

    Returns:
        One list of stripped cell strings per `<tr>` that has at least one
        `<th>`/`<td>`; an empty list when the document contains no table.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.select_one("table")
    if table is None:
        return []

    rows = []
    for tr in table.select("tr"):
        cells = [
            cell.get_text(cell_separator, strip=True)
            for cell in tr.select("th, td")
        ]
        # Rows without any th/td cells carry no data and are skipped.
        if cells:
            rows.append(cells)
    return rows

# --- How to call this ---

# 1) If you're already inside `async def` (FastAPI/Jupyter/etc):

async def scrape_table(url: str, table_selector="table"):
    """Fetch `url` headlessly via get_final_html and return the first table's rows.

    Each row is a list of cell strings as produced by parse_first_table.
    """
    page_html = await get_final_html(
        url,
        table_selector=table_selector,
        headless=True,
    )
    return parse_first_table(page_html)

rows = await scrape_table('https://barttorvik.com/trank.php?year=2021&sort=&hteam=&t2value=&conlimit=All&state=All&begin=20201101&end=20211110&top=0&revquad=0&quad=5&venue=All&type=All&mingames=0#', "table.your-selector")

In [8]:
import pandas as pd
# Row 0 is discarded; row 1 supplies the column names and rows 2+ are the records.
columns, data = rows[1], rows[2:]

# Assemble the frame from the scraped records.
df = pd.DataFrame(data=data, columns=columns)

print(df.shape)
df.head(10)

(376, 24)


Unnamed: 0,Rk,Team,Conf,G,Rec,AdjOE,AdjDE,Barthag,EFG%,EFGD%,...,FTR,FTRD,2P%,2P%D,3P%,3P%D,3PR,3PRD,Adj T.,WAB
0,1,"Houston1 seed,Sweet Sixteen",B12,7,7–00–0,118.121,79.51,0.98961,51.2132,41.07,...,36.2126,36.2229,50.8162,40.41,34.6118,27.941,38.0164,41.3281,65.0338,1.48
1,2,"Purdue1 seed,Finals",B10,7,7–00–0,120.012,84.03,0.98372,57.219,42.715,...,46.32,18.43,53.395,47.7116,42.83,24.06,36.1211,43.0311,70.9127,2.81
2,3,"Arizona2 seed,Sweet Sixteen",P12,6,6–00–0,121.48,86.05,0.98133,57.518,44.234,...,37.1107,24.136,56.932,42.317,39.221,31.5129,31.3303,38.6221,75.41,1.83
3,4,"Connecticut1 seed,CHAMPS",BE,7,7–00–0,123.53,92.221,0.96664,57.816,43.622,...,38.182,30.2128,65.91,40.19,30.9236,34.1226,41.484,31.034,69.2201,1.017
4,5,"BYU6 seed, R64",B12,6,6–00–0,120.411,90.011,0.96585,58.611,42.313,...,26.1303,27.471,59.216,46.272,38.628,22.51,50.16,31.034,71.792,1.212
5,6,"Auburn4 seed, R64",SEC,6,5–10–0,115.832,87.69,0.96096,51.5125,40.54,...,44.428,38.7269,52.4118,42.921,33.3156,23.74,37.5175,33.692,73.046,0.4836
6,7,"Marquette2 seed,Sweet Sixteen",BE,7,6–10–0,117.025,90.917,0.94827,55.83,47.7105,...,29.6258,25.246,60.51,46.169,32.8178,33.5203,41.68,38.3218,70.4147,1.76
7,8,"Alabama4 seed,Final Four",SEC,7,5–20–0,130.51,101.7147,0.94578,59.55,51.6235,...,48.313,35.622,58.417,50.92,40.711,35.1254,41.974,36.1151,73.731,0.0356
8,9,"Florida Atlantic8 seed, R64",Amer,7,6–10–0,122.35,95.544,0.94529,57.915,46.879,...,32.421,28.087,57.423,46.894,39.221,31.3125,39.8116,35.0123,68.4235,1.019
9,10,"Baylor3 seed, R32",B12,6,6–00–0,127.42,99.7106,0.94381,59.08,46.879,...,47.217,30.0122,52.3121,49.2149,48.71,28.044,32.5285,33.281,71.98,1.75


In [23]:
# pip install playwright bs4 pandas python-dateutil
# python -m playwright install   # only once

import asyncio
import os
from datetime import datetime, timedelta
from typing import List
import pandas as pd
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright, TimeoutError as PWTimeout

# URL template for barttorvik.com's trank.php page. The {year}, {begin} and
# {end} placeholders are filled in with str.format by scrape_barttorvik_daily
# ({begin}/{end} are YYYYMMDD strings); every other query parameter is fixed.
BASE_URL = (
    "https://barttorvik.com/trank.php"
    "?year={year}&sort=&hteam=&t2value=&conlimit=All&state=All"
    "&begin={begin}&end={end}&top=0&revquad=0&quad=5&venue=All&type=All&mingames=0#"
)

# ---------- HTML fetch ----------

async def goto_and_get_html(
    page,
    url: str,
    table_selector: str = "table",
    timeout_ms: int = 30000,
    *,
    selector_timeout_ms: int = 20000,
    idle_timeout_ms: int = 10000,
    max_verify_polls: int = 6,
) -> str:
    """Navigate an existing Playwright page to `url` and return its HTML.

    Args:
        page: An open Playwright page; it is reused, not closed.
        url: Address to navigate to.
        table_selector: CSS selector awaited after navigation.
        timeout_ms: Navigation timeout passed to `page.goto`.
        selector_timeout_ms: How long to wait for `table_selector` to appear.
        idle_timeout_ms: How long the fallback "networkidle" wait may take
            when the selector never shows up.
        max_verify_polls: Maximum number of one-second polls while the page
            still contains the browser-verification markers.

    Returns:
        The page's HTML. If the verification markers are still present after
        all polls, the last HTML that was fetched is returned.

    Raises:
        playwright TimeoutError: if navigation times out, or if the selector
            wait AND the fallback network-idle wait both time out.
    """
    await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
    try:
        await page.wait_for_selector(table_selector, timeout=selector_timeout_ms)
    except PWTimeout:
        # Selector never appeared: let pending requests settle before checking.
        await page.wait_for_load_state("networkidle", timeout=idle_timeout_ms)

    # If we're still on the verification page, wait a bit. The HTML is only
    # re-fetched after a sleep, so the common case needs one content() call.
    html = await page.content()
    for _ in range(max_verify_polls):
        if "Verifying your browser" not in html and "js_test_submitted" not in html:
            break
        await asyncio.sleep(1)
        html = await page.content()

    return html

# ---------- Table parsing ----------

def parse_first_table(html: str, cell_separator: str = "") -> List[List[str]]:
    """Extract the text of every cell in the first `<table>` found in `html`.

    Args:
        html: HTML document to parse.
        cell_separator: String placed between the text fragments found inside
            a single cell. The default empty string reproduces the historical
            behavior, where text from nested elements is concatenated with
            nothing in between. Pass e.g. `"|"` to keep such fragments
            separable.

    Returns:
        One list of stripped cell strings per `<tr>` that has at least one
        `<th>`/`<td>`; an empty list when the document contains no table.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.select_one("table")
    if table is None:
        return []

    rows = []
    for tr in table.select("tr"):
        cells = [
            cell.get_text(cell_separator, strip=True)
            for cell in tr.select("th, td")
        ]
        # Rows without any th/td cells carry no data and are skipped.
        if cells:
            rows.append(cells)
    return rows


def rows_to_dataframe(
    rows: List[List[str]],
    drop_repeated_headers: bool = True,
) -> pd.DataFrame:
    """Convert raw scraped rows into a DataFrame.

    The first row is ignored, the second row is used as the column names and
    all following rows are data.

    Args:
        rows: Table rows as lists of cell strings.
        drop_repeated_headers: When True (default), data rows that are
            identical to the header row are dropped, so a header repeated
            inside the table body does not end up as a record. Pass False to
            keep every row after the header.

    Returns:
        A DataFrame with one column per header cell. Rows longer than the
        header are truncated; shorter rows are right-padded with "". An empty
        DataFrame (no columns) is returned when fewer than three rows are
        supplied.
    """
    if len(rows) < 3:
        return pd.DataFrame()

    columns = rows[1]  # second row = headers
    width = len(columns)

    records = []
    for raw in rows[2:]:
        if drop_repeated_headers and raw == columns:
            continue
        # Force every record to exactly `width` cells.
        records.append(raw[:width] + [""] * (width - len(raw)))

    return pd.DataFrame(records, columns=columns)

# ---------- Orchestrator ----------

async def scrape_barttorvik_daily(
    year: int = 2021,
    begin: str = "20201101",
    end: str = "20210313",
    output_dir: str = "daily_csvs",
    master_csv: str = "barttorvik_2021_all.csv",
    table_selector: str = "table",
    headless: bool = True,
    pause_sec: float = 3.8
):
    """Scrape one table snapshot per day and save the results as CSV files.

    For every date D from `begin` to `end` (inclusive) the page for the range
    `begin`..D is loaded, its first table is parsed, a "Date" column holding D
    is prepended, and the result is written to `<output_dir>/barttorvik_<D>.csv`
    and appended to `master_csv`.

    Args:
        year: Value for the `year` query parameter.
        begin: First date, YYYYMMDD. It is also the fixed range start in every URL.
        end: Last date, YYYYMMDD.
        output_dir: Directory for the per-day CSV files (created if missing).
        master_csv: Combined CSV. If it already exists, rows are appended to
            it without writing a header.
        table_selector: CSS selector awaited after each navigation.
        headless: Whether the browser runs without a visible window.
        pause_sec: Delay between consecutive requests.

    Raises:
        ValueError: if `begin` or `end` is not a valid YYYYMMDD date.

    Errors for an individual day are printed and that day is skipped; they do
    not abort the run.

    NOTE(review): the browser is launched with channel="chrome", which selects
    the Google Chrome channel rather than Playwright's bundled Chromium —
    confirm Chrome is available where this runs.
    """
    os.makedirs(output_dir, exist_ok=True)
    start_dt = datetime.strptime(begin, "%Y%m%d")
    final_dt = datetime.strptime(end, "%Y%m%d")

    first_write = not os.path.exists(master_csv)
    total_rows = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(channel="chrome", headless=headless)
        try:
            context = await browser.new_context(
                user_agent=("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                            "AppleWebKit/537.36 (KHTML, like Gecko) "
                            "Chrome/119.0.0.0 Safari/537.36"),
                locale="en-US",
                timezone_id="America/New_York",
            )
            page = await context.new_page()

            dt = start_dt
            while dt <= final_dt:
                end_str = dt.strftime("%Y%m%d")
                url = BASE_URL.format(year=year, begin=begin, end=end_str)

                try:
                    html = await goto_and_get_html(page, url, table_selector=table_selector)
                    rows = parse_first_table(html)
                    df = rows_to_dataframe(rows)

                    if not df.empty:
                        df.insert(0, "Date", end_str)

                        # write individual daily file
                        daily_path = os.path.join(output_dir, f"barttorvik_{end_str}.csv")
                        df.to_csv(daily_path, index=False)
                        print(f"✔️  {end_str}: saved {len(df)} rows to {daily_path}")

                        # append to master CSV (header only on the very first write)
                        if first_write:
                            df.to_csv(master_csv, index=False)
                            first_write = False
                        else:
                            df.to_csv(master_csv, mode="a", header=False, index=False)

                        total_rows += len(df)
                    else:
                        print(f"⚠️  {end_str}: no data (empty table)")

                except Exception as e:
                    # Deliberate best-effort: report the failed day and move on.
                    print(f"❌ {end_str}: ERROR {e}")

                dt += timedelta(days=1)
                # Throttle between requests; no need to wait after the last day.
                if dt <= final_dt:
                    await asyncio.sleep(pause_sec)
        finally:
            # Release the browser even if the run is cancelled or interrupted.
            await browser.close()

    print(f"\n✅ Done! {total_rows} total rows saved across days.")

# ---------- Run ----------
# In Jupyter or async environment:
# Scrape the 2025 season one day at a time, with the end date stepping from
# 20241103 through 20250318. Top-level `await` needs a Jupyter/IPython (or
# other async) context. If barttorvik_2025_all.csv already exists, new rows
# are appended to it without a header, so re-running adds to the file.
await scrape_barttorvik_daily(
    year=2025,
    begin="20241103",
    end="20250318",
    output_dir="daily_csvs_2025",
    master_csv="barttorvik_2025_all.csv"
)


✔️  20241103: saved 1 rows to daily_csvs_2025/barttorvik_20241103.csv
✔️  20241104: saved 242 rows to daily_csvs_2025/barttorvik_20241104.csv
✔️  20241105: saved 258 rows to daily_csvs_2025/barttorvik_20241105.csv
✔️  20241106: saved 288 rows to daily_csvs_2025/barttorvik_20241106.csv
✔️  20241107: saved 301 rows to daily_csvs_2025/barttorvik_20241107.csv
✔️  20241108: saved 341 rows to daily_csvs_2025/barttorvik_20241108.csv
✔️  20241109: saved 367 rows to daily_csvs_2025/barttorvik_20241109.csv
✔️  20241110: saved 372 rows to daily_csvs_2025/barttorvik_20241110.csv
✔️  20241111: saved 375 rows to daily_csvs_2025/barttorvik_20241111.csv
✔️  20241112: saved 377 rows to daily_csvs_2025/barttorvik_20241112.csv
✔️  20241113: saved 377 rows to daily_csvs_2025/barttorvik_20241113.csv
✔️  20241114: saved 377 rows to daily_csvs_2025/barttorvik_20241114.csv
✔️  20241115: saved 377 rows to daily_csvs_2025/barttorvik_20241115.csv
✔️  20241116: saved 378 rows to daily_csvs_2025/barttorvik_2024111