In [None]:
# cell 1: imports
import os
import time
import math
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# cell 2: config (EDIT THESE)
# --- user settings: point these at your own files ---
# CSV listing the teams; it needs a 'link' column
CSV_PATH = "/path/to/your/teams.csv"
# directory that receives the individual per-team CSVs
OUTPUT_DIR = "/path/to/output/folder"

# --- request pacing ---
REQUESTS_PER_MIN = 6
# whole seconds to pause between requests (10 when REQUESTS_PER_MIN is 6)
DELAY_SECONDS = math.ceil(60 / REQUESTS_PER_MIN)

# --- page layout ---
# URL template; {teamcode} is filled in for each team
BASE_URL = "https://www.sports-reference.com/cbb/schools/{teamcode}/women/2025-gamelogs.html"
# id attribute of the table to extract from the page
TABLE_ID = "team_game_log"

# cell 3: helper functions

def init_driver(headless=True):
    """Launch a Chrome WebDriver and return it.

    The driver service is built from ``ChromeDriverManager().install()``.
    ``--no-sandbox`` and ``--disable-dev-shm-usage`` are always passed to
    Chrome; ``--headless=new`` is passed first when ``headless`` is truthy.
    """
    chrome_flags = ["--no-sandbox", "--disable-dev-shm-usage"]
    if headless:
        # keep the headless flag ahead of the others
        chrome_flags.insert(0, "--headless=new")

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chromedriver_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chromedriver_service, options=chrome_options)


def safe_filename(name: str) -> str:
    """Reduce ``name`` to a lowercase string usable as a file name.

    Alphanumeric characters and ``-``, ``_``, ``.``, ``(``, ``)`` are kept,
    spaces become underscores, and every other character is dropped.
    """
    allowed_extra = frozenset("-_.() ")

    def is_kept(ch: str) -> bool:
        return ch.isalnum() or ch in allowed_extra

    cleaned = "".join(filter(is_kept, name))
    underscored = cleaned.replace(" ", "_")
    return underscored.lower()

# cell 4: core scraping logic

def scrape_team_gamelog(driver, team_code: str) -> pd.DataFrame | None:
    """
    Load the game-log page for ``team_code`` and return its table as a DataFrame.

    Args:
        driver: Selenium WebDriver used to load and render the page.
        team_code: School code substituted into ``BASE_URL``.

    Returns:
        The first parsed table whose ``id`` is ``TABLE_ID``, or ``None`` when
        the page cannot be loaded, the table does not appear in the DOM within
        20 seconds, or pandas cannot parse a matching table. Every failure is
        reported with ``print`` rather than raised.
    """
    # Local imports keep this function self-contained when cells run separately.
    from io import StringIO

    from selenium.common.exceptions import TimeoutException, WebDriverException

    url = BASE_URL.format(teamcode=team_code)
    print(f"Fetching {url}")

    try:
        driver.get(url)
    except WebDriverException as e:
        # A navigation failure for one team should not abort the whole run.
        print(f"  ! Could not load page for {team_code}: {e}")
        return None

    try:
        # wait for the table to be present in the DOM
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.ID, TABLE_ID))
        )
    except TimeoutException as e:
        print(f"  ! Timed out waiting for table for {team_code}: {e}")
        return None
    except Exception as e:
        # Anything else is not a timeout; report it as what it is.
        print(f"  ! Error while waiting for table for {team_code}: {e}")
        return None

    # The wait only matches a real DOM element, so the table is present as
    # markup in page_source. A table that exists only inside an HTML comment
    # would not satisfy the wait and would not reach this point.
    html = driver.page_source

    try:
        # Wrap in StringIO: passing a literal HTML string to read_html is
        # deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(html), attrs={"id": TABLE_ID})
    except ValueError:
        print(f"  ! read_html could not find table for {team_code}")
        return None

    if not tables:
        print(f"  ! No table parsed for {team_code}")
        return None
    return tables[0]
    
# cell 5: driver loop

def run_scrape(csv_path: str, output_dir: str,
               start_idx: int = 0, end_idx: int | None = None,
               headless: bool = True):
    """
    Iterate over team codes in the CSV and save each #team_game_log table as its own CSV.

    CSV is expected to contain a column named 'link' with the team codes
    (e.g. 'alabama-am', 'albany-ny').
    """
    os.makedirs(output_dir, exist_ok=True)

    teams = pd.read_csv(csv_path)
    if "link" not in teams.columns:
        raise ValueError("CSV must contain a 'link' column with team codes.")

    # optional slice of rows (for resuming)
    if end_idx is None:
        subset = teams.iloc[start_idx:]
    else:
        subset = teams.iloc[start_idx:end_idx]

    driver = init_driver(headless=headless)

    try:
        for idx, row in subset.iterrows():
            team_code = str(row["link"]).strip()

            # skip missing/blank codes
            if not team_code or team_code.lower() == "nan":
                print(f"Skipping row {idx}: empty team code")
                continue

            filename = safe_filename(team_code) + ".csv"
            out_path = os.path.join(output_dir, filename)

            # skip if already scraped
            if os.path.exists(out_path):
                print(f"Already exists, skipping: {out_path}")
                continue

            df = scrape_team_gamelog(driver, team_code)
            if df is not None and not df.empty:
                df.to_csv(out_path, index=False)
                print(f"  -> Saved {out_path}")
            else:
                print(f"  ! No data for {team_code}")

            # rate limiting: ~6 requests per minute
            print(f"Sleeping {DELAY_SECONDS} seconds for rate limit...")
            time.sleep(DELAY_SECONDS)

    finally:
        driver.quit()

In [None]:
# cell 6: run it
# Process every row of CSV_PATH with a headless browser, writing into OUTPUT_DIR.
run_scrape(
    csv_path=CSV_PATH,
    output_dir=OUTPUT_DIR,
    start_idx=0,
    end_idx=None,
    headless=True,
)