In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment
from pybaseball import playerid_lookup, pitching_stats
import statsapi
from data_scraper import MLB_Scrape
import unicodedata
import io

# —————————————————————————————
# Setup a session with headers to avoid blocks
# —————————————————————————————
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.baseball-reference.com/"
}
session = requests.Session()
session.headers.update(HEADERS)
scraper = MLB_Scrape()

# —————————————————————————————
# Accent-stripping utility
# —————————————————————————————
def strip_accents(text: str) -> str:
    """
    Normalize unicode string, removing any accent marks.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

# —————————————————————————————
# ID Lookup via pybaseball (robust)
# —————————————————————————————
def get_player_id(name: str) -> int:
    name_clean = strip_accents(name)
    parts = name_clean.split()
    last, first = parts[-1], parts[0]
    df = playerid_lookup(last, first)
    if df.empty or pd.isna(df.loc[0, 'key_mlbam']):
        raise ValueError(f"No MLBAM ID found for {name!r}")
    return int(df.loc[0, 'key_mlbam'])

# —————————————————————————————
# Scrape Top-10 finishers for each award year
# —————————————————————————————
def get_top10_cy_young_ids(years: list[int]) -> dict[int, set[int]]:
    top_ids: dict[int, set[int]] = {}
    for yr in years:
        url = f"https://www.baseball-reference.com/awards/awards_{yr}.shtml"
        resp = session.get(url); resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding='utf-8')
        comments = soup.find_all(string=lambda t: isinstance(t, Comment))
        season_ids: set[int] = set()
        for c in comments:
            if "Cy Young Voting" not in c:
                continue
            tbl_html = str(BeautifulSoup(c, "html.parser").find("table"))
            if not tbl_html:
                continue
            df_full = pd.read_html(io.StringIO(tbl_html))[0]
            df_top = df_full.head(10)
            player_cols = [col for col in df_top.columns if "player" in str(col).lower()]
            name_col = player_cols[0] if player_cols else df_top.columns[1]
            for raw_name in df_top[name_col]:
                try:
                    pid = get_player_id(raw_name)
                    season_ids.add(pid)
                except Exception:
                    print(f"Warning: could not lookup ID for {raw_name!r}")
        top_ids[yr] = season_ids
    return top_ids

# —————————————————————————————
# Build the 2-season scrape map
# —————————————————————————————
def build_player_seasons(top_ids: dict[int, set[int]]) -> dict[int, list[int]]:
    ps: dict[int, set[int]] = {}
    for season, pids in top_ids.items():
        for pid in pids:
            ps.setdefault(pid, set()).update({season - 1, season})
    return {pid: sorted(yrs) for pid, yrs in ps.items()}

# —————————————————————————————
# Fetch full name from Stats API
# —————————————————————————————
def get_player_name(pid: int) -> str:
    resp = statsapi.get('people', {'personIds': pid})
    ppl = resp.get('people', [])
    if not ppl:
        raise ValueError(f"No MLB person found for ID {pid}")
    return ppl[0]['fullName']

# —————————————————————————————
# Scrape pitch-level data
# —————————————————————————————
def scrape_pitch_level(player_seasons: dict[int, list[int]]) -> pd.DataFrame:
    all_p = []
    for pid, seasons in player_seasons.items():
        name = get_player_name(pid)
        for yr in seasons:
            games = scraper.get_player_games_list(
                player_id=pid, season=yr,
                start_date=f"{yr}-03-01", end_date=f"{yr}-11-30",
                sport_id=1, game_type=['R'], pitching=True
            )
            if not games:
                continue
            jsons = scraper.get_data(game_list_input=games)
            df_raw = scraper.get_data_df(data_list=jsons)
            df = df_raw.to_pandas() if hasattr(df_raw, 'to_pandas') else df_raw
            df = df[df['pitcher_id'] == pid].copy()
            df['Name'], df['Season'] = name, yr
            all_p.append(df)
    return pd.concat(all_p, ignore_index=True) if all_p else pd.DataFrame()

# —————————————————————————————
# Aggregate to game level
# —————————————————————————————
def aggregate_game_level(pitch_df: pd.DataFrame, feature_cols: list[str]) -> pd.DataFrame:
    return (
        pitch_df
        .groupby(['game_id','game_date','Name','Season'])[feature_cols]
        .mean()
        .reset_index()
        .rename({c: f"{c}_mean" for c in feature_cols}, axis=1)
    )

# —————————————————————————————
# Scrape season totals
# —————————————————————————————
def scrape_season_totals(player_seasons: dict[int, list[int]], qual: int = 0) -> pd.DataFrame:
    recs = []
    name_map = {pid: get_player_name(pid) for pid in player_seasons}
    for pid, seasons in player_seasons.items():
        name = name_map[pid]
        for yr in seasons:
            df_yr = pitching_stats(yr, qual=qual)
            df_f = df_yr[df_yr['Name'] == name].copy()
            df_f['Season'] = yr
            recs.append(df_f)
    return pd.concat(recs, ignore_index=True) if recs else pd.DataFrame()

# —————————————————————————————
# Main execution: split into pre- and award-year datasets
# —————————————————————————————
if __name__ == '__main__':
    award_years = [2022, 2023, 2024]
    prev_years = [yr - 1 for yr in award_years]

    # get mappings
    top_ids_by_year = get_top10_cy_young_ids(award_years)
    player_seasons_map = build_player_seasons(top_ids_by_year)

    # full scrapes
    pitch_df = scrape_pitch_level(player_seasons_map)
    features = [
        'rbi','start_speed','end_speed','sz_top','sz_bot','x','y','ax','ay','az',
        'pfxx','pfxz','px','pz','vx0','vy0','vz0','x0','y0','z0','zone',
        'type_confidence','plate_time','extension','spin_rate','spin_direction','vb','ivb','hb'
    ]
    game_df = aggregate_game_level(pitch_df, features)
    season_df = scrape_season_totals(player_seasons_map)

    # split
    pitch_pre  = pitch_df [pitch_df ['Season'].isin(prev_years)]
    pitch_aw   = pitch_df [pitch_df ['Season'].isin(award_years)]
    game_pre   = game_df  [game_df  ['Season'].isin(prev_years)]
    game_aw    = game_df  [game_df  ['Season'].isin(award_years)]
    season_pre = season_df[season_df['Season'].isin(prev_years)]
    season_aw  = season_df[season_df['Season'].isin(award_years)]

    # outputs
    print("Pitch-level before-year rows:", len(pitch_pre))
    print("Pitch-level award-year rows:", len(pitch_aw))
    print("Game-level before-year rows:", len(game_pre))
    print("Game-level award-year rows:", len(game_aw))
    print("Season-level before-year rows:", len(season_pre))
    print("Season-level award-year rows:", len(season_aw))

Gathering player lookup table. This may take a moment.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 50.26iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 52.14iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 25/25 [00:00<00:00, 50.75iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 49.80iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 49.03iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 49.07iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 34/34 [00:00<00:00, 50.93iteration/s]


Converting Data to Dataframe.
No pitching games found for player 434378 in season 2021
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 50.43iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 55.34iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 48.80iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 52.20iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 41.08iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 49.83iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 44.41iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 58.41iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 51.30iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 45.41iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 31.93iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 56.57iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 16/16 [00:00<00:00, 45.83iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 57.57iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 51.70iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 49.52iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 50.67iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 43.64iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 2/2 [00:00<00:00,  6.85iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 50.45iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 60.94iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 57.66iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 20/20 [00:00<00:00, 40.42iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 52.85iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 52.32iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:01<00:00, 24.26iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:04<00:00,  6.89iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 22/22 [00:00<00:00, 50.48iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 40.86iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 51.36iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 47.00iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 25/25 [00:00<00:00, 40.02iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 53.15iteration/s]


Converting Data to Dataframe.
No pitching games found for player 673540 in season 2022
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 38.04iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 62.50iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 44.39iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 24/24 [00:00<00:00, 51.98iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:01<00:00, 30.39iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 25/25 [00:00<00:00, 51.96iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:01<00:00, 25.41iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 56.86iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 64.99iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 52.28iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 52.16iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 51.29iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 24/24 [00:00<00:00, 44.71iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 61.96iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 20/20 [00:00<00:00, 47.41iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 55.86iteration/s]


Converting Data to Dataframe.
No pitching games found for player 112116 in season 2022
No pitching games found for player 112116 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 54.37iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 57.72iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 26/26 [00:00<00:00, 58.60iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 46.20iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 42.23iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 56.27iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 41.80iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 61/61 [00:00<00:00, 67.33iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 61/61 [00:00<00:00, 64.54iteration/s]


Converting Data to Dataframe.
No pitching games found for player 694973 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 49.03iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 26/26 [00:00<00:00, 46.44iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 53.16iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 20/20 [00:00<00:00, 45.11iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 49.15iteration/s]


Converting Data to Dataframe.
No pitching games found for player 684007 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 47.33iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 22/22 [00:01<00:00, 18.37iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 26/26 [00:05<00:00,  4.53iteration/s]


Converting Data to Dataframe.
No pitching games found for player 671922 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 74/74 [00:05<00:00, 12.90iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 48.87iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:05<00:00,  5.70iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 34.34iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 65/65 [00:01<00:00, 58.05iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 75/75 [00:01<00:00, 65.84iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 74/74 [00:01<00:00, 63.53iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 15/15 [00:00<00:00, 40.84iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 54.53iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:08<00:00,  3.38iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 53.18iteration/s]


Converting Data to Dataframe.
Pitch-level before-year rows: 165614
Pitch-level award-year rows: 178516
Game-level before-year rows: 1946
Game-level award-year rows: 2302
Season-level before-year rows: 69
Season-level award-year rows: 73


In [5]:
pitch_df.to_csv("data/pitch_level.csv", index=False)
game_df.to_csv("data/game_level.csv", index=False)
season_df.to_csv("data/season_totals.csv", index=False)

In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment

# —————————————————————————————
# Helpers for scraping
# —————————————————————————————
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.baseball-reference.com/"
}
session = requests.Session()
session.headers.update(HEADERS)


def scrape_cy_young_winners(seasons: list[int]) -> pd.DataFrame:
    """
    Scrape Baseball‑Reference award pages for Cy Young winners.
    Returns a DataFrame with columns ['Name', 'Season', 'winner_flag'].
    """
    records = []
    for yr in seasons:
        url = f"https://www.baseball-reference.com/awards/awards_{yr}.shtml"
        resp = session.get(url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # BR hides their tables in HTML comments
        comments = soup.find_all(string=lambda t: isinstance(t, Comment))
        # we want the first Cy Young Voting table per league, but the winner is always first row
        for c in comments:
            if "Cy Young Voting" not in c:
                continue
            tbl = BeautifulSoup(c, "html.parser").find("table")
            if tbl is None:
                continue

            # parse the table into pandas
            df = pd.read_html(str(tbl))[0]
            # identify the player column
            player_cols = [col for col in df.columns if "player" in str(col).lower()]
            if player_cols:
                name_col = player_cols[0]
            else:
                # fallback: assume second column is player
                name_col = df.columns[1]

            # grab the first-place finisher
            winner = df[name_col].iloc[0]
            records.append({
                "Name": winner,
                "Season": yr,
                "winner_flag": 1
            })
            # once per table, move on to next comment (other league)
    winners_df = pd.DataFrame.from_records(records)
    return winners_df


if __name__ == "__main__":
    # 1) list all seasons for which you need labels:
    #    (e.g. award years and/or prior years from your pipeline)
    seasons = [2021, 2022, 2023, 2024]

    # 2) load your season_totals to get every (Name, Season) you scraped
    combos = (
        season_df[["Name", "Season"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # 3) scrape winners
    winners_df = scrape_cy_young_winners(seasons)

    # 4) merge and fill zeros for non‑winners
    labels = (
        combos
        .merge(winners_df, on=["Name", "Season"], how="left")
        .fillna({"winner_flag": 0})
    )
    labels["winner_flag"] = labels["winner_flag"].astype(int)

  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]


In [None]:
labels.to_csv('data/cy_winners.csv')