In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment
from pybaseball import playerid_lookup, pitching_stats
import statsapi
from data_scraper import MLB_Scrape
import unicodedata
import io

# —————————————————————————————
# Setup a session with headers to avoid blocks
# —————————————————————————————
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.baseball-reference.com/"
}
session = requests.Session()
session.headers.update(HEADERS)
scraper = MLB_Scrape()

# —————————————————————————————
# Accent-stripping utility
# —————————————————————————————
def strip_accents(text: str) -> str:
    """
    Normalize unicode string, removing any accent marks.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', text)
        if unicodedata.category(c) != 'Mn'
    )

# —————————————————————————————
# ID Lookup via pybaseball (robust)
# —————————————————————————————
def get_player_id(name: str) -> int:
    """
    Lookup the MLBAM personId for a full name via pybaseball.playerid_lookup(),
    stripping accents so names like 'José' resolve correctly.
    """
    name_clean = strip_accents(name)
    parts = name_clean.split()
    last, first = parts[-1], parts[0]
    df = playerid_lookup(last, first)
    if df.empty or pd.isna(df.loc[0, 'key_mlbam']):
        raise ValueError(f"No MLBAM ID found for {name!r}")
    return int(df.loc[0, 'key_mlbam'])

# —————————————————————————————
# Scrape Top-10 finishers for each award year
# —————————————————————————————
def get_top10_cy_young_ids(years: list[int]) -> dict[int, set[int]]:
    """
    For each year, scrape the AL & NL Cy Young voting tables and return:
      { year: {mlbam_id, ...} }
    """
    top_ids: dict[int, set[int]] = {}
    for yr in years:
        url = f"https://www.baseball-reference.com/awards/awards_{yr}.shtml"
        resp = session.get(url)
        resp.raise_for_status()
        # parse with correct encoding to avoid mojibake
        soup = BeautifulSoup(resp.content, "html.parser", from_encoding='utf-8')

        # grab hidden tables from HTML comments
        comments = soup.find_all(string=lambda t: isinstance(t, Comment))
        season_ids: set[int] = set()

        for c in comments:
            if "Cy Young Voting" not in c:
                continue
            tbl_html = str(BeautifulSoup(c, "html.parser").find("table"))
            if not tbl_html:
                continue

            df_full = pd.read_html(io.StringIO(tbl_html))[0]
            df = df_full.head(10)

            # find the column header that contains “player”
            player_cols = [col for col in df.columns if "player" in str(col).lower()]
            name_col = player_cols[0] if player_cols else df.columns[1]

            for raw_name in df[name_col]:
                try:
                    pid = get_player_id(raw_name)
                    season_ids.add(pid)
                except Exception as e:
                    print(f"⚠️  Warning: could not lookup ID for {raw_name!r}: {e}")

        top_ids[yr] = season_ids
    return top_ids

# —————————————————————————————
# Build the 2-season scrape map
# —————————————————————————————
def build_player_seasons(top_ids: dict[int, set[int]]) -> dict[int, list[int]]:
    """
    Convert {season: {pids}} into {pid: [season-1, season, ...]}
    """
    ps: dict[int, set[int]] = {}
    for season, pids in top_ids.items():
        for pid in pids:
            ps.setdefault(pid, set()).update({season - 1, season})
    return {pid: sorted(yrs) for pid, yrs in ps.items()}

# —————————————————————————————
# Fetch full name from Stats API
# —————————————————————————————
def get_player_name(pid: int) -> str:
    """
    Fetch fullName from MLB Stats API for a given mlbam playerID.
    """
    resp = statsapi.get('people', {'personIds': pid})
    ppl = resp.get('people', [])
    if not ppl:
        raise ValueError(f"No MLB person found for ID {pid}")
    return ppl[0]['fullName']

# —————————————————————————————
# Scrape pitch-level data
# —————————————————————————————
def scrape_pitch_level(player_seasons: dict[int, list[int]]) -> pd.DataFrame:
    """
    Scrape pitch-by-pitch data for each playerID in each desired season.
    Returns a pandas.DataFrame with columns including
    ['game_id','pitcher_id','pitch_type','release_speed','spin_rate',...,'Name','Season'].
    """
    all_p = []
    for pid, seasons in player_seasons.items():
        name = get_player_name(pid)
        for yr in seasons:
            games = scraper.get_player_games_list(
                player_id=pid, season=yr,
                start_date=f"{yr}-03-01", end_date=f"{yr}-11-30",
                sport_id=1, game_type=['R'], pitching=True
            )
            if not games:
                continue

            jsons = scraper.get_data(game_list_input=games)
            df_raw = scraper.get_data_df(data_list=jsons)
            df = df_raw.to_pandas() if hasattr(df_raw, 'to_pandas') else df_raw
            df = df[df['pitcher_id'] == pid].copy()
            df['Name'], df['Season'] = name, yr
            all_p.append(df)

    return pd.concat(all_p, ignore_index=True) if all_p else pd.DataFrame()

# —————————————————————————————
# Aggregate to game level
# —————————————————————————————
def aggregate_game_level(pitch_df: pd.DataFrame,
                         feature_cols: list[str]) -> pd.DataFrame:
    """
    Compute per-game means of your pitch-level features.
    """
    agg = (
        pitch_df
        .groupby(['game_id','game_date','Name','Season'])[feature_cols]
        .mean()
        .reset_index()
        .rename({c: f"{c}_mean" for c in feature_cols}, axis=1)
    )
    return agg

# —————————————————————————————
# Scrape season totals
# —————————————————————————————
def scrape_season_totals(player_seasons: dict[int, list[int]],
                         qual: int = 0) -> pd.DataFrame:
    """
    For each player & season, pull full-season totals from pybaseball.
    Returns a DataFrame with the standard pitching stats plus 'Name' & 'Season'.
    """
    recs = []
    # prefetch names
    name_map = {pid: get_player_name(pid) for pid in player_seasons}
    for pid, seasons in player_seasons.items():
        name = name_map[pid]
        for yr in seasons:
            df_yr = pitching_stats(yr, qual=qual)
            df_f = df_yr[df_yr['Name'] == name].copy()
            df_f['Season'] = yr
            recs.append(df_f)

    return pd.concat(recs, ignore_index=True) if recs else pd.DataFrame()

# —————————————————————————————
# Main execution
# —————————————————————————————
if __name__ == '__main__':
    award_years = [2022, 2023, 2024]
    top_ids_by_year = get_top10_cy_young_ids(award_years)
    player_seasons_map = build_player_seasons(top_ids_by_year)

    print("Top-10 Cy Young finishers by year (MLBAM IDs):")
    for yr, ids in top_ids_by_year.items():
        print(f" {yr}: {sorted(ids)}")

    print("\nFinal scrape map (playerID -> [years…]):")
    for pid, yrs in player_seasons_map.items():
        name = get_player_name(pid)
        print(f" {pid} ({name}): {yrs}")

    pitch_df = scrape_pitch_level(player_seasons_map)
    print("Pitch-level rows:", len(pitch_df))

    features = [
        'rbi','start_speed','end_speed','sz_top','sz_bot','x','y','ax','ay','az',
        'pfxx','pfxz','px','pz','vx0','vy0','vz0','x0','y0','z0','zone',
        'type_confidence','plate_time','extension','spin_rate','spin_direction','vb','ivb','hb'
    ]
    game_df = aggregate_game_level(pitch_df, features)
    print("Game-level rows:", len(game_df))

    season_df = scrape_season_totals(player_seasons_map)
    print("Season-level rows:", len(season_df))


Top-10 Cy Young finishers by year (MLBAM IDs):
 2022: [434378, 506433, 543037, 592332, 605400, 608331, 656302, 657140, 660271, 663556, 664285, 666201, 668678, 669203, 669456]
 2023: [112116, 543037, 543243, 554430, 592332, 605135, 605483, 621107, 657006, 657277, 664285, 668678, 669203, 669923, 673540, 675911, 680694]
 2024: [489446, 519242, 554430, 579328, 607625, 656302, 657277, 661403, 664285, 664854, 666142, 668881, 669203, 669302, 669373, 671922, 684007, 694973]

Final scrape map (playerID -> [years…]):
 506433 (Yu Darvish): [2021, 2022]
 663556 (Shane McClanahan): [2021, 2022]
 668678 (Zac Gallen): [2021, 2022, 2023]
 434378 (Justin Verlander): [2021, 2022]
 608331 (Max Fried): [2021, 2022]
 592332 (Kevin Gausman): [2021, 2022, 2023]
 656302 (Dylan Cease): [2021, 2022, 2023, 2024]
 660271 (Shohei Ohtani): [2021, 2022]
 669456 (Shane Bieber): [2021, 2022]
 669203 (Corbin Burnes): [2021, 2022, 2023, 2024]
 657140 (Kyle Wright): [2021, 2022]
 605400 (Aaron Nola): [2021, 2022]
 666201

Processing: 100%|██████████| 30/30 [00:00<00:00, 38.56iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 39.47iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 25/25 [00:00<00:00, 38.69iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 42.29iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 40.01iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 43.86iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 34/34 [00:00<00:00, 41.36iteration/s]


Converting Data to Dataframe.
No pitching games found for player 434378 in season 2021
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 44.38iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 47.69iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 47.65iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 43.08iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 45.39iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 43.10iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 41.34iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 41.62iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 50.38iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 44.31iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 40.88iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 52.33iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 16/16 [00:00<00:00, 34.95iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:01<00:00, 28.58iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 43.20iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 47.97iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 38.48iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 48.16iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 2/2 [00:00<00:00,  5.70iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 43.72iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 47.74iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 40.66iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 20/20 [00:00<00:00, 34.53iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 41.52iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 44.20iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 43.65iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 35.67iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 22/22 [00:00<00:00, 39.93iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 44.56iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 44.55iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 28/28 [00:00<00:00, 42.28iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 25/25 [00:00<00:00, 48.11iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 42.60iteration/s]


Converting Data to Dataframe.
No pitching games found for player 673540 in season 2022
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 44.81iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 45.68iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:01<00:00, 31.76iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 24/24 [00:00<00:00, 46.51iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 48.85iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 25/25 [00:00<00:00, 42.21iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 38.55iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 48.73iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 47.95iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 51.01iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 50.81iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 52.17iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 24/24 [00:00<00:00, 37.59iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 51.16iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 20/20 [00:00<00:00, 47.56iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 40.40iteration/s]


Converting Data to Dataframe.
No pitching games found for player 112116 in season 2022
No pitching games found for player 112116 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 40.55iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 30/30 [00:00<00:00, 51.87iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 26/26 [00:00<00:00, 58.02iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 44.41iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 52.22iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 53.07iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 53.42iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 61/61 [00:01<00:00, 47.87iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 61/61 [00:01<00:00, 52.07iteration/s]


Converting Data to Dataframe.
No pitching games found for player 694973 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 23/23 [00:00<00:00, 40.28iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 26/26 [00:00<00:00, 39.38iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 42.04iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 20/20 [00:00<00:00, 33.65iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 45.45iteration/s]


Converting Data to Dataframe.
No pitching games found for player 684007 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 47.02iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 22/22 [00:00<00:00, 31.86iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 26/26 [00:00<00:00, 41.20iteration/s]


Converting Data to Dataframe.
No pitching games found for player 671922 in season 2023
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 74/74 [00:01<00:00, 54.55iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 38.42iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 54.67iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 33/33 [00:00<00:00, 47.77iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 65/65 [00:01<00:00, 36.21iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 75/75 [00:01<00:00, 46.14iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 74/74 [00:01<00:00, 60.50iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 15/15 [00:00<00:00, 33.61iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 31/31 [00:00<00:00, 52.93iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 29/29 [00:00<00:00, 44.64iteration/s]


Converting Data to Dataframe.
This May Take a While. Progress Bar shows Completion of Data Retrieval.


Processing: 100%|██████████| 32/32 [00:00<00:00, 63.97iteration/s]


Converting Data to Dataframe.
Pitch-level rows: 210062
Game-level rows: 2646
Season-level rows: 87


In [29]:
pitch_df.to_csv("pitch_level.csv", index=False)
game_df.to_csv("game_level.csv", index=False)
season_df.to_csv("season_totals.csv", index=False)

In [30]:
import requests
import pandas as pd
from bs4 import BeautifulSoup, Comment

# —————————————————————————————
# Helpers for scraping
# —————————————————————————————
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.baseball-reference.com/"
}
session = requests.Session()
session.headers.update(HEADERS)


def scrape_cy_young_winners(seasons: list[int]) -> pd.DataFrame:
    """
    Scrape Baseball‑Reference award pages for Cy Young winners.
    Returns a DataFrame with columns ['Name', 'Season', 'winner_flag'].
    """
    records = []
    for yr in seasons:
        url = f"https://www.baseball-reference.com/awards/awards_{yr}.shtml"
        resp = session.get(url)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")

        # BR hides their tables in HTML comments
        comments = soup.find_all(string=lambda t: isinstance(t, Comment))
        # we want the first Cy Young Voting table per league, but the winner is always first row
        for c in comments:
            if "Cy Young Voting" not in c:
                continue
            tbl = BeautifulSoup(c, "html.parser").find("table")
            if tbl is None:
                continue

            # parse the table into pandas
            df = pd.read_html(str(tbl))[0]
            # identify the player column
            player_cols = [col for col in df.columns if "player" in str(col).lower()]
            if player_cols:
                name_col = player_cols[0]
            else:
                # fallback: assume second column is player
                name_col = df.columns[1]

            # grab the first-place finisher
            winner = df[name_col].iloc[0]
            records.append({
                "Name": winner,
                "Season": yr,
                "winner_flag": 1
            })
            # once per table, move on to next comment (other league)
    winners_df = pd.DataFrame.from_records(records)
    return winners_df


if __name__ == "__main__":
    # 1) list all seasons for which you need labels:
    #    (e.g. award years and/or prior years from your pipeline)
    seasons = [2021, 2022, 2023, 2024]

    # 2) load your season_totals to get every (Name, Season) you scraped
    season_df = pd.read_csv("season_totals.csv")
    combos = (
        season_df[["Name", "Season"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    # 3) scrape winners
    winners_df = scrape_cy_young_winners(seasons)

    # 4) merge and fill zeros for non‑winners
    labels = (
        combos
        .merge(winners_df, on=["Name", "Season"], how="left")
        .fillna({"winner_flag": 0})
    )
    labels["winner_flag"] = labels["winner_flag"].astype(int)

    # 5) save
    labels.to_csv("cy_young_labels.csv", index=False)
    print("Written cy_young_labels.csv with", len(labels), "rows")


  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]


Written cy_young_labels.csv with 86 rows


  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
  df = pd.read_html(str(tbl))[0]
