# Data Collection from PGA Website
This file is used to scrape data from PGA official website

In [28]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re


In [29]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

def scrape_espn_player_stats(season: int) -> pd.DataFrame:
    """
    Scrape ESPN 'PGA TOUR Player Stats {season}' page.

    Returns a DataFrame with:
    NAME, Season, EARNINGS, CUP, EVNTS, RNDS, CUTS, TOP10, WINS,
    SCORE, DDIS, DACC, GIR, PUTTS, SAND, BIRDS
    """
    url = f"https://www.espn.com/golf/stats/player/_/season/{season}"
    print(f"Scraping {season} from {url}")

    resp = requests.get(url, headers=HEADERS)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    # ---------- 1) Get raw text lines ----------
    text = soup.get_text("\n")
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]

    # This header line appears right before the stat rows.
    # In the flattened text it looks like this (no spaces):
    # EARNINGSCUPEVNTSRNDSCUTSTOP10WINSSCOREDDISDACCGIRPUTTSSANDBIRDS
    header_compact = "EARNINGSCUPEVNTSRNDSCUTSTOP10WINSSCOREDDISDACCGIRPUTTSSANDBIRDS"

    stats_start = None
    for i, ln in enumerate(lines):
        if ln.replace(" ", "") == header_compact:
            stats_start = i + 1  # stats start on next line
            break

    if stats_start is None:
        raise ValueError(f"Could not find stats header for season {season}.")

    # ---------- 2) Collect stats lines ----------
    stat_lines = []
    for ln in lines[stats_start:]:
        if ln.startswith("Show More"):
            break  # end of table
        if not ln.startswith("$"):
            continue  # skip non-stat lines
        tokens = ln.split()
        # Should be: $EARNINGS CUP EVNTS RNDS CUTS TOP10 WINS SCORE DDIS DACC GIR PUTTS SAND BIRDS
        if len(tokens) != 14:
            continue
        stat_lines.append(tokens)

    if not stat_lines:
        raise ValueError(f"No stat rows found for season {season}.")

    # ---------- 3) Collect player names ----------
    # Player links have URLs like /golf/player/_/id/3599/brian-stuard
    player_links = soup.find_all("a", href=re.compile(r"/golf/player/_/id/"))
    names = []
    for a in player_links:
        name = a.get_text(strip=True)
        if name and name not in names:
            names.append(name)

    # We only have stats for the first N players; align lengths just in case
    n = min(len(names), len(stat_lines))
    names = names[:n]
    stat_lines = stat_lines[:n]

    # ---------- 4) Build DataFrame ----------
    stat_cols = [
        "EARNINGS", "CUP", "EVNTS", "RNDS", "CUTS", "TOP10", "WINS",
        "SCORE", "DDIS", "DACC", "GIR", "PUTTS", "SAND", "BIRDS"
    ]

    records = []
    for name, row in zip(names, stat_lines):
        rec = {
            "NAME": name,
            "Season": season,
        }
        rec["EARNINGS"] = row[0]    # '$x,xxx'
        # map remaining 13 stats
        for col, val in zip(stat_cols[1:], row[1:]):
            rec[col] = val
        records.append(rec)

    df = pd.DataFrame.from_records(records)
    return df



In [30]:
def scrape_espn_range(start_season: int, end_season: int) -> pd.DataFrame:
    """
    Scrape multiple seasons from ESPN and stack into one DataFrame.
    """
    all_frames = []
    for season in range(start_season, end_season + 1):
        try:
            df_season = scrape_espn_player_stats(season)
            all_frames.append(df_season)
        except Exception as e:
            print(f"Season {season} failed: {e}")

    if not all_frames:
        raise RuntimeError("No seasons were successfully scraped.")

    combined = pd.concat(all_frames, ignore_index=True)
    return combined


In [31]:
espn_recent = scrape_espn_range(2019, 2025)

# Save to your project raw-data folder
espn_recent.to_csv("data/raw-data/espn_pga_stats_2019_2025_top_players.csv", index=False)

espn_recent.head()


Scraping 2019 from https://www.espn.com/golf/stats/player/_/season/2019
Season 2019 failed: Could not find stats header for season 2019.
Scraping 2020 from https://www.espn.com/golf/stats/player/_/season/2020
Season 2020 failed: Could not find stats header for season 2020.
Scraping 2021 from https://www.espn.com/golf/stats/player/_/season/2021
Season 2021 failed: Could not find stats header for season 2021.
Scraping 2022 from https://www.espn.com/golf/stats/player/_/season/2022
Season 2022 failed: Could not find stats header for season 2022.
Scraping 2023 from https://www.espn.com/golf/stats/player/_/season/2023
Season 2023 failed: Could not find stats header for season 2023.
Scraping 2024 from https://www.espn.com/golf/stats/player/_/season/2024
Season 2024 failed: Could not find stats header for season 2024.
Scraping 2025 from https://www.espn.com/golf/stats/player/_/season/2025
Season 2025 failed: Could not find stats header for season 2025.


RuntimeError: No seasons were successfully scraped.