In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from statistic_scraping import fetch_stat_details

In [2]:
# Example usage: fetch a single stat table and preview its first rows.

STAT_ID = "102"   # Driving Accuracy Percentage (per the `stats` mapping; "120" is Scoring Average)
YEAR = 2024       # seasonal stats
TOURNEY = None    # set to a string to filter to a single event

# Fetch the table as a DataFrame.
df = fetch_stat_details(STAT_ID, year=YEAR, tournament_id=TOURNEY)

# Header line followed by the first five rows.
print(f"\nTop 5 for stat {STAT_ID} / year {YEAR} / tourney {TOURNEY}:", df.head(), sep="\n")


Top 5 for stat 102 / year 2024 / tourney None:
   rank playerId     playerName       % Fairways Hit Possible Fairways
0     1    46414      Aaron Rai  72.02%          857             1,190
1     2    36884     Ben Kohles  70.57%          832             1,179
2     3    29535  Brice Garnett  70.51%          471               668
3     4    52513   Carson Young  70.50%          839             1,190
4     5    40026  Daniel Berger  70.42%          745             1,058


In [29]:
# Display name -> stat code for every stat this notebook pulls.
# Codes are kept as strings so that leading zeros (e.g. '02675') survive.
stats = {
    # --- Strokes gained ---
    'SG: Total': '02675',
    'SG: Off the Tee': '02567',
    'SG: Tee to Green': '02674',
    'SG: Approach the Green': '02568',
    'SG: Around the Green': '02569',

    # --- Off the tee ---
    'Total Driving': '129',
    'Driving Distance': '101',
    'Driving Accuracy Percentage': '102',
    'Ball Speed': '02402',

    # --- Approaching the green ---
    'Ball Striking': '158',
    'Greens or Fringe in Regulation': '02437',
    'Proximity to Hole': '331',
    'GIR Percentage from Other than Fairway': '199',
    'Going for the Green': '419',
    'Going for the Green - Hit Green Pct': '486',

    # --- Around the green ---
    'Scrambling': '130',
    'Proximity to Hole (ARG)': '374',
    'Sand Save Percentage': '111',
    'Scrambling from Rough': '363',
    'Scrambling from >30 YDS': '366',

    # --- Putting ---
    'Putting Average': '104',
    'Putts per Round': '119',
    'One-Putt Percentage': '413',
    '3-Putt Avoidance': '426',
    'Putting Inside 10ft': '484',
    'Putts Made per Event Over 10ft': '434',

    # --- Scoring ---
    'Scoring Average': '120',
    'Birdie Average': '156',
    'Birdie or Better Percentage': '352',
    'Bogey Avoidance': '02414',
    'Par 3 Scoring Avg': '142',
    'Par 4 Scoring Avg': '143',
    'Par 5 Scoring Avg': '144',
    'Scoring Differential Field Avg': '02417',
}

In [30]:
# Seasons to pull: 2010 through 2024 inclusive (the range end is exclusive).
YEARS = range(2010, 2025)

# Nested container: stat_dfs[year][stat_name] -> DataFrame for that season.
stat_dfs = {}

for year in YEARS:
    print(f"⏳ Fetching stats for {year} …")
    # Register the year's dict up front so partial results stay reachable
    # if a later fetch in this season raises.
    frames_for_year = stat_dfs[year] = {}
    for stat_name, stat_code in stats.items():
        # tournament_id=None is passed explicitly, as in the example call above.
        df = fetch_stat_details(stat_code, year=year, tournament_id=None)
        # Tag every row with its season.
        df['Year'] = year
        frames_for_year[stat_name] = df
    print(f"✔️  Done {year}")

⏳ Fetching stats for 2010 …
✔️  Done 2010
⏳ Fetching stats for 2011 …
✔️  Done 2011
⏳ Fetching stats for 2012 …
✔️  Done 2012
⏳ Fetching stats for 2013 …
✔️  Done 2013
⏳ Fetching stats for 2014 …
✔️  Done 2014
⏳ Fetching stats for 2015 …
✔️  Done 2015
⏳ Fetching stats for 2016 …
✔️  Done 2016
⏳ Fetching stats for 2017 …
✔️  Done 2017
⏳ Fetching stats for 2018 …
✔️  Done 2018
⏳ Fetching stats for 2019 …
✔️  Done 2019
⏳ Fetching stats for 2020 …
✔️  Done 2020
⏳ Fetching stats for 2021 …
✔️  Done 2021
⏳ Fetching stats for 2022 …
✔️  Done 2022
⏳ Fetching stats for 2023 …
✔️  Done 2023
⏳ Fetching stats for 2024 …
✔️  Done 2024


In [25]:
# First five rows of the 2024 "SG: Around the Green" table, ordered by
# 'Measured Rounds' from highest to lowest.
(
    stat_dfs[2024]["SG: Around the Green"]
    .sort_values(by='Measured Rounds', ascending=False)
    .head()
)

Unnamed: 0,rank,playerId,playerName,Avg,Total SG:ARG,Measured Rounds,Year
44,45,47591,Eric Cole,0.172,16.48,96,2024
22,23,54591,Ben Griffin,0.271,24.643,91,2024
160,160,35532,Tom Hoge,-0.247,-22.479,91,2024
51,52,50188,S.H. Kim,0.164,14.778,90,2024
18,19,30927,Brendon Todd,0.3,26.689,89,2024


In [41]:
import pandas as pd
from functools import reduce
import re

def sanitize(s: str) -> str:
    """Turn an arbitrary label into a token usable inside a column name.

    Each run of one or more non-word characters (anything `\\W` matches:
    spaces, punctuation, symbols) becomes a single underscore, and
    underscores at either end of the result are removed.

    Underscores already present in `s` count as word characters, so they
    are kept as-is and are not merged with a neighbouring replacement.
    A label made only of non-word characters (e.g. '%') comes back as ''.
    """
    underscored = re.sub(r'\W+', '_', s)
    return underscored.strip('_')

# Build one wide table: one row per player, one column per (year, stat, substat).

# Identifier columns used as the join keys for every per-stat table.
# NOTE(review): joining on playerName as well as playerId means a player whose
# name text differs between tables would end up on separate rows — confirm
# names are stable per playerId.
KEYS = ['playerId', 'playerName']

# Columns never treated as stat values. Loop-invariant, so built once here.
drop_meta = set(KEYS + ['rank', 'country'])

# One narrow frame per (year, stat), each with uniquely prefixed value columns.
to_merge = []

for year, stats_dict in stat_dfs.items():
    for stat_name, df in stats_dict.items():
        # 1) every column that is not a key or metadata column is a "substat".
        #    NOTE(review): the 'Year' column added when the tables were fetched
        #    is not in drop_meta, so it is carried along as
        #    `<year>_<stat>_Year`; consider dropping it, since the year is
        #    already encoded in the column name.
        subcols = [c for c in df.columns if c not in drop_meta]
        if not subcols:
            continue  # nothing to pull from this table

        # 2) rename each substat column to Year_Stat_Subcol so names stay
        #    unique once all tables sit side by side
        stat_prefix = f"{year}_{sanitize(stat_name)}"
        col_map = {col: f"{stat_prefix}_{sanitize(col)}" for col in subcols}

        # 3) keep only keys + substats; selecting with a list and renaming
        #    both return new frames, so the source table is left untouched
        to_merge.append(df[KEYS + subcols].rename(columns=col_map))

# reduce() without an initial value raises an opaque TypeError on an empty
# list, so fail early with a message that says what is actually missing.
if not to_merge:
    raise ValueError(
        "No stat tables to merge: stat_dfs is empty or no table has any stat columns."
    )

# 4) fold everything together on the key columns; the outer join keeps every
#    player and leaves NaN where a (year, stat) combination is missing
wide_df = reduce(
    lambda left, right: pd.merge(left, right, on=KEYS, how='outer'),
    to_merge,
)


In [48]:
import os

# Quick size check: (rows, columns) of the merged table.
print(wide_df.shape)

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
yoy_csv_path = 'data/individual_yoy_statistics.csv'
os.makedirs(os.path.dirname(yoy_csv_path), exist_ok=True)

# The frame's index is written as the first column (to_csv default), unlike
# the leaderboard export further down, which passes index=False.
wide_df.to_csv(yoy_csv_path)

(594, 2252)


Work is done for the individual year-long statistics. Let's now scrape the tournament result data.

In [2]:
# Build the tournament leaderboards from 2010 onward and save them to CSV.
#
# NOTE(review): the recorded run of this cell stopped right after
# "→ Season 2010" with "RuntimeError: Cannot use HTMLSession within an
# existing event loop. Use AsyncHTMLSession instead." Presumably the scraper
# module needs to switch to AsyncHTMLSession (as the message suggests) before
# this cell can complete inside a notebook — confirm in
# tournament_result_scraping.

# 1) import the scraper module
import tournament_result_scraping

# 2) call the builder
full_df = tournament_result_scraping.build_full_leaderboards(start_year=2010)

# 3) save first, then preview: only the last expression of a cell is
#    displayed, so head() has to come last to show anything
full_df.to_csv('leaderboards_2010_to_present.csv', index=False)
full_df.head()

→ Season 2010


RuntimeError: Cannot use HTMLSession within an existing event loop. Use AsyncHTMLSession instead.