In [None]:
import pandas as pd
from pathlib import Path
from pybaseball import chadwick_register

FEATURES_DIR = Path("../data/processed/features/")
RAW_LAHMAN_DIR = Path("../data/raw/lahman/")

# Load features
lahman = pd.read_csv(FEATURES_DIR / "lahman_hitting_2020_2025.csv")
statcast = pd.read_csv(FEATURES_DIR / "statcast_hitting_2020_2025.csv")
people = pd.read_csv(RAW_LAHMAN_DIR / "People.csv", usecols=["bbrefID","retroID"])
register = chadwick_register()  # downloads/loads once

register

Gathering player lookup table. This may take a moment.


In [4]:
# Test for player Bryce Harper
harper = register[(register["name_first"] == "Bryce") & (register["name_last"] == "Harper")]
harper

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
20119,Harper,Bryce,547180,harpb003,harpebr03,11579,2012.0,2025.0


Based on these results, "key_mlbam" in the Chadwick register is the same ID as "Player_id" in the Statcast data. Therefore, we can merge the Statcast data with the Chadwick register to get the "bbrefID" for each player, which can then be used to merge with the Lahman data.

In [6]:
# Statcast -> People to get Bbref ID
statcast = statcast.merge(register, left_on="player_id", right_on="key_mlbam", how="left")
statcast

Unnamed: 0,"last_name, first_name",player_id,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,avg_hit_speed,ev50,fbld,gb,...,brl_pa,year,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,4.2,2020,Merrifield,Whit,593160,merrw001,merriwh01,11281,2016.0,2024.0
1,"Turner, Trea",607208,199,9.5,34.2,111.2,90.5,100.6,93.3,88.7,...,7.3,2020,Turner,Trea,607208,turnt001,turnetr01,16252,2015.0,2025.0
2,"Lindor, Francisco",596019,197,13.5,36.5,111.4,89.9,99.7,93.3,86.8,...,4.1,2020,Lindor,Francisco,596019,lindf001,lindofr01,12916,2015.0,2025.0
3,"Alberto, Hanser",593643,193,13.2,32.6,103.8,82.3,93.5,88.4,79.5,...,0.9,2020,Alberto,Hanser,593643,albeh001,alberha01,11902,2015.0,2023.0
4,"Machado, Manny",592518,191,15.6,34.6,115.7,90.2,102.7,94.1,88.2,...,8.3,2020,Machado,Manny,592518,machm001,machama01,11493,2012.0,2025.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1497,"Caballero, José",676609,222,17.4,36.5,114.4,86.0,97.8,90.4,81.4,...,3.5,2025,Caballero,José,676609,cabaj001,cabaljo01,23401,2023.0,2025.0
1498,"Gorman, Nolan",669357,219,22.3,38.8,111.6,89.6,100.5,93.8,82.6,...,5.0,2025,Gorman,Nolan,669357,gormn001,gormano01,22263,2022.0,2025.0
1499,"Bleday, JJ",668709,217,20.5,29.5,109.9,88.7,99.5,93.2,86.9,...,4.9,2025,Bleday,JJ,668709,bledj001,bledajj01,26368,2022.0,2025.0
1500,"Cowser, Colton",681297,199,13.8,32.7,113.7,90.9,101.6,96.2,85.7,...,7.8,2025,Cowser,Colton,681297,cowsc001,cowseco01,29591,2023.0,2025.0


In [9]:
# Merge the merged data with Lahman to get complete player information
merged = statcast.merge(lahman, left_on="key_bbref", right_on="playerID", how="left")
merged

Unnamed: 0,"last_name, first_name",player_id,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,avg_hit_speed,ev50,fbld,gb,...,R,H,HR,RBI,BB,SO,SB,AVG,HR_rate,BB_rate
0,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,38.0,70.0,9.0,30.0,12.0,33.0,12.0,0.282258,0.036290,0.046154
1,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,97.0,184.0,10.0,74.0,40.0,103.0,40.0,0.277108,0.015060,0.056818
2,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,70.0,126.0,11.0,58.0,38.0,85.0,16.0,0.250000,0.021825,0.070111
3,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,66.0,149.0,11.0,67.0,36.0,101.0,26.0,0.272395,0.020110,0.061750
4,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,44.0,65.0,4.0,15.0,35.0,52.0,17.0,0.221843,0.013652,0.106707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6321,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,4.0,5.0,2.0,6.0,2.0,9.0,0.0,0.294118,0.117647,0.105263
6322,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,56.0,100.0,10.0,45.0,41.0,75.0,0.0,0.285714,0.028571,0.104859
6323,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,24.0,53.0,6.0,35.0,12.0,47.0,1.0,0.319277,0.036145,0.067416
6324,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,59.0,113.0,13.0,56.0,47.0,135.0,0.0,0.243011,0.027957,0.091797


In [13]:
unmatched = merged[merged["playerID"].isna()]

unmatched



Unnamed: 0,"last_name, first_name",player_id,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,avg_hit_speed,ev50,fbld,gb,...,R,H,HR,RBI,BB,SO,SB,AVG,HR_rate,BB_rate
3140,"Greene, Riley",682985,259,2.8,32.0,112.1,89.5,102.1,95.6,85.1,...,,,,,,,,,,
4059,"Call, Alex",669743,302,17.0,29.5,108.5,86.8,97.4,90.8,84.0,...,,,,,,,,,,
4177,"Greene, Riley",682985,266,6.6,36.8,114.4,91.6,103.0,96.1,86.9,...,,,,,,,,,,
4750,"Smith, Josh",669701,407,15.4,33.7,108.0,87.8,98.3,91.7,84.5,...,,,,,,,,,,
4926,"Greene, Riley",682985,358,12.2,39.1,114.0,91.3,102.1,95.3,86.7,...,,,,,,,,,,
5601,"Wilson, Jacob",805779,450,8.8,32.0,108.5,84.6,95.1,87.5,83.2,...,,,,,,,,,,
5643,"Ramírez, Agustín",682663,428,8.3,31.1,116.9,90.8,103.9,95.4,88.8,...,,,,,,,,,,
5747,"Greene, Riley",682985,404,15.0,36.9,115.1,89.9,102.6,94.9,84.7,...,,,,,,,,,,
5751,"Durbin, Caleb",702332,402,13.9,28.6,108.9,85.2,96.1,89.2,83.8,...,,,,,,,,,,
5765,"Smith, Josh",669701,399,12.7,33.6,108.2,87.2,97.4,89.9,85.1,...,,,,,,,,,,
