In [19]:
import pandas as pd
from pathlib import Path
from pybaseball import chadwick_register

FEATURES_DIR = Path("../data/processed/features/")
RAW_LAHMAN_DIR = Path("../data/raw/lahman/")

# Load features
lahman = pd.read_csv(FEATURES_DIR / "lahman_hitting_2020_2025.csv")
statcast = pd.read_csv(FEATURES_DIR / "statcast_hitting_2020_2025.csv")
people = pd.read_csv(RAW_LAHMAN_DIR / "People.csv", usecols=["bbrefID","retroID"])
register = chadwick_register()  # downloads/loads once

register

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,Bradley,Jed,605152,bradj002,bradlje01,13166,2016.0,2016.0
1,Barrios,Manuel,110625,barrm002,barrima01,1000605,1997.0,1998.0
2,Martin,Frank,118336,martf102,martifr01,1008165,1897.0,1899.0
3,Browne,Byron,111603,browb101,brownby01,1001500,1965.0,1972.0
4,Jackson,John,800935,jackj702,,-1,,
...,...,...,...,...,...,...,...,...
25896,Page,Vance,120175,pagev101,pageva01,1009957,1938.0,1941.0
25897,Wilkinson,Roy,124274,wilkr101,wilkiro01,1013961,1918.0,1922.0
25898,Demaree,Al,113250,demaa101,demaral01,1003187,1912.0,1919.0
25899,McKee,Frank,118767,mckef101,mckeefr01,1008583,1884.0,1884.0


In [20]:
# Test for player Bryce Harper
harper = register[(register["name_first"] == "Bryce") & (register["name_last"] == "Harper")]
harper

Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
20119,Harper,Bryce,547180,harpb003,harpebr03,11579,2012.0,2025.0


Based on these results, "key_mlbam" in the Chadwick register is the same ID as "Player_id" in the Statcast data. Therefore, we can merge the Statcast data with the Chadwick register to get the "bbrefID" for each player, which can then be used to merge with the Lahman data.

In [21]:
# Statcast -> People to get Bbref ID
statcast = statcast.merge(register, left_on="player_id", right_on="key_mlbam", how="left")
statcast

Unnamed: 0,"last_name, first_name",player_id,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,avg_hit_speed,ev50,fbld,gb,...,brl_pa,year,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,4.2,2020,Merrifield,Whit,593160,merrw001,merriwh01,11281,2016.0,2024.0
1,"Turner, Trea",607208,199,9.5,34.2,111.2,90.5,100.6,93.3,88.7,...,7.3,2020,Turner,Trea,607208,turnt001,turnetr01,16252,2015.0,2025.0
2,"Lindor, Francisco",596019,197,13.5,36.5,111.4,89.9,99.7,93.3,86.8,...,4.1,2020,Lindor,Francisco,596019,lindf001,lindofr01,12916,2015.0,2025.0
3,"Alberto, Hanser",593643,193,13.2,32.6,103.8,82.3,93.5,88.4,79.5,...,0.9,2020,Alberto,Hanser,593643,albeh001,alberha01,11902,2015.0,2023.0
4,"Machado, Manny",592518,191,15.6,34.6,115.7,90.2,102.7,94.1,88.2,...,8.3,2020,Machado,Manny,592518,machm001,machama01,11493,2012.0,2025.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1497,"Caballero, José",676609,222,17.4,36.5,114.4,86.0,97.8,90.4,81.4,...,3.5,2025,Caballero,José,676609,cabaj001,cabaljo01,23401,2023.0,2025.0
1498,"Gorman, Nolan",669357,219,22.3,38.8,111.6,89.6,100.5,93.8,82.6,...,5.0,2025,Gorman,Nolan,669357,gormn001,gormano01,22263,2022.0,2025.0
1499,"Bleday, JJ",668709,217,20.5,29.5,109.9,88.7,99.5,93.2,86.9,...,4.9,2025,Bleday,JJ,668709,bledj001,bledajj01,26368,2022.0,2025.0
1500,"Cowser, Colton",681297,199,13.8,32.7,113.7,90.9,101.6,96.2,85.7,...,7.8,2025,Cowser,Colton,681297,cowsc001,cowseco01,29591,2023.0,2025.0


In [22]:
# Merge the merged data with Lahman to get complete player information
merged = statcast.merge(lahman, left_on="key_bbref", right_on="playerID", how="left")
merged

Unnamed: 0,"last_name, first_name",player_id,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,avg_hit_speed,ev50,fbld,gb,...,GIDP,PA,AVG,OBP,TB,SLG,OPS,HR_rate,BB_rate,K_rate
0,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,3.0,265.0,0.282258,0.324528,109.0,0.439516,0.764044,0.033962,0.045283,0.124528
1,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,12.0,720.0,0.277108,0.316667,262.0,0.394578,0.711245,0.013889,0.055556,0.143056
2,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,11.0,550.0,0.250000,0.298182,189.0,0.375000,0.673182,0.020000,0.069091,0.154545
3,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,15.0,592.0,0.272395,0.317568,209.0,0.382084,0.699652,0.018581,0.060811,0.170608
4,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,7.0,335.0,0.221843,0.311178,92.0,0.313993,0.625171,0.011940,0.104478,0.155224
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6321,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,0.0,20.0,0.294118,0.400000,11.0,0.647059,1.047059,0.100000,0.100000,0.450000
6322,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,11.0,402.0,0.285714,0.365672,151.0,0.431429,0.797100,0.024876,0.101990,0.186567
6323,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,4.0,183.0,0.319277,0.371585,80.0,0.481928,0.853512,0.032787,0.065574,0.256831
6324,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,16.0,517.0,0.243011,0.317215,176.0,0.378495,0.695709,0.025145,0.090909,0.261122


In [23]:
unmatched = merged[merged["playerID"].isna()]

merged['has_mlb_stats'] = merged['H'].notna()
merged['is_rookie_or_prospect'] = ~merged['has_mlb_stats']

merged

Unnamed: 0,"last_name, first_name",player_id,attempts,avg_hit_angle,anglesweetspotpercent,max_hit_speed,avg_hit_speed,ev50,fbld,gb,...,AVG,OBP,TB,SLG,OPS,HR_rate,BB_rate,K_rate,has_mlb_stats,is_rookie_or_prospect
0,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,0.282258,0.324528,109.0,0.439516,0.764044,0.033962,0.045283,0.124528,True,False
1,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,0.277108,0.316667,262.0,0.394578,0.711245,0.013889,0.055556,0.143056,True,False
2,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,0.250000,0.298182,189.0,0.375000,0.673182,0.020000,0.069091,0.154545,True,False
3,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,0.272395,0.317568,209.0,0.382084,0.699652,0.018581,0.060811,0.170608,True,False
4,"Merrifield, Whit",593160,216,15.8,39.8,105.8,86.1,95.9,89.6,81.7,...,0.221843,0.311178,92.0,0.313993,0.625171,0.011940,0.104478,0.155224,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6321,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,0.294118,0.400000,11.0,0.647059,1.047059,0.100000,0.100000,0.450000,True,False
6322,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,0.285714,0.365672,151.0,0.431429,0.797100,0.024876,0.101990,0.186567,True,False
6323,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,0.319277,0.371585,80.0,0.481928,0.853512,0.032787,0.065574,0.256831,True,False
6324,"Stephenson, Tyler",663886,187,12.9,42.2,107.9,90.5,100.9,94.9,85.0,...,0.243011,0.317215,176.0,0.378495,0.695709,0.025145,0.090909,0.261122,True,False


## Persisted Changes

### Parquet

In [24]:
merged.to_parquet("../data/processed/features/merged_hitting_2020_2025.parquet", index=False)


### SQLite

In [25]:
import sqlite3
conn = sqlite3.connect("../data/processed/db/merged.db")
merged.to_sql("hitting", conn, if_exists="replace", index=False)
conn.close()
