In [None]:
import sqlite3
from pathlib import Path

import pandas as pd
from pybaseball import chadwick_register

# Directories holding the processed feature tables and the raw Lahman files.
FEATURES_DIR = Path("../data/processed/features/")
RAW_LAHMAN_DIR = Path("../data/raw/lahman/")

# Hitting feature tables, one per source.
lahman = pd.read_csv(FEATURES_DIR.joinpath("lahman_hitting_2020_2025.csv"))
statcast = pd.read_csv(FEATURES_DIR.joinpath("statcast_hitting_2020_2025.csv"))

# Only the two ID columns are read from People.csv.
# NOTE(review): `people` is not referenced by any later cell visible here —
# confirm it is still needed.
people = pd.read_csv(
    RAW_LAHMAN_DIR.joinpath("People.csv"),
    usecols=["bbrefID", "retroID"],
)

# Chadwick register fetched through pybaseball (the original author's note
# says it is downloaded/loaded once).
register = chadwick_register()

register

In [None]:
# Spot check: pull the register rows whose first/last name is Bryce Harper
# so the ID columns can be inspected by eye.
harper = register.loc[
    register["name_first"].eq("Bryce") & register["name_last"].eq("Harper")
]
harper

Based on these results, `key_mlbam` in the Chadwick register is the same ID as `player_id` in the Statcast data. Therefore, we can merge the Statcast data with the Chadwick register to get each player's `key_bbref` (Baseball-Reference ID), which is then used to join against `playerID` in the Lahman data.

In [None]:
# Statcast -> Chadwick register: attach the register's columns (including
# key_bbref, which the Lahman join needs) to each Statcast row by matching
# Statcast's player_id against the register's key_mlbam.
#
# The guard makes the cell safe to re-run: `statcast` is rebound to the merged
# frame, so merging a second time would make pandas suffix the overlapping
# register columns (key_bbref_x / key_bbref_y) and break the later join on
# key_bbref.
if "key_bbref" not in statcast.columns:
    statcast = statcast.merge(
        register,
        left_on="player_id",
        right_on="key_mlbam",
        how="left",  # keep every Statcast row, even without a register match
    )
statcast

In [None]:
# Bring the Lahman hitting features onto the ID-enriched Statcast rows.
# A left join keeps every Statcast row; rows without a Lahman partner get
# nulls in the Lahman columns.
# NOTE(review): the join uses only the player key. If both tables hold one row
# per player-season, each player's seasons would be paired with every other
# season — confirm whether a season column should be part of the key.
merged = pd.merge(
    statcast,
    lahman,
    how="left",
    left_on="key_bbref",
    right_on="playerID",
)
merged

In [None]:
# Rows where the left join found no Lahman partner (playerID is null).
unmatched = merged.loc[merged["playerID"].isna()]

# Two complementary boolean flags derived from whether column H is populated.
# NOTE(review): presumably H is the Lahman hits column, so a null H means
# "no Lahman row" — confirm, since a missing ID mapping would also land here.
merged["has_mlb_stats"] = merged["H"].notna()
merged["is_rookie_or_prospect"] = merged["H"].isna()

merged

## Persisted Changes

### Parquet

In [17]:
# Write the merged table next to the input feature files. The path is built
# from FEATURES_DIR (defined in the first cell) instead of repeating the
# directory literal, so input and output locations cannot drift apart.
merged.to_parquet(FEATURES_DIR / "merged_hitting_2020_2025.parquet", index=False)


### SQLite

In [18]:
# Persist the merged table to SQLite as table "hitting", replacing any
# previous copy of that table.
db_path = Path("../data/processed/db/merged.db")
# sqlite3.connect does not create missing parent directories, so make sure the
# target directory exists before opening the database file.
db_path.parent.mkdir(parents=True, exist_ok=True)

conn = sqlite3.connect(db_path)
try:
    merged.to_sql("hitting", conn, if_exists="replace", index=False)
finally:
    # Close the connection even if the write raises, so the handle is not leaked.
    conn.close()
