In [6]:
import sqlite3
import pandas as pd
from pathlib import Path

# Relative path to the SQLite file holding the Lahman tables queried below.
DB_PATH = Path("../data/processed/db/lahman.db")

# sqlite3.connect() creates a brand-new, empty database when the target file is
# missing, which would only surface later as a confusing "no such table" error.
# Fail fast with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Lahman database not found at {DB_PATH.resolve()}")

conn = sqlite3.connect(DB_PATH)
conn

<sqlite3.Connection at 0x12a11f010>

In [None]:
# Aggregate batting lines to one row per (playerID, yearID) for 2020-2025.
# A player can have several Batting rows in one year, so the counting stats
# are summed; the People join attaches first/last names.
query = """
SELECT 
    b.playerID,
    b.yearID,
    p.nameFirst,
    p.nameLast,
    SUM(b.G)  AS G,
    SUM(b.AB) AS AB,
    SUM(b.R)  AS R,
    SUM(b.H)  AS H,
    SUM(b.HR) AS HR,
    SUM(b.RBI) AS RBI,
    SUM(b.BB) AS BB,
    SUM(b.SO) AS SO,
    SUM(b.SB) AS SB
FROM Batting b
JOIN People p USING(playerID)
WHERE b.yearID BETWEEN 2020 AND 2025
GROUP BY b.playerID, b.yearID
"""

lahman_hitters = pd.read_sql_query(query, conn)

# Preview as the cell's final expression so it is actually rendered.
# (A stray bare `lahman` name used to follow this line; it is undefined and
# raised NameError when the cell was run on a fresh kernel.)
lahman_hitters.head()

Unnamed: 0,playerID,yearID,nameFirst,nameLast,G,AB,R,H,HR,RBI,BB,SO,SB
0,abadfe01,2021,Fernando,Abad,16,0,0,0,0,0.0,0,0.0,0.0
1,abadfe01,2023,Fernando,Abad,6,0,0,0,0,0.0,0,0.0,0.0
2,abbotan01,2023,Andrew,Abbott,21,0,0,0,0,0.0,0,0.0,0.0
3,abbotan01,2024,Andrew,Abbott,25,0,0,0,0,0.0,0,0.0,0.0
4,abbotco01,2021,Cory,Abbott,8,3,0,1,0,0.0,0,1.0,0.0


In [8]:
# Keep only player-seasons with at least one at-bat so the rate stats below
# never divide by zero; .copy() detaches the result from lahman_hitters.
has_at_bats = lahman_hitters["AB"] > 0
df = lahman_hitters.loc[has_at_bats].copy()

# Derived per-at-bat rates, appended in the order AVG, HR_rate, BB_rate.
df = df.assign(
    AVG=lambda d: d["H"] / d["AB"],
    HR_rate=lambda d: d["HR"] / d["AB"],
    # simple plate discipline proxy
    BB_rate=lambda d: d["BB"] / (d["AB"] + d["BB"]),
)

preview_cols = ["nameFirst", "nameLast", "yearID", "AB", "H", "HR", "AVG", "HR_rate", "BB_rate"]
df[preview_cols].head()

Unnamed: 0,nameFirst,nameLast,yearID,AB,H,HR,AVG,HR_rate,BB_rate
4,Cory,Abbott,2021,3,1,0,0.333333,0.0,0.0
7,CJ,Abrams,2022,284,70,2,0.246479,0.007042,0.017301
8,CJ,Abrams,2023,563,138,18,0.245115,0.031972,0.053782
9,CJ,Abrams,2024,541,133,20,0.245841,0.036969,0.068847
19,Jose,Abreu,2020,240,76,19,0.316667,0.079167,0.069767


In [9]:
# Directory for derived feature tables; create it (and any parents) if absent.
FEATURES_DIR = Path("../data/processed/features/")
FEATURES_DIR.mkdir(exist_ok=True, parents=True)

# Write the hitting features without the DataFrame index column.
output_path = FEATURES_DIR.joinpath("lahman_hitting_2020_2025.csv")
df.to_csv(output_path, index=False)

# Show where the file was written.
output_path

PosixPath('../data/processed/features/lahman_hitting_2020_2025.csv')

In [11]:
# Load the Statcast hitting export that sits alongside the Lahman feature file.
statcast_path = FEATURES_DIR / "statcast_hitting_2020_2025.csv"
statcast = pd.read_csv(statcast_path)

# Only the last expression of a cell is rendered, so the previous mid-cell
# `statcast.head()` was a discarded no-op and has been removed. The column
# listing remains the cell's output.
statcast.columns

Index(['last_name, first_name', 'player_id', 'attempts', 'avg_hit_angle',
       'anglesweetspotpercent', 'max_hit_speed', 'avg_hit_speed', 'ev50',
       'fbld', 'gb', 'max_distance', 'avg_distance', 'avg_hr_distance',
       'ev95plus', 'ev95percent', 'barrels', 'brl_percent', 'brl_pa', 'year'],
      dtype='object')