In [4]:
import pandas as pd
from pybaseball import batting_stats

In [9]:
all_years = []
for year in range(2010, 2025):
    df = batting_stats(year, year, qual=0, league="all")
    df["Season"] = year
    all_years.append(df)
batting = pd.concat(all_years, ignore_index=True)
batting.to_csv("fangraphs_hitters_2010_2024.csv", index=False)


In [10]:
print(batting.shape)
print(batting.columns)
batting.head()

(20503, 320)
Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B',
       ...
       'maxEV', 'HardHit', 'HardHit%', 'Events', 'CStr%', 'CSW%', 'xBA',
       'xSLG', 'xwOBA', 'L-WAR'],
      dtype='object', length=320)


Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1875,2010,Josh Hamilton,TEX,29,133,518,571,186,111,...,,0.0,,0,0.094,0.229,,,,8.4
1,1201,2010,Carl Crawford,TBR,28,154,600,663,184,122,...,,0.0,,0,0.142,0.229,,,,7.7
2,9368,2010,Evan Longoria,TBR,24,151,574,661,169,96,...,,0.0,,0,0.166,0.254,,,,7.5
3,4314,2010,Joey Votto,CIN,26,150,547,648,177,102,...,,0.0,,0,0.133,0.238,,,,6.9
4,1177,2010,Albert Pujols,STL,30,159,587,700,183,101,...,,0.0,,0,0.161,0.22,,,,6.8


In [19]:
batting = pd.read_csv("fangraphs_hitters_2010_2024.csv")
batting = batting[batting['PA'] >= 200].copy()


In [18]:
print(list(batting.columns))

['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B', '3B', 'HR', 'R', 'RBI', 'BB', 'IBB', 'SO', 'HBP', 'SF', 'SH', 'GDP', 'SB', 'CS', 'AVG', 'GB', 'FB', 'LD', 'IFFB', 'Pitches', 'Balls', 'Strikes', 'IFH', 'BU', 'BUH', 'BB%', 'K%', 'BB/K', 'OBP', 'SLG', 'OPS', 'ISO', 'BABIP', 'GB/FB', 'LD%', 'GB%', 'FB%', 'IFFB%', 'HR/FB', 'IFH%', 'BUH%', 'wOBA', 'wRAA', 'wRC', 'Bat', 'Fld', 'Rep', 'Pos', 'RAR', 'WAR', 'Dol', 'Spd', 'wRC+', 'WPA', '-WPA', '+WPA', 'RE24', 'REW', 'pLI', 'phLI', 'PH', 'WPA/LI', 'Clutch', 'FB% (Pitch)', 'FBv', 'SL%', 'SLv', 'CT%', 'CTv', 'CB%', 'CBv', 'CH%', 'CHv', 'SF%', 'SFv', 'KN%', 'KNv', 'XX%', 'PO%', 'wFB', 'wSL', 'wCT', 'wCB', 'wCH', 'wSF', 'wKN', 'wFB/C', 'wSL/C', 'wCT/C', 'wCB/C', 'wCH/C', 'wSF/C', 'wKN/C', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'BsR', 'FA% (sc)', 'FT% (sc)', 'FC% (sc)', 'FS% (sc)', 'FO% (sc)', 'SI% (sc)', 'SL% (sc)', 'CU% (sc)', 'KC% (sc)', 'EP% (sc)', 'CH% (

In [22]:
for col in ['BB%', 'K%']:
    batting[col] = pd.to_numeric(batting[col].astype(str).str.rstrip('%'), errors='coerce') / 100


In [None]:
batting.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5093 entries, 0 to 20502
Columns: 320 entries, IDfg to L-WAR
dtypes: float64(281), int64(35), object(4)
memory usage: 12.5+ MB


In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.float_format', '{:.4f}'.format)

def get_career_stats(player_name, data):
    p = data[data['Name'] == player_name].copy()
    if p.empty:
        return None

    for col in ['BB%', 'K%']:
        s = pd.to_numeric(p[col].astype(str).str.rstrip('%'), errors='coerce')

        if s.quantile(0.98) > 1.5:
            s = s / 100.0
        if s.max() < 0.05:
            s = s * 100.0

        p[col] = s

    totals = p[['PA','HR','R','RBI','SB']].sum(numeric_only=True)
    rates  = p[['BB%','K%','ISO','wOBA']].mean(numeric_only=True)
    out = pd.concat([totals, rates])
    
    return out.round(4)

In [46]:
feature_cols = ['Age','PA','BB%','K%','HR','ISO','wOBA']
target_cols  = ['AVG','SLG','OBP','OPS','wRC+','HR','ISO','wOBA']
batting = batting.sort_values(['Name','Season']).copy()

for col in target_cols:
    batting[f"Next_{col}"] = batting.groupby('Name')[col].shift(-1)

train = batting.dropna(subset=feature_cols + [f"Next_{col}" for col in target_cols])



In [47]:
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

X = train[feature_cols]
Y = train[[f"Next_{col}" for col in target_cols]]

model = MultiOutputRegressor(LinearRegression())
model.fit(X, Y)

0,1,2
,estimator,LinearRegression()
,n_jobs,

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [48]:
def project_next_season(player_name, data, model):
    p = data[data['Name'] == player_name].sort_values('Season')
    if p.empty:
        return None
    
    latest = p.iloc[-1][feature_cols]
    features_df = pd.DataFrame([latest], columns=feature_cols)
    
    preds = model.predict(features_df)[0]  # array of predictions
    
    # Map each prediction back to its stat
    return {col: round(val, 3) for col, val in zip(target_cols, preds)}


In [49]:
# Interactive prompt
player = input("Enter player name: ")

# Get projection
projection = project_next_season(player, batting, model)

# Print results nicely
if projection is None:
    print(f"No data found for {player}")
else:
    print(f"\nProjection for {player} next season:")
    for stat, value in projection.items():
        print(f"{stat:5}: {value}")



Projection for Aaron Judge next season:
AVG  : 0.261
SLG  : 0.556
OBP  : 0.391
OPS  : 0.947
wRC+ : 156.236
HR   : 38.921
ISO  : 0.295
wOBA : 0.402
