In [3]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [4]:
START = 2004
END = 2024

In [5]:
batting = batting_stats(START, END, qual=200)

LOADING THE DATA

In [6]:
batting.to_csv("batting.csv")

In [7]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [8]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,0,0.124,0.164,,,,11.9
1,15640,2024,Aaron Judge,NYY,32,158,559,704,180,85,...,117.5,238.0,0.609,391,0.146,0.267,,,,11.3
4,15640,2022,Aaron Judge,NYY,30,157,570,696,177,87,...,118.4,246.0,0.609,404,0.169,0.287,,,,11.4
152,25764,2024,Bobby Witt Jr.,KCR,24,161,636,709,211,123,...,116.9,259.0,0.481,538,0.138,0.236,,,,10.0
10,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,110.6,217.0,0.500,434,0.220,0.270,,,,10.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6505,8585,2009,Yuniesky Betancourt,- - -,27,134,470,508,115,83,...,,0.0,,0,0.181,0.232,,,,-2.4
7068,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,111.8,113.0,0.401,282,0.174,0.316,,,,-2.6
6530,45,2012,Rod Barajas,PIT,36,104,321,361,66,44,...,,0.0,,0,0.147,0.258,,,,-2.6
6683,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,0.0,,0,0.169,0.295,,,,-2.9


CREATING A MACHINE LEARNING TARGET

In [9]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

  batting = batting.groupby("IDfg", group_keys=False).apply(next_season)


In [10]:
batting[["Name", "Season", "WAR", "Next_WAR"]]

Unnamed: 0,Name,Season,WAR,Next_WAR
5512,Alfredo Amezaga,2006,1.1,2.0
4936,Alfredo Amezaga,2007,2.0,1.2
5185,Alfredo Amezaga,2008,1.2,
2481,Garret Anderson,2004,0.8,-0.2
4094,Garret Anderson,2005,-0.2,0.1
...,...,...,...,...
1029,Seiya Suzuki,2024,3.6,
5110,Zach Neto,2023,1.1,3.5
3017,Zach Neto,2024,3.5,
2425,Masataka Yoshida,2023,0.6,0.8


CLEANING THE DATA

In [11]:
null_count = batting.isnull().sum()

In [12]:
null_count

IDfg           0
Season         0
Name           0
Team           0
Age            0
            ... 
xBA         6792
xSLG        6792
xwOBA       6792
L-WAR          0
Next_WAR    1215
Length: 321, dtype: int64

In [13]:
complete_cols = list(batting.columns[null_count == 0])

In [14]:
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [15]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR
5512,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,107,113,143,109,63,0,0.188,0.256,1.1,2.0
4936,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,101,112,109,113,75,0,0.175,0.227,2.0,1.2
5185,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,101,101,123,111,64,0,0.178,0.244,1.2,
2481,2,2004,Garret Anderson,ANA,32,112,442,475,133,98,...,103,75,78,106,98,0,0.176,0.270,0.8,-0.2
4094,2,2005,Garret Anderson,LAA,33,142,575,603,163,111,...,78,83,100,97,106,0,0.150,0.258,-0.2,0.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,30116,2024,Seiya Suzuki,CHC,29,132,512,585,145,91,...,124,92,88,90,122,356,0.206,0.294,3.9,
5110,31347,2023,Zach Neto,LAA,22,84,289,329,65,39,...,82,103,81,108,97,216,0.161,0.290,1.1,3.5
3017,31347,2024,Zach Neto,LAA,23,155,542,602,135,77,...,85,82,85,101,105,407,0.164,0.289,3.4,
2425,31837,2023,Masataka Yoshida,BOS,29,140,537,580,155,104,...,102,117,123,99,91,458,0.212,0.285,0.6,0.8


In [16]:
batting.dtypes

IDfg          int64
Season        int64
Name         object
Team         object
Age           int64
             ...   
Events        int64
CStr%       float64
CSW%        float64
L-WAR       float64
Next_WAR    float64
Length: 131, dtype: object

In [17]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [18]:
batting["Dol"]

5512      $5.5
4936     $11.2
5185      $7.2
2481      $3.4
4094    ($1.2)
         ...  
1029     $28.4
5110      $8.9
3017     $27.9
2425      $4.9
2579      $6.4
Name: Dol, Length: 6792, dtype: object

In [19]:
del batting["Dol"]

In [20]:
batting["Age Rng"]

5512    28 - 28
4936    29 - 29
5185    30 - 30
2481    32 - 32
4094    33 - 33
         ...   
1029    29 - 29
5110    22 - 22
3017    23 - 23
2425    29 - 29
2579    30 - 30
Name: Age Rng, Length: 6792, dtype: object

In [21]:
del batting["Age Rng"]

In [22]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [23]:
batting_full = batting.copy()
batting = batting.dropna().copy()

SELECTING USEFUL FEATURES

In [24]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=20, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [25]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [27]:
batting.describe()

Unnamed: 0,IDfg,Season,Age,G,AB,PA,H,1B,2B,3B,...,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,L-WAR,Next_WAR,team_code
count,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,...,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0,5577.0
mean,7040.263583,2013.232921,0.356354,0.648221,0.475212,0.476626,0.359968,0.285242,0.388086,0.09989,...,0.402155,0.411375,0.509788,0.468073,0.228809,0.498106,0.552471,0.339167,1.775238,0.479126
std,6178.38068,5.71179,0.14562,0.255769,0.241272,0.260829,0.181155,0.137825,0.167089,0.104146,...,0.130893,0.121221,0.132873,0.133259,0.293145,0.136282,0.121862,0.127086,1.958694,0.305069
min,1.0,2004.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-3.1,0.0
25%,1679.0,2008.0,0.230769,0.470085,0.27518,0.256055,0.211207,0.174528,0.254237,0.043478,...,0.315789,0.331461,0.42029,0.375,0.0,0.408511,0.471366,0.246667,0.4,0.205882
50%,5227.0,2013.0,0.346154,0.709402,0.496403,0.5,0.362069,0.278302,0.372881,0.086957,...,0.398496,0.404494,0.507246,0.472222,0.0,0.493617,0.550661,0.32,1.4,0.470588
75%,11368.0,2018.0,0.461538,0.871795,0.683453,0.704152,0.50431,0.382075,0.508475,0.130435,...,0.488722,0.488764,0.594203,0.555556,0.477462,0.591489,0.634361,0.406667,2.9,0.735294
max,31837.0,2023.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11.2,1.0


In [28]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

In [29]:
predictors = list(selected_columns[sfs.get_support()])

In [30]:
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'IFH',
 'BU',
 'BB%',
 'ISO',
 'BABIP',
 'WAR',
 'Spd',
 'PH',
 'WPA/LI',
 'CB%',
 'O-Contact%',
 'Def',
 'Soft%',
 'LD+%',
 'Oppo%+',
 'Hard%+']

MAKING PREDICTIONS WITH MACHINE LEARNING

In [31]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [32]:
predictions = backtest(batting, rr, predictors)

In [33]:
predictions.shape

(4131, 2)

In [34]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.6572887445466122

In [35]:
batting["Next_WAR"].describe()

count    5577.000000
mean        1.775238
std         1.958694
min        -3.100000
25%         0.400000
50%         1.400000
75%         2.900000
max        11.200000
Name: Next_WAR, dtype: float64

In [36]:
2.6572887445466122 ** .5

1.6301192424318574

IMPROVING ACCURACY

In [None]:
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    df["war_corr"].fillna(0, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [38]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [39]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

  batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)


In [40]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [41]:
predictions = backtest(batting, rr, new_predictors)

In [42]:
mean_squared_error(predictions["actual"], predictions["prediction"])

2.6214500384465405

DIAGNOSING ISSUES WITH THE MODELS

In [43]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.903316
WAR             -1.932126
BABIP           -1.692800
BU              -1.362915
ISO             -1.304615
Soft%           -0.906878
PH              -0.685577
SO              -0.503037
CB%             -0.497896
war_diff        -0.355573
LD+%            -0.348379
WPA/LI          -0.181569
war_corr        -0.138593
player_season    0.019889
Def              0.246210
IFH              0.412690
O-Contact%       0.468907
Oppo%+           0.623830
BB%              0.756562
Spd              0.778685
SB               1.065257
IBB              1.099024
Hard%+           2.533072
war_season       3.160473
dtype: float64

In [44]:
diff = predictions["actual"] - predictions["prediction"]

In [None]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [51]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [53]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
2113,1825,2009,David DeJesus,0.446667,2.5,0.000591
7001,17696,2021,Kevin Newman,0.180000,1.2,0.000846
4326,13590,2022,Jesse Winker,0.253333,1.3,0.001048
4001,7007,2014,Yadier Molina,0.420000,2.4,0.001103
1698,9368,2016,Evan Longoria,0.486667,2.4,0.001407
...,...,...,...,...,...,...
790,9166,2010,Buster Posey,0.466667,9.8,6.553429
409,15640,2021,Aaron Judge,0.573333,11.1,7.266878
82,15640,2023,Aaron Judge,0.520000,11.2,7.492557
2431,11579,2014,Bryce Harper,0.313333,9.3,7.511512
