In [279]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

In [280]:
START = 2002
END = 2022

In [281]:
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(START, END, qual=200)
    batting.to_csv("batting.csv")

In [282]:
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [283]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
15,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
78,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,,0.0,,0,0.200,0.266,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6861,1698,2010,Gerald Laird,DET,30,89,270,299,56,40,...,,,0.0,,0,0.166,0.252,,,
7019,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,
6655,319,2011,Adam Dunn,CHW,31,122,415,496,66,39,...,,,0.0,,0,0.169,0.295,,,
6962,620,2002,Neifi Perez,KCR,29,145,554,585,131,104,...,,,,,0,0.130,0.187,,,


In [284]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [285]:
null_count = batting.isnull().sum()

In [286]:
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [287]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR
5549,1,2006,Alfredo Amezaga,FLA,28,132,334,378,87,72,...,86,107,113,143,109,63,0,0.188,0.256,2.0
5000,1,2007,Alfredo Amezaga,FLA,29,133,400,448,105,80,...,92,101,112,109,113,75,0,0.175,0.227,1.2
5243,1,2008,Alfredo Amezaga,FLA,30,125,311,337,82,61,...,99,101,101,123,111,64,0,0.178,0.244,
1168,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,118,91,80,65,97,129,0,0.137,0.232,5.1
866,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,112,101,80,90,99,109,0,0.164,0.252,0.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5980,24655,2022,Owen Miller,CLE,25,119,391,433,97,66,...,92,111,99,127,102,82,315,0.191,0.269,
4880,26197,2021,Andrew Vaughn,CHW,23,127,417,469,98,61,...,87,104,116,84,99,110,321,0.185,0.285,0.5
2097,26197,2022,Andrew Vaughn,CHW,24,118,456,497,132,88,...,88,108,108,93,99,105,382,0.205,0.287,
6604,27506,2021,Ha-seong Kim,SDP,25,117,267,298,54,32,...,126,99,59,137,96,88,201,0.216,0.303,2.6


In [288]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [289]:
del batting["Age Rng"]
del batting["Dol"]

In [290]:
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [291]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [292]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=20, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [293]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [294]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
batting.loc[:,selected_columns] = scaler.fit_transform(batting[selected_columns])

In [295]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=8)

In [296]:
predictors = list(selected_columns[sfs.get_support()])

In [297]:
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BABIP',
 'IFH%',
 'WAR',
 'Spd',
 'PH',
 'CB%',
 'Z-Contact%',
 'SwStr%',
 'wGDP',
 'Oppo%',
 'SLG+',
 'LD+%',
 'Pull%+',
 'Soft%+',
 'Hard%+']

In [298]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [299]:
predictions = backtest(batting, rr, predictors)

In [300]:
predictions.shape

(4115, 2)

In [301]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.767180714329271

In [302]:
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    df["war_corr"].fillna(0, inplace=True)
    
    df["war_diff"] = df["WAR"] / df["WAR"].shift(1)
    df["war_diff"].fillna(1, inplace=True)
    df["war_diff"][df["war_diff"] == np.inf] = 1
    
    return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [303]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [304]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [305]:
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]

In [306]:
predictions = backtest(batting, rr, new_predictors)

In [307]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 

2.6747538079398696

In [308]:
pd.Series(rr.coef_, index=new_predictors).sort_values()

Age             -2.685008
WAR             -1.926992
BABIP           -1.564527
SLG+            -1.254824
Soft%+          -1.253514
SwStr%          -1.034504
BU              -1.001154
PH              -0.729490
SO              -0.664919
Z-Contact%      -0.658578
war_diff        -0.589077
wGDP            -0.469103
CB%             -0.258188
Pull%+          -0.232500
LD+%            -0.231191
war_corr        -0.086429
player_season    0.008270
IFH%             0.383520
Oppo%            0.657206
Spd              0.708913
SB               1.048349
IBB              1.812376
Hard%+           2.262003
war_season       3.471276
dtype: float64

In [309]:
diff = predictions["actual"] - predictions["prediction"]

In [310]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [311]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [312]:
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
1585,2158,2008,Greg Dobbs,0.267081,-0.1,0.000816
6448,1818,2007,Bobby Crosby,0.211180,1.3,0.000926
963,13066,2021,Teoscar Hernandez,0.478261,2.1,0.001077
1364,1849,2014,Rickie Weeks Jr.,0.291925,-0.1,0.001349
3432,14854,2021,Mike Yastrzemski,0.360248,1.4,0.001460
...,...,...,...,...,...,...
3165,4810,2007,Brian McCann,0.304348,8.6,6.378203
3249,5631,2010,Matt Kemp,0.211180,8.3,6.380206
3825,1875,2009,Josh Hamilton,0.291925,8.4,6.580501
873,9166,2010,Buster Posey,0.459627,10.1,6.610423
