In [223]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats, chadwick_register

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

In [224]:
# First and last season passed to batting_stats() when the data is downloaded.
START, END = 2002, 2022

In [225]:
# Download the batting data only when no local copy exists; otherwise reuse
# the cached CSV.
# NOTE: the cache file name does not depend on START/END, so an existing
# batting.csv is loaded as-is even if those constants have been changed.
if not os.path.exists("batting.csv"):
    batting = batting_stats(START, END)
    batting.to_csv("batting.csv")
else:
    batting = pd.read_csv("batting.csv", index_col=0)

In [226]:
# Keep only players (grouped by IDfg) that have more than one row in the data.
batting = batting.groupby("IDfg", group_keys=False).filter(lambda seasons: len(seasons) > 1)

In [227]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,,,,,0,0.124,0.164,,,
16,13611,2018,Mookie Betts,BOS,25,136,520,614,180,96,...,0.131,110.6,217.0,0.500,434,0.220,0.270,,,
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,,,,,0,0.135,0.223,,,
64,10155,2013,Mike Trout,LAA,21,157,589,716,190,115,...,,,0.0,,0,0.200,0.266,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2733,1213,2009,Aubrey Huff,- - -,32,150,536,597,129,83,...,,,0.0,,0,0.169,0.246,,,
2681,857,2005,Bernie Williams,NYY,36,141,485,546,121,89,...,,,,,0,0.183,0.255,,,
3036,8585,2009,Yuniesky Betancourt,- - -,27,134,470,508,115,83,...,,,0.0,,0,0.181,0.232,,,
3088,9272,2018,Chris Davis,BAL,32,128,470,522,79,51,...,0.096,111.8,113.0,0.401,282,0.174,0.316,,,


In [228]:
def next_season(player, require_consecutive=False):
    """Attach the following season's WAR to each of one player's rows.

    The rows are sorted by "Season" and a "Next_WAR" column is added holding
    the "WAR" value of the row that follows. The last row gets NaN because
    nothing follows it.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows for a single player; must contain "Season" and "WAR" columns.
    require_consecutive : bool, default False
        With the default, "next" means the next row present in the data, so a
        player missing a season (e.g. rows for 2003 and 2005 only) has the
        2005 WAR recorded as the 2003 row's Next_WAR. Pass True to set
        Next_WAR to NaN whenever the following row is not exactly one season
        later.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``player`` with the extra "Next_WAR" column; the
        input frame is not modified.
    """
    # sort_values returns a new frame, so the column assignment below does not
    # touch the caller's data.
    player = player.sort_values("Season")
    next_war = player["WAR"].shift(-1)

    if require_consecutive:
        # Season gap to the following row; NaN for the final row, which
        # compares False and therefore stays NaN in the target.
        season_gap = player["Season"].shift(-1) - player["Season"]
        next_war = next_war.where(season_gap == 1)

    player["Next_WAR"] = next_war
    return player

# Run next_season on each player's rows (grouped by IDfg) to add the Next_WAR column.
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [229]:
# Number of missing values in each column of batting.
null_count = batting.isnull().sum()

In [230]:
# Keep the columns that have no missing values, plus the Next_WAR target
# (which is added back explicitly).
complete_cols = batting.columns[null_count == 0].tolist()
batting = batting[[*complete_cols, "Next_WAR"]].copy()

In [231]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR
832,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,118,91,80,65,97,129,0,0.137,0.232,5.1
642,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,112,101,80,90,99,109,0,0.164,0.252,-0.2
2398,2,2005,Garret Anderson,LAA,33,142,575,603,163,111,...,126,78,83,100,97,106,0,0.150,0.258,0.1
2311,2,2006,Garret Anderson,LAA,34,141,543,588,152,105,...,113,96,86,88,94,119,0,0.155,0.253,1.4
2034,2,2008,Garret Anderson,LAA,36,145,557,593,163,118,...,121,86,84,94,103,97,0,0.175,0.258,-1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,19755,2022,Shohei Ohtani,LAA,27,138,513,589,136,72,...,95,99,109,54,98,128,373,0.140,0.266,
257,20123,2019,Juan Soto,WSN,20,150,542,659,153,82,...,97,99,106,74,104,106,416,0.161,0.251,2.6
3,20123,2020,Juan Soto,WSN,21,47,154,196,54,27,...,80,125,97,93,86,124,126,0.185,0.250,7.0
76,20123,2021,Juan Soto,WSN,22,151,502,654,157,106,...,77,124,103,97,89,119,414,0.200,0.263,3.0


In [232]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [233]:
# Remove the "Age Rng" and "Dol" columns from batting in place.
batting.drop(columns=["Age Rng", "Dol"], inplace=True)

In [234]:
# Encode the Team strings as integer category codes in a new team_code column.
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [235]:
# Keep a copy that still contains rows with missing values (restored into
# `batting` again later), then drop every row that has any NaN.
batting_full = batting.copy()
batting = batting.dropna()

In [236]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Estimator scored during feature selection. Despite the variable name, this
# is a Ridge regression.
rf = Ridge(alpha=10)

# Cross-validation splitter handed to the selector (two splits).
split = TimeSeriesSplit(n_splits=2)

# Forward sequential selection down to 20 features, run on 8 parallel jobs.
sfs = SequentialFeatureSelector(
    rf,
    n_features_to_select=20,
    direction="forward",
    cv=split,
    n_jobs=8,
)

In [237]:
# Candidate feature columns: everything in batting except the target and the
# two string columns. Identifier-like columns that are not listed here remain
# candidates.
removed_columns = ["Next_WAR", "Name", "Team"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [238]:
# Run the feature selection on every row of batting.
# NOTE(review): the selector's cv is a TimeSeriesSplit, which presumably splits
# by row position; batting comes out of a groupby on IDfg, so rows look to be
# ordered by player rather than by Season — confirm the folds are chronological.
# NOTE(review): the same rows are later evaluated by backtest(), so the selected
# features have already seen the evaluation seasons — confirm this is acceptable.
sfs.fit(batting[selected_columns], batting["Next_WAR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=2, test_size=None),
                          estimator=Ridge(alpha=10), n_features_to_select=20,
                          n_jobs=8)

In [239]:
# Names of the candidate columns that the fitted selector kept.
predictors = list(selected_columns[sfs.get_support()])

In [240]:
predictors

['Season',
 'Age',
 'G',
 '3B',
 'IBB',
 'SO',
 'SH',
 'SB',
 'Pitches',
 'BB/K',
 'IFH%',
 'Pos',
 'WAR',
 'Spd',
 'wCH',
 'wFB/C',
 'BABIP+',
 'Oppo%+',
 'Soft%+',
 'Hard%+']

In [241]:
def backtest(data, model, predictors, start=5, step=1):
    """Walk forward through the seasons, refitting and predicting one at a time.

    The distinct "Season" values are sorted; for each position ``start``,
    ``start + step``, ... in that list, ``model`` is fit on all rows from
    strictly earlier seasons and used to predict that season's rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain "Season", "Next_WAR" and every column in ``predictors``.
    model : object with ``fit(X, y)`` and ``predict(X)``
        Refit on every iteration, so after the call it holds the fit made for
        the last evaluated season.
    predictors : list of str
        Feature columns passed to ``fit`` and ``predict``.
    start : int, default 5
        Position (in the sorted list of seasons) of the first season to
        predict; the seasons before it form the first training set.
    step : int, default 1
        Stride between evaluated positions.

    Returns
    -------
    pandas.DataFrame
        Columns "actual" (the Next_WAR value) and "prediction", indexed like
        the evaluated rows of ``data``.

    Raises
    ------
    ValueError
        If no season is eligible for evaluation (for example when ``data``
        has no more than ``start`` distinct seasons).
    """
    years = sorted(data["Season"].unique())
    positions = range(start, len(years), step)

    # Without this guard the final pd.concat would fail with an unhelpful
    # "No objects to concatenate" ValueError.
    if len(positions) == 0:
        raise ValueError(
            f"No seasons to evaluate: data has {len(years)} distinct season(s) "
            f"but start={start}, step={step}."
        )

    all_predictions = []
    for i in positions:
        current_year = years[i]
        # Train only on seasons before the one being predicted.
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        # Re-attach the test index so predictions line up with the actuals.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [242]:
from xgboost import XGBRegressor

# Gradient-boosted alternative model; it is not used by the cells shown here.
xgb = XGBRegressor(
    random_state=1,
    learning_rate=0.01,
    n_estimators=400,
    n_jobs=8,
    min_child_weight=10,
)

from sklearn.linear_model import Ridge

# Ridge regression is the model handed to backtest() in the following cells.
model = Ridge(alpha=10)

In [243]:
# Baseline backtest using only the selected predictors.
predictions = backtest(batting, model, predictors)

In [244]:
predictions.shape

(1677, 2)

In [245]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

3.3181152407081957

In [246]:
# Start again from the saved frame, dropping rows with missing values.
# dropna() already returns a new frame, so batting_full is left untouched.
batting = batting_full.dropna()

In [247]:
def player_history(df):
    """Add career-position and WAR-trend features to one player's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single player; must contain "Season" and "WAR" columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` sorted by "Season" with two extra columns:

        * "player_season": 0-based position of the row within the sorted rows.
        * "war_corr": correlation between "player_season" and "WAR" computed
          over all rows up to and including the current one; undefined values
          (such as the first row) are replaced with 0.
    """
    df = df.sort_values("Season")

    df["player_season"] = range(df.shape[0])

    # Expanding-window correlation between the season counter and WAR.
    war_corr = df["player_season"].expanding().corr(df["WAR"])

    # Assign the filled result back instead of calling fillna(inplace=True) on
    # df["war_corr"]: that chained in-place form acts on a temporary object
    # under pandas Copy-on-Write and would leave the NaNs in place.
    df["war_corr"] = war_corr.fillna(0)

    return df

# Run player_history on each player's rows (grouped by IDfg).
batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [248]:
def group_averages(df):
    """Return each row's WAR divided by the mean WAR of the given frame."""
    war = df["WAR"]
    return war.div(war.mean())

In [249]:
# Each row's WAR relative to the mean WAR of all rows from the same Season.
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [250]:
# Selected predictors extended with the three engineered history columns.
new_predictors = [*predictors, "player_season", "war_corr", "war_season"]

In [251]:
# Repeat the backtest with the engineered features included.
predictions = backtest(batting, model, new_predictors)

In [252]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 

3.279927246939554

In [253]:
pd.Series(model.coef_, index=new_predictors).sort_values()

war_corr        -0.269479
wFB/C           -0.232165
Age             -0.128756
Season          -0.036820
3B              -0.035068
SH              -0.033161
G               -0.020716
BABIP+          -0.018270
wCH             -0.012253
Soft%+          -0.010531
SO              -0.005326
Pitches          0.000886
Oppo%+           0.004245
SB               0.007195
player_season    0.010647
Hard%+           0.012936
Pos              0.021567
IBB              0.034914
Spd              0.139679
BB/K             0.186804
WAR              0.230630
IFH%             0.570639
war_season       0.612938
dtype: float64