In [40]:
import os
import pandas as pd
from pybaseball.lahman import download_lahman, fielding as fielding_download, batting as batting_download
import numpy as np

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

In [41]:
download_lahman()

batting = batting_download()
batting = batting[batting["yearID"] >= 1992].copy()

In [42]:
fielding = fielding_download()
fielding = fielding[fielding["yearID"] >= 1992].copy()

for col in ["SB", "CS", "stint", "teamID", "G", "lgID"]:
    del fielding[col]

In [43]:
batting = batting.merge(fielding, on=["playerID", "yearID"])

In [44]:
batting = batting[batting["HR"] > 5].copy()

In [45]:
batting = batting.groupby("playerID", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [46]:
batting

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,POS,GS,InnOuts,PO,A,E,DP,PB,WP,ZR
17,alomaro01,1992,1,TOR,AL,152,571,105,177,27,...,2B,149.0,3830.0,287,378,5.0,66,,,
19,aloumo01,1992,1,MON,NL,115,341,53,96,28,...,OF,80.0,2245.0,170,6,4.0,1,,,
27,anderbr01,1992,1,BAL,AL,159,623,100,169,28,...,OF,158.0,4248.0,382,10,8.0,6,,,
31,anthoer01,1992,1,HOU,NL,137,440,45,105,15,...,OF,110.0,2710.0,173,6,5.0,0,,,
48,baergca01,1992,1,CLE,AL,161,657,92,205,32,...,2B,160.0,4302.0,400,475,19.0,138,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62177,youngan02,2021,1,ARI,NL,58,91,13,19,7,...,3B,0.0,8.0,0,0,1.0,0,,,
62178,youngan02,2021,1,ARI,NL,58,91,13,19,7,...,OF,1.0,15.0,1,0,1.0,0,,,
62183,zimmebr01,2021,1,CLE,AL,99,299,44,68,9,...,OF,84.0,2280.0,187,1,1.0,0,,,
62187,zimmery01,2021,1,WAS,NL,110,255,27,62,16,...,1B,45.0,1202.0,338,15,0.0,30,,,


In [47]:
from pybaseball import chadwick_register, batting_stats

register = chadwick_register()

Gathering player lookup table. This may take a moment.


In [34]:
war_info = batting_stats(1992,2022)

KeyError: "['playerID'] not in index"

In [48]:
war_info = war_info[["WAR", "Season", "IDfg"]]

In [49]:
batting = batting.merge(register[["key_fangraphs", "key_bbref"]], left_on="playerID", right_on="key_bbref")

In [52]:
batting = batting.merge(war_info, left_on=["key_fangraphs", "yearID"], right_on=["IDfg", "Season"])

In [53]:
batting

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,E,DP,PB,WP,ZR,key_fangraphs,key_bbref,WAR,Season,IDfg
0,alomaro01,1992,1,TOR,AL,152,571,105,177,27,...,5.0,66,,,,860,alomaro01,6.1,1992,860
1,alomaro01,1993,1,TOR,AL,153,589,109,192,35,...,14.0,92,,,,860,alomaro01,5.7,1993,860
2,alomaro01,1994,1,TOR,AL,107,392,78,120,25,...,4.0,71,,,,860,alomaro01,2.0,1994,860
3,alomaro01,1995,1,TOR,AL,130,517,71,155,24,...,4.0,84,,,,860,alomaro01,2.2,1995,860
4,alomaro01,1996,1,BAL,AL,153,588,132,193,43,...,11.0,107,,,,860,alomaro01,5.2,1996,860
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5931,smithpa04,2021,1,ARI,NL,145,498,68,133,27,...,2.0,25,,,,19892,smithpa04,0.2,2021,19892
5932,smithpa04,2021,1,ARI,NL,145,498,68,133,27,...,2.0,0,,,,19892,smithpa04,0.2,2021,19892
5933,uriaslu01,2021,1,MIL,NL,150,490,77,122,25,...,1.0,5,,,,16622,uriaslu01,2.1,2021,16622
5934,uriaslu01,2021,1,MIL,NL,150,490,77,122,25,...,9.0,8,,,,16622,uriaslu01,2.1,2021,16622


In [54]:
for col in ["key_fangraphs", "key_bbref", "Season", "IDfg"]:
    del batting[col]

In [55]:
def clean_stats(df):
    info_cols = ["teamID", "lgID", "POS"]
    new_df = df[info_cols].iloc[0:1,:]
    
    numeric_cols = list(df.columns[df.dtypes != "object"])
    new_df[numeric_cols] = df[numeric_cols].sum()
    return new_df

batting = batting.groupby(["playerID", "yearID"]).apply(clean_stats)

In [56]:
batting = batting.reset_index(0).droplevel(0)

In [57]:
def next_season(player):
    player = player.sort_values("yearID")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("playerID", group_keys=False).apply(next_season)

In [59]:
null_count = batting.isnull().sum()

In [60]:
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [61]:
batting

Unnamed: 0,playerID,teamID,lgID,POS,yearID,stint,G,AB,R,H,...,InnOuts,PO,A,E,DP,PB,WP,ZR,WAR,Next_WAR
1220,abbotku01,FLO,NL,SS,1994.0,1.0,101.0,345.0,41.0,86.0,...,2476.0,162.0,260.0,15.0,61.0,0.0,0.0,0.0,0.5,1.8
1221,abbotku01,FLO,NL,SS,1995.0,1.0,120.0,420.0,60.0,107.0,...,2925.0,149.0,290.0,19.0,66.0,0.0,0.0,0.0,1.8,
1951,abreubo01,PHI,NL,OF,1998.0,1.0,151.0,497.0,68.0,155.0,...,3781.0,272.0,17.0,8.0,0.0,0.0,0.0,0.0,6.5,6.3
1952,abreubo01,PHI,NL,OF,1999.0,1.0,152.0,546.0,118.0,183.0,...,3803.0,260.0,8.0,3.0,0.0,0.0,0.0,0.0,6.3,6.9
1953,abreubo01,PHI,NL,OF,2000.0,1.0,154.0,576.0,103.0,182.0,...,3992.0,337.0,13.0,4.0,2.0,0.0,0.0,0.0,6.9,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4269,zobribe01,CHN,NL,1B,6054.0,3.0,417.0,1365.0,201.0,417.0,...,2922.0,229.0,126.0,1.0,36.0,0.0,0.0,0.0,9.6,15.2
4240,zobribe01,TBA,AL,1B,8040.0,4.0,604.0,2164.0,308.0,516.0,...,3884.0,356.0,130.0,4.0,36.0,0.0,0.0,0.0,15.2,13.6
4265,zobribe01,CHN,NL,1B,8064.0,4.0,588.0,2092.0,376.0,568.0,...,3795.0,221.0,251.0,7.0,52.0,0.0,0.0,0.0,13.6,43.5
4235,zobribe01,TBA,AL,1B,10045.0,5.0,760.0,2505.0,455.0,745.0,...,3628.0,270.0,247.0,7.0,55.0,0.0,0.0,0.0,43.5,30.0


In [62]:
batting.dtypes[batting.dtypes == "object"]

playerID    object
teamID      object
lgID        object
POS         object
dtype: object

In [63]:
for col in ["teamID", "lgID", "POS"]:
    batting[f"{col}_code"] = batting[col].astype("category").cat.codes

In [64]:
batting.columns

Index(['playerID', 'teamID', 'lgID', 'POS', 'yearID', 'stint', 'G', 'AB', 'R',
       'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP',
       'SH', 'SF', 'GIDP', 'GS', 'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP',
       'ZR', 'WAR', 'Next_WAR', 'teamID_code', 'lgID_code', 'POS_code'],
      dtype='object')

In [65]:
batting_full = batting.copy()
batting = batting.dropna()

In [66]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rf = RandomForestRegressor(n_estimators=10, random_state=1, min_samples_split=100)

split = TimeSeriesSplit(n_splits=2)

sfs = SequentialFeatureSelector(rf, 
                                n_features_to_select=15, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [67]:
removed_columns = ["Next_WAR", "Name", "teamID", "playerID", "lgID", "POS"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [69]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=2, test_size=None),
                          estimator=RandomForestRegressor(min_samples_split=100,
                                                          n_estimators=10,
                                                          random_state=1),
                          n_features_to_select=15, n_jobs=8)

In [24]:
predictors = list(selected_columns[sfs.get_support()])

In [25]:
predictors

['yearID', '3B', 'HR', 'CS', 'GIDP', 'InnOuts', 'A', 'E', 'PB', 'WP']

In [70]:
def backtest(data, model, predictors, start=15, step=2):
    all_predictions = []
    
    years = sorted(data["yearID"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["yearID"] < current_year]
        test = data[data["yearID"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=1, learning_rate=.01, n_estimators=200, n_jobs=8, min_child_weight=250)

predictions = backtest(batting, xgb, predictors)

In [None]:
predictions.shape

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

In [30]:
batting = batting_full.copy().dropna()

In [None]:
def player_history(df):
    df = df.sort_values("yearID")
        
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "HR"]].expanding().corr().droplevel(0).loc["player_season","WAR"])
    df["war_corr"].fillna(0, inplace=True)
    df["war_prev"] = df["WAR"] / (df["WAR"].shift(1) + .01)
    
    return df.iloc[1:,]

batting = batting.groupby("playerID", group_keys=False).apply(player_history)

In [None]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [None]:
batting["war_season"] = batting.groupby("yearID", group_keys=False).apply(group_averages)
batting["war_pos"] = batting.groupby("POS", group_keys=False).apply(group_averages)

In [None]:
new_predictors = predictors + ["player_season", "war_corr", "war_prev", "war_season", "war_pos"]

In [None]:
predictions = backtest(batting, xgb, new_predictors)

In [None]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 