In [233]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats, chadwick_register

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

In [234]:
START = 1992
END = 2022

In [None]:
batting = batting_stats(START, END)
batting.to_csv("batting.csv")

In [206]:
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(START, END)
    batting.to_csv("batting.csv")

In [207]:
from pybaseball.lahman import download_lahman, fielding as fielding_download

download_lahman()

fielding = fielding_download()
fielding = fielding[fielding["yearID"] >= START].copy()

for col in ["G", "SB", "CS"]:
    del fielding[col]

In [208]:
def clean_stats(df):
    info_cols = ["teamID", "lgID", "POS"]
    new_df = df[info_cols].iloc[0:1,:].copy()
    
    numeric_cols = list(df.columns[df.dtypes != "object"])
    new_df[numeric_cols] = df[numeric_cols].sum()
    return new_df

fielding = fielding.groupby(["playerID", "yearID"]).apply(clean_stats)

In [209]:
fielding = fielding.reset_index(0).droplevel(0)

In [210]:
register = chadwick_register()

Gathering player lookup table. This may take a moment.


In [211]:
batting = batting.merge(register[["key_fangraphs", "key_bbref"]], left_on="IDfg", right_on="key_fangraphs")

In [212]:
batting = batting.merge(fielding, left_on=["key_bbref", "Season"], right_on=["playerID", "yearID"])

In [213]:
batting = batting.groupby("playerID", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [214]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,stint,GS,InnOuts,PO,A,E,DP,PB,WP,ZR
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,1.0,133.0,3345.0,241.0,4.0,8.0,2.0,0.0,0.0,0.0
1,1109,2004,Barry Bonds,SFG,39,147,373,617,135,60,...,1.0,132.0,3392.0,214.0,11.0,4.0,0.0,0.0,0.0,0.0
2,1109,2003,Barry Bonds,SFG,38,130,390,550,133,65,...,1.0,122.0,3132.0,236.0,5.0,2.0,2.0,0.0,0.0,0.0
3,13611,2016,Mookie Betts,BOS,23,158,672,730,214,136,...,1.0,157.0,4145.0,346.0,14.0,1.0,4.0,0.0,0.0,0.0
4,13611,2019,Mookie Betts,BOS,26,150,597,706,176,102,...,1.0,142.0,3820.0,320.0,10.0,2.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1962,6319,2008,Ryan Garko,CLE,27,141,495,563,135,99,...,1.0,121.0,3176.0,1039.0,80.0,4.0,123.0,0.0,0.0,0.0
1968,2231,2006,Mike Jacobs,FLA,25,136,469,520,123,65,...,1.0,120.0,2916.0,931.0,57.0,7.0,101.0,0.0,0.0,0.0
1969,2231,2008,Mike Jacobs,FLA,27,141,477,519,118,57,...,1.0,119.0,2782.0,825.0,62.0,11.0,67.0,0.0,0.0,0.0
1983,869,2002,Roger Cedeno,NYM,27,149,511,562,133,105,...,1.0,125.0,3262.0,225.0,2.0,8.0,0.0,0.0,0.0,0.0


In [215]:
for col in ["key_fangraphs", "key_bbref", "yearID", "playerID", "teamID"]:
    del batting[col]

In [216]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [217]:
null_count = batting.isnull().sum()

In [218]:
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [219]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,GS,InnOuts,PO,A,E,DP,PB,WP,ZR,Next_WAR
916,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,147.0,3830.0,302.0,7.0,2.0,3.0,0.0,0.0,0.0,5.1
915,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,144.0,3724.0,326.0,13.0,1.0,2.0,0.0,0.0,0.0,-0.2
919,2,2005,Garret Anderson,LAA,33,142,575,603,163,111,...,106.0,2760.0,201.0,4.0,5.0,1.0,0.0,0.0,0.0,0.1
918,2,2006,Garret Anderson,LAA,34,141,543,588,152,105,...,94.0,2438.0,192.0,1.0,0.0,0.0,0.0,0.0,0.0,1.4
917,2,2008,Garret Anderson,LAA,36,145,557,593,163,118,...,80.0,2068.0,144.0,9.0,0.0,2.0,0.0,0.0,0.0,-1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,19611,2019,Vladimir Guerrero Jr.,TOR,20,123,464,514,126,83,...,94.0,2473.0,66.0,182.0,17.0,23.0,0.0,0.0,0.0,0.2
504,19611,2020,Vladimir Guerrero Jr.,TOR,21,60,221,243,58,34,...,34.0,897.0,265.0,22.0,3.0,25.0,0.0,0.0,0.0,
338,20123,2019,Juan Soto,WSN,20,150,542,659,153,82,...,150.0,3982.0,273.0,0.0,2.0,0.0,0.0,0.0,0.0,2.6
339,20123,2020,Juan Soto,WSN,21,47,154,196,54,27,...,42.0,997.0,64.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0


In [220]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
lgID       object
POS        object
dtype: object

In [228]:
del batting["Age Rng"]

KeyError: 'Age Rng'

In [229]:
del batting["Dol"]

In [222]:
for col in ["Team", "lgID", "POS"]:
    batting[f"{col}_code"] = batting[col].astype("category").cat.codes

In [223]:
batting.columns

Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B',
       ...
       'A', 'E', 'DP', 'PB', 'WP', 'ZR', 'Next_WAR', 'Team_code', 'lgID_code',
       'POS_code'],
      dtype='object', length=150)

In [224]:
batting_full = batting.copy()
batting = batting.dropna()

In [225]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rf = RandomForestRegressor(n_estimators=10, random_state=1, min_samples_split=100)

split = TimeSeriesSplit(n_splits=2)

sfs = SequentialFeatureSelector(rf, 
                                n_features_to_select=25, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [231]:
removed_columns = ["Next_WAR", "Name", "Team", "lgID", "POS"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [232]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

Process LokyProcess-19:
Traceback (most recent call last):
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/local/Cellar/python@3.9/3.9.13_1/Frameworks/Python.framework/Versions/3.9/lib/python3.9/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.9/site-packages/joblib/externals/loky/process_executor.py", line 483, in _process_worker
    gc.collect()
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
predictors = list(selected_columns[sfs.get_support()])

In [None]:
predictors

In [None]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=1, learning_rate=.01, n_estimators=400, n_jobs=8, min_child_weight=10)

In [None]:
predictions = backtest(batting, xgb, predictors)

In [None]:
predictions.shape

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

In [None]:
batting = batting_full.copy().dropna()

In [None]:
def player_history(df):
    df = df.sort_values("Season")
        
    df["player_season"] = range(0, df.shape[0])
    df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"),"WAR"])
    df["war_corr"].fillna(0, inplace=True)
    df["war_prev"] = df["WAR"] / (df["WAR"].shift(1) + .01)
    
    return df.iloc[1:,]

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [None]:
def group_averages(df):
    return df["WAR"] / df["WAR"].mean()

In [None]:
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)

In [None]:
new_predictors = predictors + ["player_season", "war_corr", "war_prev", "war_season"]

In [None]:
predictions = backtest(batting, xgb, new_predictors)

In [None]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 