In [16]:
from pybaseball import schedule_and_record, pitching_stats, batting_stats

In [17]:
import os
import pandas as pd

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    batting = batting_stats(2002, 2022)
    batting.to_csv("batting.csv")

In [18]:
bat_2002 = batting[batting["Season"] == 2002]

In [19]:
bat_2002

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA
0,1109,2002,Barry Bonds,SFG,37,143,403,612,149,70,...,,,,,0,0.127,0.191,,,
60,1274,2002,Alex Rodriguez,TEX,26,162,624,725,187,101,...,,,,,0,0.169,0.279,,,
6,409,2002,Jim Thome,CLE,31,147,480,613,146,73,...,,,,,0,0.151,0.274,,,
80,778,2002,Vladimir Guerrero,MON,27,161,614,709,206,128,...,,,,,0,0.066,0.194,,,
23,990,2002,Brian Giles,PIT,31,153,497,644,148,68,...,,,,,0,0.177,0.219,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873,497,2002,Randall Simon,DET,27,130,482,506,145,108,...,,,,,0,0.071,0.164,,,
2955,1002,2002,Aramis Ramirez,PIT,24,142,522,570,122,78,...,,,,,0,0.114,0.251,,,
3060,84,2002,Vinny Castilla,ATL,34,143,543,578,126,89,...,,,,,0,0.086,0.218,,,
3021,454,2002,Juan Uribe,COL,23,155,566,618,136,98,...,,,,,0,0.126,0.264,,,


In [20]:
def next_season_war(player):
    player = player.sort_values("Season", ascending=True)
    player["Next_WAR"] = player["WAR"].shift(-1)
    return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season_war)

In [21]:
null_pct = batting.apply(lambda x: pd.isnull(x).sum()/len(x))

In [22]:
complete_cols = list(batting.columns[null_pct == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [23]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Pull%+,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR
832,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,118,91,80,65,97,129,0,0.137,0.232,5.1
640,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,112,101,80,90,99,109,0,0.164,0.252,-0.2
2395,2,2005,Garret Anderson,LAA,33,142,575,603,163,111,...,126,78,83,100,97,106,0,0.150,0.258,0.1
2307,2,2006,Garret Anderson,LAA,34,141,543,588,152,105,...,113,96,86,88,94,119,0,0.155,0.253,1.4
2033,2,2008,Garret Anderson,LAA,36,145,557,593,163,118,...,121,86,84,94,103,97,0,0.175,0.258,-1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1375,23697,2022,Julio Rodriguez,SEA,21,119,467,509,126,79,...,102,101,95,88,96,114,331,0.146,0.295,
1897,24610,2022,Steven Kwan,CLE,24,121,453,513,132,103,...,74,106,133,110,121,56,408,0.254,0.284,
2503,25764,2022,Bobby Witt Jr.,KCR,22,126,492,528,121,72,...,87,115,100,90,101,104,382,0.154,0.271,
1511,26197,2022,Andrew Vaughn,CHW,24,111,430,469,122,81,...,88,109,107,91,100,105,358,0.206,0.291,


In [24]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [25]:
del batting["Dol"]
del batting["Age Rng"]

In [26]:
batting["Team_Code"] = batting["Team"].astype("category").cat.codes

In [27]:
batting

Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,1B,...,Cent%+,Oppo%+,Soft%+,Med%+,Hard%+,Events,CStr%,CSW%,Next_WAR,Team_Code
832,2,2002,Garret Anderson,ANA,30,158,638,678,195,107,...,91,80,65,97,129,0,0.137,0.232,5.1,1
640,2,2003,Garret Anderson,ANA,31,159,638,673,201,119,...,101,80,90,99,109,0,0.164,0.252,-0.2,1
2395,2,2005,Garret Anderson,LAA,33,142,575,603,163,111,...,78,83,100,97,106,0,0.150,0.258,0.1,15
2307,2,2006,Garret Anderson,LAA,34,141,543,588,152,105,...,96,86,88,94,119,0,0.155,0.253,1.4,15
2033,2,2008,Garret Anderson,LAA,36,145,557,593,163,118,...,86,84,94,103,97,0,0.175,0.258,-1.1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1375,23697,2022,Julio Rodriguez,SEA,21,119,467,509,126,79,...,101,95,88,96,114,331,0.146,0.295,,27
1897,24610,2022,Steven Kwan,CLE,24,121,453,513,132,103,...,106,133,110,121,56,408,0.254,0.284,,9
2503,25764,2022,Bobby Witt Jr.,KCR,22,126,492,528,121,72,...,115,100,90,101,104,382,0.154,0.271,,14
1511,26197,2022,Andrew Vaughn,CHW,24,111,430,469,122,81,...,109,107,91,100,105,358,0.206,0.291,,7


In [33]:
batting_full = batting.copy()
batting = batting.dropna()

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rf = RandomForestRegressor(n_estimators=10, random_state=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rf, 
                                n_features_to_select=10, 
                                direction="forward",
                                cv=split
                               )

In [35]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

sfs.fit(batting[selected_columns], batting["Next_WAR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=RandomForestRegressor(n_estimators=10,
                                                          random_state=1),
                          n_features_to_select=10)

In [37]:
predictors = list(selected_columns[sfs.get_support()])

In [38]:
predictors

['3B', 'SO', 'LD', 'Rep', 'WAR', 'WPA', 'SLv', 'SFv', 'O-Swing%', 'Events']

In [59]:
def backtest(data, model, split, start=5, step=1):
    all_predictions = []
    
    years = data["Season"].unique()
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
        
        model.fit(train[predictors], train["Next_WAR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [60]:
rf = RandomForestRegressor(n_estimators=150, random_state=1, min_samples_split=10)

predictions = backtest(batting, rf, split)

In [61]:
predictions.shape

(1681, 2)

In [65]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(predictions["actual"], predictions["prediction"]) 

1.5411265772791407

In [66]:
batting["WAR"].describe()

count    2310.000000
mean        3.149957
std         2.039631
min        -2.900000
25%         1.800000
50%         3.000000
75%         4.400000
max        12.700000
Name: WAR, dtype: float64