In [152]:
import os
import pandas as pd
from pybaseball.lahman import download_lahman, fielding as fielding_download, batting as batting_download
import numpy as np

# Data docs at https://www.fangraphs.com/players/shohei-ohtani/19755/stats?position=DH

In [153]:
download_lahman()

fielding = fielding_download()
fielding = fielding[fielding["yearID"] >= 1992].copy()

In [154]:
batting = batting_download()
batting = batting[batting["yearID"] >= 1992].copy()

In [155]:
batting = batting.merge(fielding, on=["playerID", "yearID"])

In [156]:
batting = batting[[c for c in batting.columns if not c.endswith("_y")]]

In [157]:
batting.columns = [c.replace("_x", "") for c in batting.columns]

In [158]:
batting

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,POS,GS,InnOuts,PO,A,E,DP,PB,WP,ZR
0,abbotji01,1992,1,CAL,AL,29,0,0,0,0,...,P,29.0,633.0,11,35,0.0,1,,,
1,abbotky01,1992,1,PHI,NL,31,29,1,2,1,...,P,19.0,400.0,3,15,0.0,0,,,
2,abbotpa01,1992,1,MIN,AL,6,0,0,0,0,...,P,0.0,33.0,2,3,0.0,0,,,
3,abnersh01,1992,1,CHA,AL,97,208,21,58,10,...,OF,48.0,1544.0,155,2,0.0,0,,,
4,ackerji01,1992,1,SEA,AL,17,0,0,0,0,...,P,0.0,92.0,0,5,0.0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62185,zimmejo02,2021,1,MIL,NL,2,1,0,0,0,...,P,0.0,17.0,0,1,0.0,0,,,
62186,zimmeky01,2021,1,KCA,AL,52,0,0,0,0,...,P,2.0,162.0,4,5,0.0,3,,,
62187,zimmery01,2021,1,WAS,NL,110,255,27,62,16,...,1B,45.0,1202.0,338,15,0.0,30,,,
62188,zuberty01,2021,1,KCA,AL,31,1,0,0,0,...,P,0.0,82.0,1,3,0.0,0,,,


In [159]:
def clean_stats(df):
    info_cols = ["teamID", "lgID", "POS"]
    new_df = df[info_cols].iloc[0:1,:]
    
    numeric_cols = list(df.columns[df.dtypes != "object"])
    new_df[numeric_cols] = df[numeric_cols].sum()
    return new_df

batting = batting.groupby(["playerID", "yearID"]).apply(clean_stats)

In [167]:
batting = batting.reset_index(0).droplevel(0)

In [169]:
def next_season_hr(player):
    player = player.sort_values("yearID")
    player["Next_HR"] = player["HR"].shift(-1)
    return player

batting = batting.groupby("playerID", group_keys=False).apply(next_season_hr)

In [170]:
null_count = batting.isnull().sum()

In [171]:
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_HR"]].copy()

In [172]:
batting

Unnamed: 0,playerID,teamID,lgID,POS,yearID,stint,G,AB,R,H,...,GS,InnOuts,PO,A,E,DP,PB,WP,ZR,Next_HR
22584,aardsda01,SFN,NL,P,2004.0,1.0,11.0,0.0,0.0,0.0,...,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26485,aardsda01,CHN,NL,P,2006.0,1.0,45.0,2.0,0.0,0.0,...,0.0,159.0,1.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0
28616,aardsda01,CHA,AL,P,2007.0,1.0,25.0,0.0,0.0,0.0,...,0.0,97.0,2.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0
30708,aardsda01,BOS,AL,P,2008.0,1.0,47.0,1.0,0.0,0.0,...,0.0,146.0,3.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
32725,aardsda01,SEA,AL,P,2009.0,1.0,73.0,0.0,0.0,0.0,...,0.0,214.0,2.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3291,zupcibo01,BOS,AL,OF,1993.0,1.0,141.0,286.0,40.0,69.0,...,72.0,2229.0,179.0,7.0,4.0,2.0,0.0,0.0,0.0,4.0
4699,zupcibo01,BOS,AL,OF,15952.0,12.0,144.0,368.0,40.0,72.0,...,44.0,1322.0,98.0,8.0,0.0,0.0,0.0,0.0,0.0,
47718,zychto01,SEA,AL,P,2015.0,1.0,13.0,0.0,0.0,0.0,...,1.0,55.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
50004,zychto01,SEA,AL,P,2016.0,1.0,12.0,0.0,0.0,0.0,...,0.0,41.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [173]:
batting.dtypes[batting.dtypes == "object"]

playerID    object
teamID      object
lgID        object
POS         object
dtype: object

In [174]:
for col in ["teamID", "lgID", "POS"]:
    batting[f"{col}_code"] = batting[col].astype("category").cat.codes

In [179]:
batting.columns

Index(['playerID', 'teamID', 'lgID', 'POS', 'yearID', 'stint', 'G', 'AB', 'R',
       'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'IBB', 'HBP',
       'SH', 'SF', 'GIDP', 'GS', 'InnOuts', 'PO', 'A', 'E', 'DP', 'PB', 'WP',
       'ZR', 'Next_HR', 'teamID_code', 'lgID_code', 'POS_code'],
      dtype='object')

In [176]:
batting_full = batting.copy()
batting = batting.dropna()

In [177]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rf = RandomForestRegressor(n_estimators=10, random_state=1, min_samples_split=100)

split = TimeSeriesSplit(n_splits=2)

sfs = SequentialFeatureSelector(rf, 
                                n_features_to_select=8, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [191]:
predictors = ["G", "RBI", "AB", "SO", "POS_code", "lgID_code", "teamID_code"]

In [181]:
removed_columns = ["Next_HR", "Name", "teamID", "playerID", "lgID", "POS"] + predictors
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [182]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

batting[selected_columns] = scaler.fit_transform(batting[selected_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batting[selected_columns] = scaler.fit_transform(batting[selected_columns])


In [184]:
sfs.fit(batting[selected_columns], batting["Next_HR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=2, test_size=None),
                          estimator=RandomForestRegressor(min_samples_split=100,
                                                          n_estimators=10,
                                                          random_state=1),
                          n_features_to_select=8, n_jobs=8)

In [192]:
predictors = list(set(selected_columns[sfs.get_support()]) | set(predictors))

In [193]:
predictors

['A',
 '2B',
 'teamID_code',
 'G',
 'RBI',
 'SH',
 'PO',
 'HR',
 'yearID',
 'POS_code',
 'stint',
 'lgID_code',
 'AB',
 'SO',
 'SB']

In [194]:
def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    
    years = sorted(data["yearID"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]
        train = data[data["yearID"] < current_year]
        test = data[data["yearID"] == current_year]
        
        model.fit(train[predictors], train["Next_HR"])
        
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_HR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        
        all_predictions.append(combined)
    return pd.concat(all_predictions)

In [None]:
from xgboost import XGBRegressor

xgb = XGBRegressor(random_state=1, learning_rate=.01, n_estimators=200, n_jobs=8, min_child_weight=50)

predictions = backtest(batting, xgb, predictors)

In [None]:
predictions.shape

In [None]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

In [73]:
batting = batting_full.copy().dropna()
batting = batting.groupby("playerID", group_keys=False).filter(lambda x: x.shape[0] > 1)

In [74]:
def player_history(df):
    df = df.sort_values("yearID")
        
    df["player_season"] = range(0, df.shape[0])
    df["hr_corr"] = list(df[["player_season", "HR"]].expanding().corr().droplevel(0).loc["player_season","HR"])
    df["hr_corr"].fillna(0, inplace=True)
    df["hr_prev"] = df["HR"] / df["HR"].shift(1)
    
    return df.iloc[1:,]

batting = batting.groupby("playerID", group_keys=False).apply(player_history)

In [75]:
def group_averages(df):
    return df["HR"] / df["HR"].mean()

In [76]:
batting["hr_season"] = batting.groupby("yearID").apply(group_averages)
batting["hr_pos"] = batting.groupby("POS").apply(group_averages)

TypeError: incompatible index of inserted column with frame index

In [None]:
new_predictors = predictors + ["player_season", "hr_corr", "hr_prev", "hr_season", "hr_pos"]

In [None]:
predictions = backtest(batting, xgb, new_predictors)

In [None]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 