<a href="https://colab.research.google.com/github/daivikvennela/googlecolabml/blob/main/BaseballHRPredictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#intall pybaseball 
!pip install pybaseball

In [3]:
#Install Dependencies 
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

In [65]:
START = 2002
END = 2022

In [66]:
#Takes stats from 2002 to 2022 with minimum 200 PA
batting = batting_stats(START, END, qual = 250)

In [39]:
# Read csv
batting.to_csv("batting.csv")

In [40]:
# Drop players with less than one season
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)


In [41]:
batting[["Name", "Season", "HR"]]

Unnamed: 0,Name,Season,HR
0,Barry Bonds,2002,46
1,Barry Bonds,2004,45
14,Mookie Betts,2018,32
2,Barry Bonds,2003,45
73,Mike Trout,2013,27
...,...,...,...
5787,Gerald Laird,2010,5
5883,Chris Davis,2018,16
5668,Adam Dunn,2011,11
5856,Neifi Perez,2002,3


In [42]:
# takes data and creates a new column called next war and puts last eyars data in that slot 
def next_season(player):
  player = player.sort_values("Season")
  player["Next_HR"] = player["HR"].shift(-1)
  return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [43]:
batting[["Name", "Season", "HR", "Next_HR"]]

Unnamed: 0,Name,Season,HR,Next_HR
4857,Alfredo Amezaga,2006,3,2.0
4409,Alfredo Amezaga,2007,2,3.0
4606,Alfredo Amezaga,2008,3,
1068,Garret Anderson,2002,29,29.0
798,Garret Anderson,2003,29,14.0
...,...,...,...,...
1535,Shohei Ohtani,2019,18,46.0
316,Shohei Ohtani,2021,46,
342,Juan Soto,2018,22,34.0
308,Juan Soto,2019,34,29.0


In [44]:
# count the number of null values in a column
null_count = batting.isnull().sum()

In [45]:
#create a list where there are no null values in the columns 
complete_cols = list(batting.columns[null_count == 0])

In [46]:
#create a copy of the dataframe with the complete_cols and the "NextWAR" Columns 
batting = batting[complete_cols + ["Next_HR"]].copy()

In [47]:
#delete unecessry data columns such as dollar values and age 
del batting['Dol']
del batting['Age Rng']


In [48]:
#turn team name into a catergrolical type and convert it into a numeric code
batting["team code"] = batting["Team"].astype("category").cat.codes

In [49]:
#create copy and drop rows where the war is missing 
batting_full = batting.copy()
batting = batting.dropna()

In [50]:
#Import linear regression model, feature selector and time series split which is used as a part of the feature selsector
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Initialize the ridge regression model, one parameter "alapha" which if set higher reduced overfitting and the lower it is closer to a pure linear regresion
rr = Ridge(alpha = 1)

#Split data up into three parts and make predictions 
split = TimeSeriesSplit(n_splits = 3)

#Initatializ sequential feature selector, functions(ridge regresion model, number of features to select, direction, cv, number of threads/processor cores)
# Starts with 0 and evalutes features to find the best one until it has 20
sfs = SequentialFeatureSelector(rr, n_features_to_select= 20, direction = "forward", cv = split, n_jobs=4)




In [51]:
# Remove colums that will not work with sfs
removed_columns = ["Next_HR", "Name", "Team", "IDfg", "Season"]
#New set of columns w/o removed columns 


selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [52]:
#scale data so that the mean is 0 and the stnadard devetions is 1, min max scaling helps put all values between 0 and 1 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [53]:
#fit model 
sfs.fit(batting[selected_columns], batting["Next_HR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=4)

In [54]:
# extract features 
predictors = list(selected_columns[sfs.get_support()])

In [None]:
# backtest function: generates the predictions for the model , starts from 2007 and uses older data to predict the stats 

In [55]:
def backtest(data, model, predictions, start=5, step = 1):
  all_predictions= []
  years = sorted(data["Season"].unique())

  for i in range(start, len(years), step):
    current_year = years[i]

    train = data[data["Season"] < current_year]
    test = data[data["Season"] == current_year]
#predict function returns a numpy array and is later converted into a pandas series so it is easier to work with, use concat to combine
    model.fit(train[predictors], train["Next_HR"])
        
    preds = model.predict(test[predictors])
    preds = pd.Series(preds, index=test.index)
    combined = pd.concat([test["Next_HR"], preds], axis=1)
    combined.columns = ["actual", "prediction"]
        
    all_predictions.append(combined)
  return pd.concat(all_predictions)


In [56]:
predictions = backtest(batting, rr, predictors)

In [57]:
from sklearn.metrics import mean_squared_error
mean_squared_error(predictions["actual"], predictions["prediction"])

54.557480721431645

In [58]:
batting["Next_HR"].describe()

count    4596.000000
mean       15.532202
std        10.166366
min         0.000000
25%         8.000000
50%        13.000000
75%        22.000000
max        59.000000
Name: Next_HR, dtype: float64

In [59]:
diff = predictions["actual"] - predictions["prediction"]

In [60]:
merged = predictions.merge(batting, left_index=True, right_index=True)

In [61]:
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()

In [None]:
merged[["IDfg", "Season", "Name", "HR", "Next_HR", "diff"]].sort_values(["diff"])

In [64]:
merged[["IDfg", "Season", "Name", "HR", "Next_HR", "diff"]].sort_values(["diff"])

Unnamed: 0,IDfg,Season,Name,HR,Next_HR,diff
3020,7571,2016,Lonnie Chisenhall,0.135593,12.0,0.003017
3865,6141,2009,Tony Gwynn,0.033898,3.0,0.003354
1301,1825,2008,David DeJesus,0.203390,13.0,0.004571
4279,11737,2014,Nick Castellanos,0.186441,15.0,0.004749
3659,3123,2014,Gregor Blanco,0.084746,5.0,0.006955
...,...,...,...,...,...,...
2867,19611,2019,Vladimir Guerrero Jr.,0.254237,48.0,27.209550
2979,6876,2015,Mark Trumbo,0.372881,47.0,27.501251
1446,14221,2018,Jorge Soler,0.152542,48.0,28.562762
1954,4949,2016,Giancarlo Stanton,0.457627,59.0,31.795136
