### xgboost model to predict mean player outcomes
used in conjunction with nn algs to determine mean wrc for player. Three diffrent models based on available data then combinded to create an xgb predection.

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

os.chdir('C:/Users/dalto/OneDrive/Pictures/Documents/Projects/MLB Season Model/')

In [2]:
# import data
data_02_24 = pd.read_csv('./data/cleaned/data_02_24.csv')
df = data_02_24.copy() # import data
df = df.drop(columns=['Unnamed: 0'])

#### find the z_score of each stat
weighted avg of year, std of year, 10 away from 100 is one std


In [3]:
def mean_year(group):
        avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
        return avg

In [4]:
# weighted std to account for low PA outlier preformances
def calculate_weighted_std(group):
    avg = np.average(group, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    variance = np.average((group - avg)**2, weights=group.index.map(lambda x: df.loc[x, 'PA']))
    return np.sqrt(variance)

In [5]:
def z_scores(player_df, mean_df, std_df):
    # z scores for each stat and player, also creating a new df flow, add back names based on MLBAMID later
    z_scores = pd.DataFrame(index=player_df.index, columns=player_df.select_dtypes(include=[np.number]).columns)

    # scaled for each col
    for col in player_df.select_dtypes(include=[np.number]).columns:
        if col not in ['Season', 'MLBAMID']:  # not needed
            for idx in player_df.index:
                season = player_df.loc[idx, 'Season']
                value = player_df.loc[idx, col]
                mean = mean_df.loc[mean_df['Season'] == season, col].iloc[0]
                std = std_df.loc[std_df['Season'] == season, col].iloc[0]
                # scale so that 10 is 1 std away
                z_scores.loc[idx, col] = 100 + ((value - mean) / std * 10)

    z_scores['Season'] = player_df['Season']
    
    # cols to numeric
    for col in z_scores.columns:
        if col not in ['Season', 'MLBAMID']:
            z_scores[col] = pd.to_numeric(z_scores[col])
    
    # reinsert player names
    z_scores.insert(0, 'Name', player_df['Name']) 

    return z_scores

In [6]:
# df to store mean results
mean_values = pd.DataFrame(index=df['Season'].unique())

for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_mean = df.groupby('Season')[col].apply(mean_year)
        mean_values[col] = yearly_mean

# reset and rename index
mean_values = mean_values.reset_index()
mean_values = mean_values.rename(columns={'index': 'Season'})

# sort and display
mean_values = mean_values.sort_values(by=['Season'])
mean_values.head()

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,Def,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
19,2002,473.684181,29.200233,0.088931,0.162881,0.294877,99.856949,0.221509,3.922156,-0.767819,1.888873,0.181982,0.703447,0.507012,0.860107,0.797336,0.094299,0.256508
22,2003,475.461939,29.191111,0.086575,0.159114,0.296402,99.927082,0.141683,4.016513,-0.608826,1.909976,0.221738,0.691527,0.527634,0.882055,0.80012,0.093138,0.260791
20,2004,477.651063,29.335336,0.087845,0.163399,0.299236,99.906874,0.255112,4.303784,-0.517574,1.945867,0.204546,0.693372,0.623672,0.855852,0.806547,0.088749,0.255794
14,2005,475.451649,29.259353,0.083354,0.159429,0.297611,100.116588,0.277356,3.656253,-0.893002,1.858494,0.214757,0.681891,0.55061,0.886206,0.813783,0.086017,0.255201
15,2006,483.988528,29.191618,0.085995,0.162801,0.303676,100.090682,0.168365,3.574442,-0.760455,1.862631,0.232215,0.671328,0.584224,0.889358,0.816041,0.084597,0.251676


In [7]:
# Create empty DataFrame to store results
std_values = pd.DataFrame(index=df['Season'].unique())

# Loop through numeric columns
for col in df.select_dtypes(include=[np.number]).columns:
    if col != 'Season' and col != 'MLBAMID':  # skip meaningless numeric
        yearly_std = df.groupby('Season')[col].apply(calculate_weighted_std)
        std_values[col] = yearly_std

std_values = std_values.reset_index()
std_values = std_values.rename(columns={'index': 'Season'})

std_values = std_values.sort_values(by=['Season'])
std_values.head()

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,Def,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
19,2002,184.243302,4.009692,0.037986,0.055977,0.035674,31.64566,3.121745,19.28665,9.129972,2.195603,0.047095,0.056035,0.108927,0.051809,0.061293,0.032167,0.028209
22,2003,185.10606,4.078477,0.035647,0.053696,0.034696,30.915634,3.282536,17.866265,9.428022,2.134635,0.048461,0.058644,0.107443,0.047224,0.060704,0.031931,0.027495
20,2004,189.82091,4.146485,0.037057,0.057978,0.036772,30.199233,3.575905,17.922925,9.712096,2.177108,0.051101,0.059314,0.106975,0.055553,0.063824,0.033339,0.030215
14,2005,182.884548,4.155987,0.032447,0.054885,0.035186,28.238546,3.328612,16.513,10.060394,1.95379,0.057376,0.059317,0.117271,0.047543,0.061264,0.03213,0.02791
15,2006,184.138008,4.310582,0.035272,0.055416,0.036208,28.100105,3.600161,16.729148,9.906593,1.979241,0.060376,0.05952,0.112736,0.046223,0.061019,0.031551,0.028022


apply z scores

In [8]:
z_score_df = z_scores(player_df=df, mean_df=mean_values, std_df=std_values)
z_score_df.head()

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,Def,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%
0,brent clevlen,2007,74.82687,85.958971,74.686219,193.060161,107.344122,43.282571,98.65317,96.363387,100.657718,89.901996,138.703183,88.950287,71.666459,22.211544,37.187201,169.893575,204.59646
1,brett hayes,2009,74.297907,90.367791,72.475999,127.859623,108.219882,117.931506,99.866315,98.235778,100.843512,91.58048,170.668602,80.735141,60.421763,105.381033,65.57805,147.469352,199.428878
2,carlos peguero,2014,75.723755,96.257739,107.348483,147.612705,148.436984,92.738037,100.377615,97.711317,100.971051,91.256604,122.457976,73.452584,41.259125,91.336666,33.462464,156.57838,183.278087
3,seby zavala,2019,75.887131,92.057094,73.040518,181.646207,108.098997,44.146682,99.594613,95.874994,101.729855,90.245625,129.458257,111.272068,55.70808,34.476706,40.158323,179.134016,179.9933
4,franklin barreto,2020,77.834198,88.981943,76.647649,142.109822,78.737137,59.956124,101.610132,90.946521,99.887662,86.747334,120.942395,87.572409,55.233155,84.442687,56.517822,144.969583,165.94178


### xgboost model

In [9]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

make wrc+ value the next year

In [10]:
# remove 2024 to train model bc we do not know the following year
train_z = z_score_df[z_score_df['Season'] < 2024].copy()
train_z['next_wrc+'] = np.nan # empty col


for idx in train_z.index:
    # retrive name, season
    name = train_z.loc[idx, 'Name']
    season = train_z.loc[idx, 'Season']
    # full df
    next_year_noscale = df[(df['Name'] == name) & (df['Season'] == season + 1)]
    next_year_scaled = z_score_df[(z_score_df['Name'] == name) & (z_score_df['Season'] == season + 1)]

    # add to df
    if not next_year_noscale.empty:
        train_z.loc[idx, 'next_wrc+'] = next_year_noscale['wRC+'].iloc[0]
        train_z.loc[idx, 'next_pa'] = next_year_scaled['PA'].iloc[0] # need for later anaylsis

# remove rows of nas (player retired or smth)
train_z = train_z.dropna(subset=['next_wrc+'])

# account for outliers


# display
train_z.head()

Unnamed: 0,Name,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,...,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%,next_wrc+,next_pa
0,brent clevlen,2007,74.82687,85.958971,74.686219,193.060161,107.344122,43.282571,98.65317,96.363387,...,89.901996,138.703183,88.950287,71.666459,22.211544,37.187201,169.893575,204.59646,40.620897,76.461476
1,brett hayes,2009,74.297907,90.367791,72.475999,127.859623,108.219882,117.931506,99.866315,98.235778,...,91.58048,170.668602,80.735141,60.421763,105.381033,65.57805,147.469352,199.428878,68.663763,79.520422
2,carlos peguero,2014,75.723755,96.257739,107.348483,147.612705,148.436984,92.738037,100.377615,97.711317,...,91.256604,122.457976,73.452584,41.259125,91.336666,33.462464,156.57838,183.278087,92.949027,79.756145
5,domingo santana,2014,76.161502,80.937328,92.701724,191.474452,27.049843,35.300547,100.388715,95.262033,...,88.346507,104.533169,117.006084,37.346862,46.681858,34.97386,163.064364,174.135337,111.75759,85.107766
8,eliezer alfonzo,2008,75.550163,100.406271,73.75179,132.428726,56.320994,41.119256,98.386518,95.994778,...,89.755634,150.740042,83.057646,74.748966,56.559446,48.8952,160.671646,180.916565,19.347656,80.120523


In [11]:
train_z.corr(numeric_only=True)

Unnamed: 0,Season,PA,Age,BB%,K%,BABIP,wRC+,BsR,Off,Def,WAR,O-Swing%,Z-Swing%,O-Contact%,Z-Contact%,Contact%,SwStr%,CSW%,next_wrc+,next_pa
Season,1.0,0.014546,-0.019569,0.029076,-0.008916,0.022997,0.01887,0.021108,0.002037,0.008562,0.007766,-0.047145,-0.026187,-0.006109,0.003512,0.004835,-0.018717,0.004084,0.000939,0.015591
PA,0.014546,1.0,0.191096,0.199412,-0.343358,0.193752,0.479191,0.069551,0.407104,-0.064043,0.670455,-0.12383,0.005106,0.224908,0.211138,0.242266,-0.250521,-0.316426,0.361969,0.654934
Age,-0.019569,0.191096,1.0,0.134701,-0.181553,-0.057687,0.090597,-0.173281,0.015743,-0.099561,0.044732,-0.135402,-0.039985,0.101,0.118745,0.144529,-0.161465,-0.152785,-0.017348,-0.030097
BB%,0.029076,0.199412,0.134701,1.0,0.022289,0.040331,0.403509,-0.035738,0.386002,-0.152251,0.299057,-0.629797,-0.25673,-0.095223,-0.093689,-0.039231,-0.178478,-0.048652,0.21998,0.166223
K%,-0.008916,-0.343358,-0.181553,0.022289,1.0,-0.001656,-0.260139,0.004901,-0.098615,-0.062773,-0.237804,0.113298,0.110484,-0.715728,-0.75283,-0.840315,0.763811,0.707644,-0.107718,-0.275708
BABIP,0.022997,0.193752,-0.057687,0.040331,-0.001656,1.0,0.643368,0.09145,0.343506,-0.033747,0.316834,-0.011984,0.028817,-0.014869,-0.015361,-0.021388,0.018797,-0.007627,0.109904,0.17428
wRC+,0.01887,0.479191,0.090597,0.403509,-0.260139,0.643368,1.0,-0.00238,0.673244,-0.148332,0.625416,-0.226157,0.008656,0.086486,0.077121,0.10567,-0.164731,-0.245755,0.345374,0.427962
BsR,0.021108,0.069551,-0.173281,-0.035738,0.004901,0.09145,-0.00238,1.0,0.182185,0.171864,0.238929,-0.052483,-0.10987,0.032315,0.037251,0.046845,-0.054323,0.060881,-0.002778,0.078367
Off,0.002037,0.407104,0.015743,0.386002,-0.098615,0.343506,0.673244,0.182185,1.0,-0.196865,0.819123,-0.182267,0.0169,0.004441,-0.010294,0.008665,-0.063911,-0.147038,0.382569,0.407933
Def,0.008562,-0.064043,-0.099561,-0.152251,-0.062773,-0.033747,-0.148332,0.171864,-0.196865,1.0,0.271187,0.041378,-0.054239,0.092516,0.086778,0.103715,-0.073243,-0.003418,-0.105743,-0.023805


'O-Contact%','Z-Swing%', 'O-Swing%' .49
Contact% SwStr%

In [12]:
X = train_z.drop(columns=['Name', 'Season', 'BsR', 'Def', 'next_wrc+', 'O-Contact%','Z-Swing%', 'O-Swing%'])
Y = train_z['next_wrc+']
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.25, random_state=26)

model

In [13]:
model = xgb.XGBRegressor(n_jobs=-1, n_estimators=50000, device = "cuda", eval_metric = ['mae', 'rmse']) 

##### hyper para tuning

random search to narrow space into grid search

In [13]:
from sklearn.model_selection import RandomizedSearchCV

In [14]:
rnd_search_params = {
    'learning_rate': np.linspace(0.05, 0.2, 4),          
    'max_leaves': np.linspace(1, 20, 10, dtype=int),     
    'min_child_weight': np.linspace(1, 10, 10),         
    'subsample': np.linspace(0.3, 0.9, 7),               
    'colsample_bytree': np.linspace(0.4, 1, 7),          
    'early_stopping_rounds': np.linspace(10, 100, 10, dtype=int),  
    'max_depth': np.linspace(0, 20, 11, dtype=int)       
}

rnd_searcher = RandomizedSearchCV(model, rnd_search_params, random_state=26, n_iter=100, cv=10, n_jobs=-1, verbose=4)
rnd_searcher.fit(xtrain, ytrain, eval_set=[(xtest, ytest)])
print(rnd_searcher.best_params_)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


[0]	validation_0-mae:26.29414	validation_0-rmse:34.87658
[1]	validation_0-mae:25.86382	validation_0-rmse:34.34586
[2]	validation_0-mae:25.64799	validation_0-rmse:34.10080
[3]	validation_0-mae:25.46788	validation_0-rmse:33.88486
[4]	validation_0-mae:25.31679	validation_0-rmse:33.69554
[5]	validation_0-mae:25.29427	validation_0-rmse:33.66531
[6]	validation_0-mae:25.12667	validation_0-rmse:33.47196
[7]	validation_0-mae:24.97477	validation_0-rmse:33.30591
[8]	validation_0-mae:24.64465	validation_0-rmse:32.89981
[9]	validation_0-mae:24.51117	validation_0-rmse:32.74812
[10]	validation_0-mae:24.37899	validation_0-rmse:32.59856
[11]	validation_0-mae:24.08703	validation_0-rmse:32.24028
[12]	validation_0-mae:23.97465	validation_0-rmse:32.11488




[13]	validation_0-mae:23.88970	validation_0-rmse:32.00941
[14]	validation_0-mae:23.79563	validation_0-rmse:31.89053
[15]	validation_0-mae:23.72231	validation_0-rmse:31.80460
[16]	validation_0-mae:23.63432	validation_0-rmse:31.69440
[17]	validation_0-mae:23.40401	validation_0-rmse:31.40535
[18]	validation_0-mae:23.19934	validation_0-rmse:31.14244
[19]	validation_0-mae:23.13220	validation_0-rmse:31.06843
[20]	validation_0-mae:22.94678	validation_0-rmse:30.85391
[21]	validation_0-mae:22.88410	validation_0-rmse:30.78492
[22]	validation_0-mae:22.70842	validation_0-rmse:30.57366
[23]	validation_0-mae:22.56388	validation_0-rmse:30.38930
[24]	validation_0-mae:22.42546	validation_0-rmse:30.22386
[25]	validation_0-mae:22.39518	validation_0-rmse:30.18670
[26]	validation_0-mae:22.34750	validation_0-rmse:30.13244
[27]	validation_0-mae:22.30226	validation_0-rmse:30.07965
[28]	validation_0-mae:22.26739	validation_0-rmse:30.03744
[29]	validation_0-mae:22.14371	validation_0-rmse:29.87988
[30]	validatio

grid search

In [17]:
from sklearn.model_selection import GridSearchCV
grid_search_params = {
    'learning_rate': [0.05],   
    'max_leaves': [8],
    'min_child_weight': [5, 6, 7, 8],      
    'subsample': [0.7],            
    'colsample_bytree':[0.5],     
    'early_stopping_rounds':[80, 90, 100],
    'max_depth': [5]
}


grid_searcher = GridSearchCV(model, grid_search_params, cv=5, n_jobs=-1, verbose=4)
grid_searcher.fit(xtrain, ytrain, eval_set=[(xtest, ytest)])
print(grid_searcher.best_params_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[0]	validation_0-mae:26.25788	validation_0-rmse:34.82212
[1]	validation_0-mae:25.82098	validation_0-rmse:34.27357
[2]	validation_0-mae:25.59482	validation_0-rmse:34.00480
[3]	validation_0-mae:25.38678	validation_0-rmse:33.75682
[4]	validation_0-mae:25.22651	validation_0-rmse:33.56129
[5]	validation_0-mae:25.18762	validation_0-rmse:33.51517
[6]	validation_0-mae:25.01294	validation_0-rmse:33.30594
[7]	validation_0-mae:24.84422	validation_0-rmse:33.10842
[8]	validation_0-mae:24.48427	validation_0-rmse:32.67231
[9]	validation_0-mae:24.33771	validation_0-rmse:32.50138
[10]	validation_0-mae:24.21135	validation_0-rmse:32.35541
[11]	validation_0-mae:23.91735	validation_0-rmse:31.99575
[12]	validation_0-mae:23.79954	validation_0-rmse:31.85583
[13]	validation_0-mae:23.69763	validation_0-rmse:31.73450
[14]	validation_0-mae:23.59837	validation_0-rmse:31.61466
[15]	validation_0-mae:23.52622	validation_0-rmse:31.53187
[16]	validation_0-mae



[34]	validation_0-mae:21.53437	validation_0-rmse:29.14985
[35]	validation_0-mae:21.44091	validation_0-rmse:29.04178
[36]	validation_0-mae:21.35259	validation_0-rmse:28.93427
[37]	validation_0-mae:21.33058	validation_0-rmse:28.91667
[38]	validation_0-mae:21.31717	validation_0-rmse:28.90166
[39]	validation_0-mae:21.30615	validation_0-rmse:28.88732
[40]	validation_0-mae:21.23209	validation_0-rmse:28.80050
[41]	validation_0-mae:21.21805	validation_0-rmse:28.78796
[42]	validation_0-mae:21.21167	validation_0-rmse:28.77903
[43]	validation_0-mae:21.20606	validation_0-rmse:28.76985
[44]	validation_0-mae:21.19442	validation_0-rmse:28.75555
[45]	validation_0-mae:21.12742	validation_0-rmse:28.66702
[46]	validation_0-mae:21.05523	validation_0-rmse:28.57677
[47]	validation_0-mae:21.05111	validation_0-rmse:28.57201
[48]	validation_0-mae:21.04200	validation_0-rmse:28.56633
[49]	validation_0-mae:21.03930	validation_0-rmse:28.56356
[50]	validation_0-mae:20.98199	validation_0-rmse:28.49242
[51]	validatio

In [19]:
opti_para = {'colsample_bytree': 0.5, 'early_stopping_rounds': 80, 'learning_rate': 0.05, 'max_depth': 5, 'max_leaves': 8, 'min_child_weight': 5, 'subsample': 0.7}
# update model parameters
model.set_params(**opti_para)

xgb.cv for detailed analysis, optimal boosting rounds

In [20]:
matrix_data = xgb.DMatrix(X, Y)

fmodel = xgb.cv(opti_para, dtrain=matrix_data, num_boost_round=1000, nfold=10, metrics= ['mae', 'rmse'], as_pandas=True, seed=26, early_stopping_rounds=80)
fmodel.head()

Parameters: { "early_stopping_rounds" } are not used.



Unnamed: 0,train-mae-mean,train-mae-std,train-rmse-mean,train-rmse-std,test-mae-mean,test-mae-std,test-rmse-mean,test-rmse-std
0,28.535052,0.113416,38.673486,0.166659,28.544,0.930363,38.657942,1.521591
1,27.985998,0.108742,38.003249,0.15796,28.002097,0.921702,37.99705,1.495284
2,27.486352,0.10721,37.394226,0.15549,27.509561,0.913279,37.39899,1.467533
3,27.021902,0.100562,36.819027,0.144712,27.059481,0.905397,36.840328,1.442037
4,26.598926,0.095975,36.293968,0.136308,26.646101,0.897856,36.32714,1.422087


#### opti model

In [14]:
model.set_params(n_estimators=200, early_stopping_rounds=None)
model.fit(xtrain, ytrain)
ypreds = model.predict(xtest)



##### results df

In [15]:
results_df = xtest.copy()
results_df['Name'] = train_z.loc[xtest.index, 'Name']
results_df['Season'] = train_z.loc[xtest.index, 'Season']
results_df['next_pa'] = train_z.loc[xtest.index, 'next_pa']
results_df['pred_wrc+'] = ypreds
results_df['season_plus1_wrc+'] = ytest
results_df = results_df[['Name', 'Season', 'pred_wrc+', 'season_plus1_wrc+', 'next_pa']]
results_df['miss'] = (results_df['season_plus1_wrc+'] - results_df['pred_wrc+']).abs()
results_df.head()

Unnamed: 0,Name,Season,pred_wrc+,season_plus1_wrc+,next_pa,miss
12473,scooter gennett,2015,91.99324,92.114351,104.160553,0.121111
9093,manny machado,2023,118.933899,121.960891,110.781131,3.026992
9240,barry larkin,2003,95.196571,99.415008,95.171709,4.218437
11548,edwin encarnacion,2011,125.459061,150.162933,110.102242,24.703872
10022,oswaldo cabrera,2023,83.090958,87.606563,93.253068,4.515605


In [16]:
results_df = results_df.sort_values(by='season_plus1_wrc+')
results_df = results_df.sort_values(by='season_plus1_wrc+')
# Filter out extreme values
results_df = results_df[(results_df['season_plus1_wrc+'] < 150) & (results_df['season_plus1_wrc+'] > 50)]

# Calculate correlation
r2Results = results_df['pred_wrc+'].corr(results_df['season_plus1_wrc+'])
print(f"Removing extreme wrc+ r^2 {r2Results}")

Removing extreme wrc+ r^2 0.4374667806479644


#### Metrics

In [17]:
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error

r2 = r2_score(y_true=ytest, y_pred=ypreds)
mse = mean_absolute_error(y_true=ytest, y_pred=ypreds)
rmse = root_mean_squared_error(y_true=ytest, y_pred=ypreds)


print(f"R^2: {r2} \n mse: {mse} \n rmse: {rmse}")

R^2: 0.23900253292502227 
 mse: 23.507823807102355 
 rmse: 33.65242355255114
