# Model Training/Predicting with XGBoost

Predict ERA of remaining games for each KBO team

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
# Load Data

df = pd.read_csv('data/final_dataset.csv')

In [3]:
# Remove outliers for better results

y = df['l']
removed_outliers = y.between(y.quantile(.05), y.quantile(.95))

print(str(y[removed_outliers].size) + " data points remain.") 

index_names = df[~removed_outliers].index
df_o = df.drop(index_names)

X = df_o.drop(['game_key', 'hit_r', 'r','l'], axis=1)  # player stats
y = df_o['l']  # runs conceded

# train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

5355 data points remain.


## BayesianOptimization

Used Bayesian Optimazation for finding the best parameters

In [4]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

In [5]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

In [6]:
def xgb_evaluate(max_depth, subsample, eta, gamma, colsample_bytree):
    params = {'eval_metric': 'rmse',
              'max_depth': int(max_depth),
              'subsample': subsample,
              'eta': eta,
              'gamma': gamma,
              'colsample_bytree': colsample_bytree}

    cv_result = xgb.cv(params, dtrain, num_boost_round=100, nfold=3)    
    
    # Bayesian optimization only knows how to maximize, not minimize, so return the negative RMSE
    return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

In [8]:
# Search parameters
xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 12), 
                                             'subsample': (0.4, 1.0),
                                             'eta': (0.01, 0.2),
                                             'gamma': (0, 1),
                                             'colsample_bytree': (0.3, 0.9)})
# Use the expected improvement acquisition function to handle negative numbers
# Optimally needs quite a few more initiation points and number of iterations
xgb_bo.maximize(init_points=3, n_iter=60, acq='ei')

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-3.009   [0m | [0m 0.7593  [0m | [0m 0.0802  [0m | [0m 0.3115  [0m | [0m 10.98   [0m | [0m 0.522   [0m |
| [95m 2       [0m | [95m-2.995   [0m | [95m 0.6442  [0m | [95m 0.09685 [0m | [95m 0.9352  [0m | [95m 6.409   [0m | [95m 0.8093  [0m |
| [95m 3       [0m | [95m-2.947   [0m | [95m 0.4362  [0m | [95m 0.05138 [0m | [95m 0.3639  [0m | [95m 8.111   [0m | [95m 0.5245  [0m |
| [0m 4       [0m | [0m-3.031   [0m | [0m 0.4232  [0m | [0m 0.1053  [0m | [0m 0.297   [0m | [0m 8.079   [0m | [0m 0.5635  [0m |
| [0m 5       [0m | [0m-3.027   [0m | [0m 0.8942  [0m | [0m 0.18    [0m | [0m 0.07787 [0m | [0m 4.588   [0m | [0m 0.7559  [0m |
| [95m 6       [0m | [95m-2.925   [0m | [95m 0.611   [0m | [95m 0.06316 [0m | [95m 0.9318  [0m

| [0m 58      [0m | [0m-2.893   [0m | [0m 0.8429  [0m | [0m 0.06038 [0m | [0m 0.8931  [0m | [0m 3.58    [0m | [0m 0.4788  [0m |
| [0m 59      [0m | [0m-3.04    [0m | [0m 0.599   [0m | [0m 0.01587 [0m | [0m 0.1719  [0m | [0m 7.645   [0m | [0m 0.7365  [0m |
| [0m 60      [0m | [0m-2.962   [0m | [0m 0.5405  [0m | [0m 0.08219 [0m | [0m 0.7812  [0m | [0m 10.65   [0m | [0m 0.8932  [0m |
| [95m 61      [0m | [95m-2.88    [0m | [95m 0.8304  [0m | [95m 0.04359 [0m | [95m 0.8722  [0m | [95m 3.567   [0m | [95m 0.4505  [0m |
| [0m 62      [0m | [0m-3.02    [0m | [0m 0.6183  [0m | [0m 0.01657 [0m | [0m 0.1793  [0m | [0m 7.661   [0m | [0m 0.6527  [0m |
| [0m 63      [0m | [0m-2.902   [0m | [0m 0.7526  [0m | [0m 0.06241 [0m | [0m 0.1488  [0m | [0m 4.432   [0m | [0m 0.7012  [0m |


In [9]:
params = xgb_bo.max
params

{'target': -2.8802603333333336,
 'params': {'colsample_bytree': 0.8304055976664708,
  'eta': 0.043594201548329985,
  'gamma': 0.8722420541985678,
  'max_depth': 3.5666111255973103,
  'subsample': 0.4505322438827028}}

In [27]:
# Train model with search result

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)
XGB_RG = xgb.XGBRegressor(max_depth=int(params['params']['max_depth']), subsample=params['params']['subsample'],
                         eta=params['params']['eta'], colsample_bytree=params['params']['colsample_bytree'],
                         gamma=params['params']['gamma'])
XGB_RG.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8304055976664708,
             eta=0.043594201548329985, gamma=0.8722420541985678, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.0435942002, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.4505322438827028, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [28]:
import pickle

# Save model
with open('./model/model_era.pkl', 'wb') as f:
    pickle.dump(XGB_RG, f)
    
# Load model
with open('./model/model_era.pkl', 'rb') as f:
    voting_clf = pickle.load(f)

In [29]:
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
score = r2_score(y_test, y_pred)
print("RMSE: %f" % (rmse))
print("R2: %f" % (score))

RMSE: 2.858035
R2: 0.021920


In [30]:
# Load predict data

predict = pd.read_csv('data/team_final.csv')
games = predict.drop(['TeamA', 'TeamB'], axis=1)
games

Unnamed: 0,hp_1,hp_2,hp_3,h1_1,h1_2,h1_3,h2_1,h2_2,h2_3,h3_1,...,a6_3,a7_1,a7_2,a7_3,a8_1,a8_2,a8_3,a9_1,a9_2,a9_3
0,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.329407,0.229510,0.296020,0.334824,0.252969,0.324615,0.351092,0.193071,0.264333,0.269024
1,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.382113,0.253984,0.319903,0.368871,0.232129,0.301500,0.345710,0.237645,0.299887,0.331774
2,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.452828,0.271406,0.346266,0.414828,0.262922,0.323219,0.368328,0.240219,0.304672,0.332859
3,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.385531,0.264984,0.337859,0.380094,0.253500,0.318500,0.379219,0.254188,0.321078,0.358328
4,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.380855,0.260726,0.337855,0.367097,0.201645,0.285065,0.269565,0.229242,0.303774,0.302984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.466746,0.281333,0.350921,0.473810,0.280048,0.339317,0.442333,0.253841,0.324825,0.368254
96,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.386250,0.271222,0.343460,0.375079,0.262733,0.334350,0.349217,0.266746,0.328143,0.345238
97,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.353123,0.211123,0.284908,0.287738,0.209815,0.280600,0.290569,0.221714,0.290603,0.291238
98,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.391338,0.263554,0.328508,0.387062,0.258769,0.320000,0.391892,0.253738,0.315000,0.354908


In [31]:
# Predict results
result = model.predict(games)

### Calculate ERA of remaining games based on prediction result

In [32]:
result

array([4.35043  , 4.285108 , 4.4645205, 4.720843 , 4.577303 , 5.044766 ,
       4.8986683, 4.180875 , 4.625833 , 4.624205 , 4.1606774, 4.6022215,
       4.800764 , 4.9972525, 4.7637973, 5.4662957, 5.07759  , 4.255019 ,
       4.8519053, 4.7407565, 4.419239 , 4.774128 , 4.8738365, 5.1733956,
       4.944789 , 5.5781775, 5.148485 , 4.4138594, 4.8655334, 4.8426914,
       4.1644125, 4.714135 , 4.8112483, 4.943251 , 4.8079524, 5.4972353,
       5.026218 , 4.2630196, 4.75844  , 4.7212667, 4.273122 , 4.6838474,
       4.8444843, 5.0479455, 4.8471556, 5.486792 , 5.065421 , 4.323579 ,
       4.8937583, 4.8299766, 4.2363257, 4.597541 , 4.694505 , 4.859173 ,
       4.664375 , 5.335716 , 4.936728 , 4.156541 , 4.6576424, 4.61454  ,
       4.1342344, 4.6534147, 4.692177 , 4.8957267, 4.6888804, 5.450224 ,
       4.9612346, 4.2999268, 4.6821485, 4.6928673, 4.621785 , 4.7448874,
       4.7605333, 5.1093273, 4.8610134, 5.374394 , 5.1011114, 4.5614343,
       4.9828696, 4.914292 , 4.6072927, 4.968619 , 

In [33]:
teams = ['HH', 'HT', 'KT', 'LG', 'LT', 'NC', 'OB', 'SK', 'SS', 'WO']
HH = result[:10]
HT = result[10:20]
KT = result[20:30]
LG = result[30:40]
LT = result[40:50]
NC = result[50:60]
OB = result[60:70]
SK = result[70:80]
SS = result[80:90]
WO = result[90:]

In [34]:
# ERA for each match-up
# index = Home Team

result_comp = [HH, HT, KT, LG, LT, NC, OB, SK, SS, WO]
result_df = pd.DataFrame(result_comp, columns=teams)
result_df['Team'] = teams
result_df.set_index('Team', inplace=True, drop=True)
result_df

Unnamed: 0_level_0,HH,HT,KT,LG,LT,NC,OB,SK,SS,WO
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HH,4.35043,4.285108,4.46452,4.720843,4.577303,5.044766,4.898668,4.180875,4.625833,4.624205
HT,4.160677,4.602221,4.800764,4.997252,4.763797,5.466296,5.07759,4.255019,4.851905,4.740757
KT,4.419239,4.774128,4.873837,5.173396,4.944789,5.578177,5.148485,4.413859,4.865533,4.842691
LG,4.164412,4.714135,4.811248,4.943251,4.807952,5.497235,5.026218,4.26302,4.75844,4.721267
LT,4.273122,4.683847,4.844484,5.047945,4.847156,5.486792,5.065421,4.323579,4.893758,4.829977
NC,4.236326,4.597541,4.694505,4.859173,4.664375,5.335716,4.936728,4.156541,4.657642,4.61454
OB,4.134234,4.653415,4.692177,4.895727,4.68888,5.450224,4.961235,4.299927,4.682148,4.692867
SK,4.621785,4.744887,4.760533,5.109327,4.861013,5.374394,5.101111,4.561434,4.98287,4.914292
SS,4.607293,4.968619,5.03115,5.30201,5.132349,5.671068,5.36601,4.599652,5.083802,5.019737
WO,4.16516,4.597424,4.671093,4.85585,4.748633,5.259591,4.847064,4.128472,4.573229,4.665631
