# Model Training/Predicting with XGBoost

Predicting ERA using XGBoost

In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

In [9]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

In [79]:
# load data
# Reads 'final_dataset.csv' from the notebook's working directory into `df`.
# Later cells use column 'l' as the regression target and drop 'game_key', 'hit_r', 'r'.

df = pd.read_csv('final_dataset.csv')

In [80]:
# remove outliers
# Keep only rows whose target 'l' lies inside the 5th-95th percentile band (bounds inclusive).

target_all = df['l']
lower_bound = target_all.quantile(.05)
upper_bound = target_all.quantile(.95)

# NOTE: despite its name, this mask is True for the rows that are KEPT.
removed_outliers = target_all.between(lower_bound, upper_bound)

print(str(int(removed_outliers.sum())) + " data points remain.")

# Index labels of the rows that fall outside the band; these are dropped from the frame.
index_names = df[~removed_outliers].index
df_o = df.drop(index_names)

# Features: every remaining column except the identifier / outcome columns and the target itself.
X = df_o.drop(columns=['game_key', 'hit_r', 'r', 'l'])
y = df_o['l']

# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
)

5355 data points remain.


## BayesianOptimization

Use Bayesian optimization to search for the best XGBoost hyperparameters.

In [81]:
# Wrap the split data in XGBoost DMatrix objects; only the training matrix carries labels.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test)

In [82]:
def xgb_evaluate(max_depth, subsample, eta, gamma, colsample_bytree):
    """Objective for the Bayesian optimizer: score one hyperparameter candidate.

    Runs 3-fold cross-validation with 100 boosting rounds on the module-level
    ``dtrain`` matrix and reads the mean test RMSE of the last round.

    Parameters
    ----------
    max_depth : float
        Tree depth proposed by the optimizer; truncated to an int before use.
    subsample, eta, gamma, colsample_bytree : float
        Passed through to XGBoost unchanged.

    Returns
    -------
    float
        The negated final mean test RMSE, so that a larger value is better.
    """
    booster_params = dict(
        eval_metric='rmse',
        max_depth=int(max_depth),  # optimizer proposes floats; XGBoost needs an integer depth
        subsample=subsample,
        eta=eta,
        gamma=gamma,
        colsample_bytree=colsample_bytree,
    )

    cv_history = xgb.cv(booster_params, dtrain, num_boost_round=100, nfold=3)
    final_test_rmse = cv_history['test-rmse-mean'].iloc[-1]

    # The optimizer maximizes its target, so flip the sign to minimize RMSE.
    return -1.0 * final_test_rmse

In [83]:
# Search space: (lower, upper) bounds for each argument of xgb_evaluate.
# random_state pins the optimizer's random number generator so that the initial
# random probes, and therefore the whole search, repeat on Restart & Run All.
# 48 matches the seed used for train_test_split.
xgb_bo = BayesianOptimization(
    f=xgb_evaluate,
    pbounds={
        'max_depth': (3, 12),
        'subsample': (0.4, 1.0),
        'eta': (0.01, 0.2),
        'gamma': (0, 1),
        'colsample_bytree': (0.3, 0.9),
    },
    random_state=48,
)

# 3 random starting points followed by 30 guided iterations.
# NOTE(review): `acq='ei'` is accepted by the bayes_opt release this notebook was run with;
# confirm against the installed version, as newer releases may configure the acquisition
# function differently.
xgb_bo.maximize(init_points=3, n_iter=30, acq='ei')

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | subsample |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-2.923   [0m | [0m 0.5436  [0m | [0m 0.08727 [0m | [0m 0.9494  [0m | [0m 3.374   [0m | [0m 0.4417  [0m |
| [95m 2       [0m | [95m-2.896   [0m | [95m 0.5272  [0m | [95m 0.06668 [0m | [95m 0.6869  [0m | [95m 3.124   [0m | [95m 0.4805  [0m |
| [0m 3       [0m | [0m-2.996   [0m | [0m 0.6506  [0m | [0m 0.09674 [0m | [0m 0.1298  [0m | [0m 10.05   [0m | [0m 0.8469  [0m |
| [95m 4       [0m | [95m-2.876   [0m | [95m 0.5017  [0m | [95m 0.035   [0m | [95m 0.2763  [0m | [95m 3.0     [0m | [95m 0.5417  [0m |
| [0m 5       [0m | [0m-3.331   [0m | [0m 0.9     [0m | [0m 0.01    [0m | [0m 0.0     [0m | [0m 3.0     [0m | [0m 1.0     [0m |
| [0m 6       [0m | [0m-3.015   [0m | [0m 0.3484  [0m | [0m 0.1899  [0m | [0m 0.1626  [0m | [

In [19]:
# train model with parameter results

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
)

# Hyperparameters entered by hand; they correspond to iteration 4 of the
# Bayesian optimization log shown earlier (eta is passed as learning_rate).
tuned_params = {
    'max_depth': 3,
    'subsample': 0.5417,
    'learning_rate': 0.035,
    'colsample_bytree': 0.5017,
    'gamma': 0.2763,
}
model = xgb.XGBRegressor(**tuned_params)
model.fit(X_train, y_train)

# Evaluate on the held-out 20% split.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
score = r2_score(y_test, y_pred)

print("RMSE: %f" % rmse)
print("R2: %f" % score)

RMSE: 2.858637
R2: 0.021508


In [87]:
# load prediction data

predict = pd.read_csv('team_final.csv')

# Strip the two team-label columns; every remaining column is handed to the model as a feature.
# NOTE(review): the displayed frame still contains an 'Unnamed: 0' column, presumably a
# saved index from the CSV. Confirm it is meant to be a model input.
games = predict.drop(columns=['TeamA', 'TeamB'])
games

Unnamed: 0,hp_1,hp_2,hp_3,h1_1,h1_2,h1_3,h2_1,h2_2,h2_3,h3_1,...,a6_3,a7_1,a7_2,a7_3,a8_1,a8_2,a8_3,a9_1,a9_2,a9_3
0,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.329407,0.229510,0.296020,0.334824,0.252969,0.324615,0.351092,0.193071,0.264333,0.269024
1,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.382113,0.253984,0.319903,0.368871,0.232129,0.301500,0.345710,0.237645,0.299887,0.331774
2,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.452828,0.271406,0.346266,0.414828,0.262922,0.323219,0.368328,0.240219,0.304672,0.332859
3,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.385531,0.264984,0.337859,0.380094,0.253500,0.318500,0.379219,0.254188,0.321078,0.358328
4,5.616154,1.640308,1.844615,0.246684,0.354105,0.321579,0.240809,0.344404,0.329383,0.213191,...,0.380855,0.260726,0.337855,0.367097,0.201645,0.285065,0.269565,0.229242,0.303774,0.302984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.466746,0.281333,0.350921,0.473810,0.280048,0.339317,0.442333,0.253841,0.324825,0.368254
96,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.386250,0.271222,0.343460,0.375079,0.262733,0.334350,0.349217,0.266746,0.328143,0.345238
97,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.353123,0.211123,0.284908,0.287738,0.209815,0.280600,0.290569,0.221714,0.290603,0.291238
98,4.310000,1.324697,2.588939,0.287106,0.389985,0.406712,0.288924,0.382470,0.496621,0.342273,...,0.391338,0.263554,0.328508,0.387062,0.258769,0.320000,0.391892,0.253738,0.315000,0.354908


In [35]:
# get predictions
# Uses the fitted `model` to produce one predicted value per row of `games`.
result = model.predict(games)

### Get ERA of every match-up

In [36]:
# Display the raw prediction array returned by model.predict(games).
result

array([4.6806507, 4.957661 , 4.802639 , 5.0591054, 4.8616   , 5.161333 ,
       5.152258 , 4.6189075, 4.6906905, 4.8581076, 4.284381 , 4.888907 ,
       4.8316007, 5.005898 , 4.802678 , 5.2383084, 5.064868 , 4.3718815,
       4.5974584, 4.7567883, 4.4682627, 4.863547 , 4.8782372, 5.045968 ,
       4.812854 , 5.3462877, 5.049031 , 4.555764 , 4.70412  , 4.817758 ,
       4.2574916, 4.7836967, 4.7884274, 4.9169707, 4.7757883, 5.1804304,
       4.9759407, 4.2965307, 4.570568 , 4.755156 , 4.187068 , 4.748202 ,
       4.7737837, 4.877705 , 4.72401  , 5.178299 , 4.924163 , 4.274568 ,
       4.5396414, 4.7010145, 4.122489 , 4.695462 , 4.717879 , 4.824344 ,
       4.687554 , 5.1322913, 4.8257375, 4.161528 , 4.5000186, 4.620152 ,
       4.1954603, 4.809965 , 4.7821765, 4.9394684, 4.7857738, 5.2502446,
       4.9983087, 4.282961 , 4.590965 , 4.752339 , 4.6276736, 5.0069485,
       4.87598  , 5.1621375, 4.9337387, 5.222472 , 5.1847467, 4.7151747,
       4.8099465, 4.9138904, 4.517344 , 5.029713 , 

In [39]:
teams = ['HH', 'HT', 'KT', 'LG', 'LT', 'NC', 'OB', 'SK', 'SS', 'WO']

# Cut the flat prediction vector at positions 10, 20, ..., 90. This yields ten
# consecutive pieces, one per team in the order of `teams`; the last piece (WO)
# keeps everything from position 90 to the end.
HH, HT, KT, LG, LT, NC, OB, SK, SS, WO = np.split(result, list(range(10, 100, 10)))

In [44]:
# ERA prediction result
# index = Home Team

# One row per team slice, stacked in the same order as `teams`.
result_comp = [HH, HT, KT, LG, LT, NC, OB, SK, SS, WO]

# Label both axes with the team codes; the row index is named 'Team'.
result_df = pd.DataFrame(
    result_comp,
    index=pd.Index(teams, name='Team'),
    columns=teams,
)
result_df

Unnamed: 0_level_0,HH,HT,KT,LG,LT,NC,OB,SK,SS,WO
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
HH,4.680651,4.957661,4.802639,5.059105,4.8616,5.161333,5.152258,4.618907,4.690691,4.858108
HT,4.284381,4.888907,4.831601,5.005898,4.802678,5.238308,5.064868,4.371881,4.597458,4.756788
KT,4.468263,4.863547,4.878237,5.045968,4.812854,5.346288,5.049031,4.555764,4.70412,4.817758
LG,4.257492,4.783697,4.788427,4.916971,4.775788,5.18043,4.975941,4.296531,4.570568,4.755156
LT,4.187068,4.748202,4.773784,4.877705,4.72401,5.178299,4.924163,4.274568,4.539641,4.701015
NC,4.122489,4.695462,4.717879,4.824344,4.687554,5.132291,4.825737,4.161528,4.500019,4.620152
OB,4.19546,4.809965,4.782176,4.939468,4.785774,5.250245,4.998309,4.282961,4.590965,4.752339
SK,4.627674,5.006948,4.87598,5.162138,4.933739,5.222472,5.184747,4.715175,4.809947,4.91389
SS,4.517344,5.029713,4.920074,5.177837,4.916982,5.316728,5.215197,4.604846,4.767742,4.90838
WO,4.07464,4.668559,4.627849,4.801426,4.624291,5.038113,4.83059,4.084135,4.382512,4.497387
