In [1]:
import pickle
import warnings

import pandas as pd
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Read the raw dataset from CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='data/final_dataset.csv')

In [3]:
# Build the modelling frame from the raw rows:
#   1. keep only rows whose integer index label is even (the odd-labelled rows
#      are the "overlapping games" that the original loop dropped one by one),
#   2. drop draws, i.e. rows where 'r' is neither greater nor smaller than 'l'
#      (a row with a missing 'r' or 'l' fails both comparisons and is dropped too),
#   3. add the target label 'win' - Win: 1 ('r' > 'l'), Lose: 0,
#   4. index the frame by 'game_key'.
# The chain is vectorised and produces a new frame, so it avoids the row-by-row
# Python loop, the temporary object-dtype column mixing ints with 'draw'
# strings, and assigning a column into a filtered slice.
df = (
    df.loc[df.index % 2 == 0]
      .loc[lambda games: (games['r'] > games['l']) | (games['r'] < games['l'])]
      .assign(win=lambda games: (games['r'] > games['l']).astype('int64'))
      .set_index('game_key')
)
df

Unnamed: 0_level_0,hp_1,hp_2,hp_3,h1_1,h1_2,h1_3,h2_1,h2_2,h2_3,h3_1,...,a8_1,a8_2,a8_3,a9_1,a9_2,a9_3,hit_r,r,l,win
game_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20160413OBHH0,7.200000,2.000000,1.000000,0.250000,0.250000,0.250000,0.325581,0.355556,0.441860,0.257143,...,1.000000,1.000000,4.000000,0.275862,0.400000,0.310345,0.375000,3,7,0
20160414HTSK0,2.571429,1.071429,1.500000,0.291667,0.392857,0.333333,0.218750,0.315789,0.281250,0.250000,...,0.160000,0.222222,0.400000,0.192308,0.300000,0.192308,0.378378,7,6,1
20160414KTWO0,4.500000,1.500000,1.125000,0.234043,0.357143,0.297872,0.200000,0.200000,0.200000,0.346154,...,0.142857,0.250000,0.142857,0.216216,0.230769,0.378378,0.233333,4,5,0
20160414LTLG0,2.250000,1.000000,2.500000,0.270270,0.391304,0.324324,0.500000,0.500000,0.833333,0.272727,...,0.379310,0.424242,0.413793,0.235294,0.315789,0.235294,0.228571,0,9,0
20160414NCSS0,4.500000,1.416667,2.200000,0.388889,0.521739,0.722222,0.230769,0.302326,0.307692,0.315789,...,0.200000,0.230769,0.240000,0.166667,0.250000,0.277778,0.333333,7,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200719HHLG0,4.628571,1.400000,2.789474,0.247619,0.400000,0.380952,0.269767,0.327731,0.441860,0.285714,...,0.209302,0.306122,0.348837,0.260870,0.368852,0.347826,0.218750,6,2,1
20200719KTNC0,5.837838,1.508108,1.600000,0.310185,0.366255,0.439815,0.295918,0.348837,0.352041,0.306383,...,0.409091,0.458333,0.409091,0.272727,0.360000,0.340909,0.333333,8,2,1
20200719LTSS0,3.476395,1.171674,2.941176,0.339535,0.439689,0.432558,0.252632,0.284314,0.294737,0.345912,...,0.151899,0.200000,0.151899,0.266667,0.388889,0.266667,0.225806,1,2,0
20200719OBHT0,5.108108,1.378378,1.818182,0.350000,0.422222,0.425000,0.309322,0.398524,0.593220,0.319048,...,0.163636,0.175439,0.236364,0.261084,0.313636,0.359606,0.250000,4,8,0


In [4]:
# Create train, test dataset

# Features are taken positionally: the first 60 columns of df.
# NOTE(review): presumably the remaining columns ('hit_r', 'r', 'l', 'win') are
# outcome data that must stay out of X - confirm against the CSV schema.
X = df.iloc[:, :60].values
# Select the target by name instead of by position (was iloc[:, 63]) so a change
# in the number of columns cannot silently swap the label for another column.
y = df['win'].values
# 70/30 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=23, stratify=y)


# DMatrix views of both splits for the native xgboost API (xgb.cv)
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [5]:
# Define cross-validation variables used for parameter search

def XGB_CV(max_depth,
           gamma,
           min_child_weight,
           max_delta_step,
           subsample,
           colsample_bytree,
           eta,
         ):
    """Objective for the Bayesian search: cross-validated Gini of one parameter set.

    Runs a stratified 10-fold ``xgb.cv`` on the module-level ``dtrain`` with up
    to 20000 boosting rounds and early stopping after 100 rounds, then reads
    the train/validation AUC from the last row of the returned history.

    The optimiser passes every argument as a float, so ``max_depth`` and
    ``max_delta_step`` are truncated to int and ``subsample`` /
    ``colsample_bytree`` are clamped to [0, 1].

    Module-level names used:
        dtrain   -- read; the training DMatrix.
        log_file -- read; the searched parameters are written to it and flushed.
        AUCbest  -- updated when this run's validation AUC beats it.
        ITERbest -- updated together with AUCbest to the length of the history.

    Returns:
        ``2 * val_auc - 1`` (the validation Gini), the value to maximise.
    """
    global AUCbest
    global ITERbest

    paramt = {
              'booster' : 'gbtree',
              'max_depth' : int(max_depth),
              'gamma' : gamma,
              'eta' : eta,
              'objective' : 'binary:logistic',
              'nthread' : 4,
              'eval_metric': 'auc',
              'subsample' : max(min(subsample, 1), 0),
              'colsample_bytree' : max(min(colsample_bytree, 1), 0),
              'min_child_weight' : min_child_weight,
              'max_delta_step' : int(max_delta_step),
              'seed' : 1001
              }
    folds = 10
    # Only the searched parameters go to the log file; the result line below is
    # printed to stdout.
    print("\n Search parameters (%d-fold validation):\n %s" % (folds, paramt), file=log_file )
    log_file.flush()

    xgbc = xgb.cv(
                    paramt,
                    dtrain,
                    num_boost_round = 20000,
                    stratified = True,
                    nfold = folds,
                    early_stopping_rounds = 100,
                    metrics = 'auc',
                    show_stdv = True
               )

    # Scores of the final row of the CV history
    val_score = xgbc['test-auc-mean'].iloc[-1]
    train_score = xgbc['train-auc-mean'].iloc[-1]
    print(' Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' % (
        len(xgbc),
        train_score,
        val_score,
        (train_score - val_score),
        (train_score*2-1),
        (val_score*2-1)) )
    # Track the best validation AUC seen so far across all calls
    if ( val_score > AUCbest ):
        AUCbest = val_score
        ITERbest = len(xgbc)

    # Gini = 2 * AUC - 1
    return (val_score*2) - 1

In [6]:
# Log file for the parameter search. It is opened in append mode, so repeating
# the run adds new output to the existing file instead of overwriting it.
log_file = open('model/Porto-AUC-5fold-XGB-run-01-v1-full.log', 'a')

# Running best validation AUC and the history length that produced it;
# XGB_CV updates both through `global`.
AUCbest, ITERbest = -1., 0

In [7]:
# Bayesian optimiser over the XGB_CV arguments; each tuple is the
# (lower, upper) bound of the search range for that argument.
# random_state pins the optimiser's own sampling so the search can be
# reproduced from a fresh kernel (it was unseeded before).
XGB_BO = BayesianOptimization(XGB_CV, {
                                    'max_depth': (2, 12),
                                    'gamma': (0.001, 10.0),
                                    'min_child_weight': (0, 20),
                                    'max_delta_step': (0, 10),
                                    'subsample': (0.4, 1.0),
                                    'colsample_bytree' :(0.4, 1.0),
                                    'eta' : (0.01, 0.3)
                                    },
                              random_state=1001)

In [8]:
# Run the search (2 random initial points, then 30 guided iterations) with all
# warnings silenced for the duration of the call.
# NOTE(review): `acq` / `xi` are arguments of the bayes_opt version this notebook
# was run with - confirm they are still accepted before upgrading the package.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    XGB_BO.maximize(
        init_points=2,
        n_iter=30,
        acq='ei',
        xi=0.0,
    )

|   iter    |  target   | colsam... |    eta    |   gamma   | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 1 iterations with train-auc = 0.720302 val-auc = 0.524264 ( diff = 0.196038 ) train-gini = 0.440604 val-gini = 0.048528
| [0m 1       [0m | [0m 0.04853 [0m | [0m 0.4629  [0m | [0m 0.1167  [0m | [0m 5.215   [0m | [0m 0.1682  [0m | [0m 11.24   [0m | [0m 6.034   [0m | [0m 0.8964  [0m |
 Stopped after 22 iterations with train-auc = 0.725406 val-auc = 0.524258 ( diff = 0.201148 ) train-gini = 0.450812 val-gini = 0.048516
| [0m 2       [0m | [0m 0.04852 [0m | [0m 0.7892  [0m | [0m 0.2228  [0m | [0m 9.809   [0m | [0m 6.388   [0m | [0m 4.879   [0m | [0m 19.26   [0m | [0m 0.6188  [0m |
 Stopped after 94 iterations with train-auc = 0.687756 val-auc = 0.521887 ( diff = 0.165869 ) train-gini = 0.375513 val-gini = 0.043775
| [0m 3 

 Stopped after 5 iterations with train-auc = 0.780684 val-auc = 0.534590 ( diff = 0.246095 ) train-gini = 0.561368 val-gini = 0.069179
| [0m 27      [0m | [0m 0.06918 [0m | [0m 0.5651  [0m | [0m 0.1202  [0m | [0m 0.8501  [0m | [0m 2.093   [0m | [0m 8.401   [0m | [0m 19.14   [0m | [0m 0.9481  [0m |
 Stopped after 2 iterations with train-auc = 0.598909 val-auc = 0.535548 ( diff = 0.063361 ) train-gini = 0.197817 val-gini = 0.071095
| [0m 28      [0m | [0m 0.0711  [0m | [0m 0.4348  [0m | [0m 0.2816  [0m | [0m 7.821   [0m | [0m 6.476   [0m | [0m 3.177   [0m | [0m 5.267   [0m | [0m 0.5468  [0m |
 Stopped after 6 iterations with train-auc = 0.676370 val-auc = 0.528674 ( diff = 0.147696 ) train-gini = 0.352740 val-gini = 0.057348
| [0m 29      [0m | [0m 0.05735 [0m | [0m 0.7077  [0m | [0m 0.1858  [0m | [0m 9.846   [0m | [0m 1.567   [0m | [0m 8.514   [0m | [0m 15.51   [0m | [0m 0.6364  [0m |
 Stopped after 3 iterations with train-auc = 0.81

In [11]:
# Second search: same bounds as the first one except for a narrower learning
# rate range, eta in (0.01, 0.1).
# random_state pins the optimiser's own sampling so the search can be
# reproduced from a fresh kernel (it was unseeded before).
XGB_BO2 = BayesianOptimization(XGB_CV, {
                                    'max_depth': (2, 12),
                                    'gamma': (0.001, 10.0),
                                    'min_child_weight': (0, 20),
                                    'max_delta_step': (0, 10),
                                    'subsample': (0.4, 1.0),
                                    'colsample_bytree' :(0.4, 1.0),
                                    'eta' : (0.01, 0.1)
                                    },
                               random_state=1001)
# 2 random initial points, then 30 guided iterations, with warnings silenced
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    XGB_BO2.maximize(init_points=2, n_iter=30, acq='ei', xi=0.0)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 5 iterations with train-auc = 0.831337 val-auc = 0.520853 ( diff = 0.310484 ) train-gini = 0.662674 val-gini = 0.041705
| [0m 1       [0m | [0m 0.04171 [0m | [0m 0.4529  [0m | [0m 0.083   [0m | [0m 2.182   [0m | [0m 5.787   [0m | [0m 9.67    [0m | [0m 7.653   [0m | [0m 0.7494  [0m |
 Stopped after 5 iterations with train-auc = 0.654452 val-auc = 0.535725 ( diff = 0.118727 ) train-gini = 0.308904 val-gini = 0.071449
| [95m 2       [0m | [95m 0.07145 [0m | [95m 0.7682  [0m | [95m 0.05808 [0m | [95m 3.928   [0m | [95m 3.211   [0m | [95m 6.849   [0m | [95m 18.34   [0m | [95m 0.4474  [0m |
 Stopped after 27 iterations with train-auc = 0.807479 val-auc = 0.521219 ( diff = 0.286260 ) train-gini = 0.614958 val-gini = 0.042439
|

 Stopped after 1 iterations with train-auc = 0.714888 val-auc = 0.529102 ( diff = 0.185786 ) train-gini = 0.429776 val-gini = 0.058203
| [0m 27      [0m | [0m 0.0582  [0m | [0m 0.5843  [0m | [0m 0.05022 [0m | [0m 4.911   [0m | [0m 5.709   [0m | [0m 8.606   [0m | [0m 9.401   [0m | [0m 0.9743  [0m |
 Stopped after 5 iterations with train-auc = 0.992779 val-auc = 0.543524 ( diff = 0.449255 ) train-gini = 0.985559 val-gini = 0.087048
| [0m 28      [0m | [0m 0.08705 [0m | [0m 0.8998  [0m | [0m 0.04258 [0m | [0m 2.428   [0m | [0m 3.303   [0m | [0m 10.63   [0m | [0m 0.9692  [0m | [0m 0.959   [0m |
 Stopped after 129 iterations with train-auc = 0.742555 val-auc = 0.524181 ( diff = 0.218374 ) train-gini = 0.485109 val-gini = 0.048361
| [0m 29      [0m | [0m 0.04836 [0m | [0m 0.6362  [0m | [0m 0.03127 [0m | [0m 2.481   [0m | [0m 8.673   [0m | [0m 2.202   [0m | [0m 10.77   [0m | [0m 0.5093  [0m |
 Stopped after 72 iterations with train-auc = 0

In [10]:
# Third search: same bounds as the other two except for the learning rate
# range, eta in (0.01, 0.2).
# random_state pins the optimiser's own sampling so the search can be
# reproduced from a fresh kernel (it was unseeded before).
XGB_BO3 = BayesianOptimization(XGB_CV, {
                                    'max_depth': (2, 12),
                                    'gamma': (0.001, 10.0),
                                    'min_child_weight': (0, 20),
                                    'max_delta_step': (0, 10),
                                    'subsample': (0.4, 1.0),
                                    'colsample_bytree' :(0.4, 1.0),
                                    'eta' : (0.01, 0.2)
                                    },
                               random_state=1001)
# 2 random initial points, then 30 guided iterations, with warnings silenced
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    XGB_BO3.maximize(init_points=2, n_iter=30, acq='ei', xi=0.0)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_de... | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------------------------------
 Stopped after 43 iterations with train-auc = 0.947553 val-auc = 0.532249 ( diff = 0.415303 ) train-gini = 0.895105 val-gini = 0.064499
| [0m 1       [0m | [0m 0.0645  [0m | [0m 0.8739  [0m | [0m 0.06731 [0m | [0m 3.206   [0m | [0m 9.938   [0m | [0m 6.469   [0m | [0m 10.39   [0m | [0m 0.7307  [0m |
 Stopped after 6 iterations with train-auc = 0.667532 val-auc = 0.526722 ( diff = 0.140810 ) train-gini = 0.335064 val-gini = 0.053443
| [0m 2       [0m | [0m 0.05344 [0m | [0m 0.4147  [0m | [0m 0.04601 [0m | [0m 1.99    [0m | [0m 6.783   [0m | [0m 10.8    [0m | [0m 14.5    [0m | [0m 0.4211  [0m |
 Stopped after 16 iterations with train-auc = 0.660197 val-auc = 0.528364 ( diff = 0.131834 ) train-gini = 0.320394 val-gini = 0.056727
| [0m 3 

 Stopped after 29 iterations with train-auc = 0.721433 val-auc = 0.520497 ( diff = 0.200936 ) train-gini = 0.442866 val-gini = 0.040993
| [0m 26      [0m | [0m 0.04099 [0m | [0m 0.7497  [0m | [0m 0.09485 [0m | [0m 3.739   [0m | [0m 6.246   [0m | [0m 2.032   [0m | [0m 6.577   [0m | [0m 0.8529  [0m |
 Stopped after 7 iterations with train-auc = 0.689320 val-auc = 0.534068 ( diff = 0.155252 ) train-gini = 0.378640 val-gini = 0.068136
| [0m 27      [0m | [0m 0.06814 [0m | [0m 0.9327  [0m | [0m 0.09842 [0m | [0m 9.753   [0m | [0m 5.042   [0m | [0m 4.394   [0m | [0m 18.77   [0m | [0m 0.8495  [0m |
 Stopped after 3 iterations with train-auc = 0.785166 val-auc = 0.548488 ( diff = 0.236678 ) train-gini = 0.570331 val-gini = 0.096975
| [95m 28      [0m | [95m 0.09698 [0m | [95m 0.8562  [0m | [95m 0.01318 [0m | [95m 2.898   [0m | [95m 3.879   [0m | [95m 10.69   [0m | [95m 12.48   [0m | [95m 0.9745  [0m |
 Stopped after 39 iterations with train

In [12]:
# Best result of each of the three searches; the 'params' entry of each one is
# what the classifiers are configured from.
params, params2, params3 = (search.max for search in (XGB_BO, XGB_BO2, XGB_BO3))

In [13]:
# Build (not yet fit) the first XGBoost ensemble member from the best
# parameters of the XGB_BO search. Integer-valued parameters are truncated and
# the sampling ratios are clamped to [0, 1].
XGB_clf1 = xgb.XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=500,
    eta=params['params']['eta'],
    gamma=params['params']['gamma'],
    max_depth=int(params['params']['max_depth']),
    max_delta_step=int(params['params']['max_delta_step']),
    min_child_weight=params['params']['min_child_weight'],
    subsample=max(min(params['params']['subsample'], 1), 0),
    colsample_bytree=max(min(params['params']['colsample_bytree'], 1), 0),
)

In [14]:
# Build (not yet fit) the second XGBoost ensemble member from the best
# parameters of the XGB_BO2 search. Integer-valued parameters are truncated and
# the sampling ratios are clamped to [0, 1].
XGB_clf2 = xgb.XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=500,
    eta=params2['params']['eta'],
    gamma=params2['params']['gamma'],
    max_depth=int(params2['params']['max_depth']),
    max_delta_step=int(params2['params']['max_delta_step']),
    min_child_weight=params2['params']['min_child_weight'],
    subsample=max(min(params2['params']['subsample'], 1), 0),
    colsample_bytree=max(min(params2['params']['colsample_bytree'], 1), 0),
)

In [15]:
# Build (not yet fit) the third XGBoost ensemble member from the best
# parameters of the XGB_BO3 search. Integer-valued parameters are truncated and
# the sampling ratios are clamped to [0, 1].
XGB_clf3 = xgb.XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=500,
    eta=params3['params']['eta'],
    gamma=params3['params']['gamma'],
    max_depth=int(params3['params']['max_depth']),
    max_delta_step=int(params3['params']['max_delta_step']),
    min_child_weight=params3['params']['min_child_weight'],
    subsample=max(min(params3['params']['subsample'], 1), 0),
    colsample_bytree=max(min(params3['params']['colsample_bytree'], 1), 0),
)

In [16]:
# Base random forest. min_samples_leaf and min_samples_split given here are
# overridden by the grid below, which also searches max_depth.
# max_features='sqrt' is the setting that 'auto' stood for on classifiers;
# 'auto' itself is rejected by recent scikit-learn releases, so the explicit
# name keeps the cell runnable without changing the model.
clf= RandomForestClassifier(n_estimators=300, criterion='entropy', min_samples_split=2,
  min_samples_leaf=1, max_features='sqrt', bootstrap=False, oob_score=False, n_jobs=-1, random_state=123,
  verbose=0)


# 4 x 5 x 3 = 60 candidate settings
param_grid = {'max_depth' : [8,12,14, 16],
              'min_samples_leaf' : [4, 6, 8, 12, 18],
              'min_samples_split' : [6, 8, 16],
             }

# Exhaustive search scored on accuracy, using GridSearchCV's default CV splitting
grid_search = GridSearchCV(clf, param_grid=param_grid, verbose=1,scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best estimator found by the search (GridSearchCV refits it by default)
RF_BEST = grid_search.best_estimator_
# The string below is a pasted record of a best estimator, not code.
# NOTE(review): it shows n_jobs=4 while this cell uses n_jobs=-1, so it looks
# like it came from an earlier run - confirm it is still representative.
'''
RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=14,
                       min_samples_leaf=8, min_samples_split=8,
                       n_estimators=300, n_jobs=4, random_state=123)
'''

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.6min finished


"\nRandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=14,\n                       min_samples_leaf=8, min_samples_split=8,\n                       n_estimators=300, n_jobs=4, random_state=123)\n"

In [17]:
# Soft-voting ensemble of the three XGBoost models and the tuned random forest
voting_clf = VotingClassifier(
    estimators=list(zip(
        ('xgb1', 'xgb2', 'xgb3', 'rf'),
        (XGB_clf1, XGB_clf2, XGB_clf3, RF_BEST),
    )),
    voting='soft',
    n_jobs=-1,
)

In [20]:
# Make sure the ensemble is trained before it is persisted: on a fresh
# top-to-bottom run nothing above this cell fits voting_clf, so the pickle would
# otherwise hold an unfitted model. `estimators_` only exists after fit().
if not hasattr(voting_clf, 'estimators_'):
    voting_clf.fit(X_train, y_train)

# Save model
with open('./model/voting_model_win.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)

# Load model
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook, never a pickle from an untrusted source.
with open('./model/voting_model_win.pkl', 'rb') as f:
    voting_clf = pickle.load(f)

In [21]:
# Test results: fit the ensemble on the training split and report its mean
# accuracy on the held-out split (fit() returns the estimator itself).
voting_clf.fit(X_train, y_train).score(X_test, y_test)

0.5787671232876712