Training and validation of the final model(s) per round and comparison to the wisdom of the crowd.

In [None]:
cd ..

In [None]:
import sys
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import linregress

from matplotlib import pyplot as plt
import plotly.graph_objs as go

from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score

sys.path.append('auxiliary/')
from data_processing import load_features, shape_data, shape_data_scaler

### Choose settings for the final model validation

In [None]:
# Season held out for validation.
test_season = '2018-2019'
# Granularity of the features to load: match- or team-level.
level = 'match'
# Number of opening rounds to skip in every season, for the train and the
# test set respectively.
# NOTE(review): the training cell visible in this notebook passes
# min_round=1 literally rather than these two values -- confirm intent.
min_round_train, min_round_test = 5, 5
# Whether the features are normalised.
norm = True
# Random state for the classifier.
random_state = 10

### Choose model hyper-parameters and feature sets for the models to validate

In [None]:
# One entry per candidate model: the feature columns it is trained on and the
# two hyper-parameters that are read from it when the classifier is built.
params = [
    dict(
        features=['Position_x', 'Offence_x', 'Offence_y', 'Defence_y',
                  'Diff_y', 'Home F4', 'Away F4'],
        n_estimators=115,
        learning_rate=0.7,
    ),
    dict(
        features=['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
                  'Defence_y', 'Diff_y', 'Away F4'],
        n_estimators=141,
        learning_rate=0.7,
    ),
    dict(
        features=['Position_x', 'Position_y', 'Offence_x', 'Offence_y',
                  'Defence_x', 'Defence_y', 'form_x', 'form_y',
                  'Diff_x', 'Diff_y', 'Home F4', 'Away F4'],
        n_estimators=121,
        learning_rate=1.0,
    ),
]

### Load Features

In [None]:
# Load the feature table for the granularity selected in `level`. The cells
# below filter it through its 'Season' and 'Round' columns.
df = load_features(level)

### Train and Predict progressively

In [None]:
# Every week has its own model: for each round of the hold-out season the
# classifier is re-fitted on all other seasons plus the hold-out rounds that
# precede it, then used to predict the remaining hold-out games.
n_games = 8  # predictions stored per round; ties repeat() to the slices below
rounds = np.arange(2, 31, dtype=int)
print('Rounds for validation:', rounds)
# accuracy[i, j] / waccuracy[i, j]: plain / balanced accuracy of parameter
# set j on the test split built for rounds[i].
accuracy = np.zeros((rounds.shape[0], len(params)))
waccuracy = np.zeros((rounds.shape[0], len(params)))
models_results = pd.DataFrame({'game_round': rounds.repeat(n_games)})
for j, param in enumerate(tqdm(params)):
    features = param['features']
    n_estimators = param['n_estimators']
    learning_rate = param['learning_rate']
    # The seed comes from the notebook settings instead of a repeated literal.
    model = AdaBoostClassifier(n_estimators=n_estimators, random_state=random_state,
                               learning_rate=learning_rate)

    y_pred_all = np.array([])
    y_test_all = np.array([])
    for i, game_round in enumerate(rounds):
        # Train on every other season plus the hold-out rounds before
        # game_round; the complement of that mask is the test split.
        train_inds = (df['Season'] != test_season) | ((df['Season'] == test_season) & (df['Round'] < game_round))
        test_inds = ~ train_inds
        X_train, y_train, df_train, _, scaler = shape_data_scaler(df[train_inds], features,
                                                                  norm=norm, min_round=1)
        model.fit(X_train, y_train)

        # The fifth value returned for the training split is handed back as
        # `norm` (presumably the fitted scaler, so that test features are
        # scaled like the training ones -- TODO confirm in data_processing).
        X_test, y_test, df_test, _, _ = shape_data_scaler(df[test_inds], features,
                                                          norm=scaler, min_round=1)

        y_pred = model.predict(X_test)

        # Both scores are computed over the whole test split, not only over
        # the n_games rows stored below.
        accur = accuracy_score(y_test, y_pred)
        w_accur = balanced_accuracy_score(y_test, y_pred)

        # store the predictions, actuals of the current round
        # (assumes the test split is ordered by round with n_games games per
        # round, so its first n_games rows belong to game_round -- TODO confirm)
        y_pred_all = np.concatenate((y_pred_all, y_pred[:n_games]))
        y_test_all = np.concatenate((y_test_all, y_test[:n_games]))

        accuracy[i, j] = accur
        waccuracy[i, j] = w_accur

    # Store the actual outcomes once; they are expected to be the same for
    # every parameter set. The column name must match the one written here.
    if 'Actual' not in models_results.columns:
        models_results['Actual'] = y_test_all.astype(int)
    models_results['Pred_%d' % j] = y_pred_all.astype(int)

In [None]:
# Combined prediction: 1 when the three model predictions sum to more than
# 1.5 (i.e. at least two of them are 1, for 0/1 predictions), otherwise 0.
_vote_sum = models_results[['Pred_0', 'Pred_1', 'Pred_2']].sum(axis=1)
models_results['Pred_comb'] = np.where(_vote_sum > 1.5, 1, 0)

In [None]:
# Constant baseline that predicts class 0 for every game (presumably the
# majority class, going by the column name -- TODO confirm).
models_results['Pred_Majority'] = np.zeros(len(models_results), dtype=int)

### Print Scores

In [None]:
# Names of all prediction columns currently present in models_results.
model_list = [name for name in models_results.columns if name.startswith('Pred')]

In [None]:
# Plain accuracy of every prediction column against the actual outcomes.
print('Accuracy scores')
y_true = models_results['Actual']
for col in model_list:
    score = accuracy_score(y_true, models_results[col])
    print('%s:' % col, score)

In [None]:
# "Weighted" accuracy is scikit-learn's balanced accuracy.
print('Weighted accuracy scores')
y_true = models_results['Actual']
for col in model_list:
    score = balanced_accuracy_score(y_true, models_results[col])
    print('%s:' % col, score)

In [None]:
# ROC-AUC of every prediction column; the inputs are the stored class
# predictions, not probability scores.
print('ROC-AUC scores')
y_true = models_results['Actual']
for col in model_list:
    score = roc_auc_score(y_true, models_results[col])
    print('%s:' % col, score)

### Plot Accuracy per round

In [None]:
# Per-round performance of the 'Pred_1' column: number of correctly
# predicted games and accuracy for each distinct game round.
uniq_rounds = np.unique(models_results['game_round'].values)
n_rounds = uniq_rounds.shape[0]
round_accuracy = np.zeros(n_rounds)
n_correct = np.zeros(n_rounds)
for idx, rnd in enumerate(uniq_rounds):
    in_round = models_results['game_round'] == rnd
    actual = models_results.loc[in_round, 'Actual'].values
    predicted = models_results.loc[in_round, 'Pred_1'].values
    n_correct[idx] = (actual == predicted).sum()
    round_accuracy[idx] = accuracy_score(actual, predicted)

In [None]:
# Bar chart of the number of correctly predicted games per round.
# The x axis uses uniq_rounds, the array n_correct was computed over, so the
# two always have matching length and order (the accuracy plot does the same).
data = go.Bar(x=uniq_rounds, y=n_correct)
layout = go.Layout(yaxis={'title': 'Number of Correctly Predicted Games'},
                   xaxis={'title': 'Game Round'})
fig = go.Figure(data, layout)
fig.show()

In [None]:
# Scatter of per-round accuracy with a least-squares trend line on top.
trend = linregress(uniq_rounds, round_accuracy)
slope, interc = trend.slope, trend.intercept
y = slope * uniq_rounds + interc
points = go.Scatter(x=uniq_rounds, y=round_accuracy, mode='markers')
fitted_line = go.Scatter(x=uniq_rounds, y=y)
data = [points, fitted_line]
layout = go.Layout(
    yaxis={'title': 'Accuracy'},
    xaxis={'title': 'Game Round'},
    showlegend=False,
)
fig = go.Figure(data, layout)
fig.show()

# The Wisdom of the Crowds
The data for this task is available upon request.

In [None]:
# Path template of the per-round crowd prediction files; '%d' is filled with
# the round number and '~' is expanded to the user's home directory.
_woc_template = '~/Documents/mia_syn_mia_app/output/2018-2019/predictions_day_%d.csv'
predict_files_pattern = os.path.expanduser(_woc_template)

In [None]:
# Crowd pick per game and round: the most frequent value of each game_1..game_8
# column of the round's prediction file. DataFrame.mode() lists the modes of
# every column row by row, so row 0 holds the first mode of each column.
game_cols = ['game_%d' % u for u in range(1, 9)]
# Seeding with an empty float array keeps the result a float array and makes
# the final concatenate valid even when `rounds` is empty.
woc_chunks = [np.array([])]
for i in rounds:
    try:
        woc_df = pd.read_csv(predict_files_pattern % i)
    except FileNotFoundError:
        # Only a missing file is reported as such and filled with NaN; any
        # other problem (unreadable file, missing columns) is raised instead
        # of being mislabelled as "File not found".
        print('File not found: round', i)
        xx = np.full(8, np.nan)
    else:
        xx = woc_df[game_cols].mode().values[0, :]
    woc_chunks.append(xx)
woc_results = np.concatenate(woc_chunks)

In [None]:
# WoC predictions
models_results['Pred_WoC'] = woc_results - 1

In [None]:
# Add the crowd column to the list of columns to score; the membership check
# keeps the list free of duplicates when this cell is run more than once.
if 'Pred_WoC' not in model_list:
    model_list.append('Pred_WoC')

### Comparison of results without the missing round(s)

In [None]:
# exclude the missing round(s) (if any)
ii = pd.notna(models_results['Pred_WoC'])

In [None]:
# Accuracy of every column in model_list, restricted to the rows selected
# by the mask `ii`.
print('Accuracy Scores')
actual_kept = models_results.loc[ii, 'Actual'].values
for col in model_list:
    predicted_kept = models_results.loc[ii, col].values
    print('%s: \t' % col, accuracy_score(actual_kept, predicted_kept))

In [None]:
# Balanced accuracy of every column in model_list, restricted to the rows
# selected by the mask `ii`.
print('Weighted-Accuracy Scores')
actual_kept = models_results.loc[ii, 'Actual'].values
for col in model_list:
    predicted_kept = models_results.loc[ii, col].values
    print('%s: \t' % col, balanced_accuracy_score(actual_kept, predicted_kept))