In [1]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

  from numpy.core.umath_tests import inner1d


## Dropping columns and splitting data into train and test

In [2]:
model_config = 'All' 
# model_config = 'Diff'

In [3]:
df = pd.read_csv('../data/final_data.csv')

metrics = ['FT{}G', '{}S', '{}ST', '{}Acc', '{}Conv']
team_metrics = ['{}Value', '{}CurrentPoints', '{}Rating']
refs = ['H', 'A']

stats_cols = [m.format(ref) for m, ref in [(m, r) for m in metrics for r in refs]]
team_cols = [m.format(ref) + '_5Avg' for m, ref in [(m, r) for m in metrics for r in refs]] + [m.format(ref) for m, ref in [(m, r) for m in team_metrics for r in refs]]
diff_cols = [m.format('') + '_5Avg_Diff' for m in metrics] + [m.format('') +'_Diff' for m in team_metrics]
other_cols = ['Season', 'Date', 'HomeTeam', 'AwayTeam']

dropped_columns = ['FTR'] + other_cols + stats_cols
if model_config == 'All':  dropped_columns += diff_cols
elif model_config == 'Diff': dropped_columns += team_cols 
    
df_train = df[df.Season <= '2017-2018']
df_train_x = df_train.drop(columns = dropped_columns)
df_train_y = df_train.FTR

df_test = df[df.Season == '2018-2019']
df_test_x = df_test.drop(columns = dropped_columns)
df_test_y = df_test.FTR

## Initialization

In [4]:
def evaluate(model, df_train_x, df_train_y, df_test_x, df_test_y):
    train_predictions = model.predict(df_train_x)
    train_accuracy = (train_predictions == df_train_y).mean()
    print('Training Accuracy = {:0.3f}%'.format(train_accuracy))
    test_predictions = model.predict(df_test_x)
    test_accuracy = (test_predictions == df_test_y).mean()
    print('Testing Accuracy = {:0.3f}%'.format(test_accuracy))
    
def join_pred_on_data(model, df_test):
    df_test_pred = df_test.copy()
    classes = model.classes_
    preds_proba = model.predict_proba(df_test_x)
    preds = model.predict(df_test_x)
    
    for i, cl in enumerate(classes):
        df_test_pred['PRED_{}'.format(cl)] = np.vectorize(lambda x: round(x, 3))(preds_proba[:, i])
    
    df_test_pred['PRED_RESULT'] = preds
    df_test_pred = df_test_pred[[x for x in df_test_pred.columns if x != 'FTR'] + ['FTR']]
        
    return df_test_pred

param_grid = {
    'bootstrap': [True],
    'max_depth': [50, 70, 90],
    'max_features': [2, 3, 4],
    'min_samples_leaf': [5, 7],
    'min_samples_split': [10, 12],
    'n_estimators': [20, 50, 500, 1500]
}


## Training model with Cross validation on training set

In [5]:
re_train = True

In [6]:
if model_config == 'All': model_file = '../saved_models/RF_all.sav'
elif model_config == 'Diff': model_file = '../saved_models/RF_diff.sav'

if re_train:
    rf = RandomForestClassifier()
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 1)
    grid_search.fit(df_train_x, df_train_y)
    best_rf = grid_search.best_estimator_
    
else:
    best_rf = pickle.load(model_file)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Done  80 tasks      | elapsed:   36.7s
[Parallel(n_jobs=-1)]: Done 230 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed:  3.7min finished


## Assessing performance

In [7]:
evaluate(best_rf, df_train_x, df_train_y, df_test_x, df_test_y)

Training Accuracy = 0.768%
Testing Accuracy = 0.594%


In [8]:
pickle.dump(best_rf, open(model_file, 'wb'))

In [9]:
# df_test_pred = join_pred_on_data(best_rf, df_test)
# df_test_pred.iloc[104]