# Performance Forecasting with ARIMA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt.style.use('fivethirtyeight')
%matplotlib inline
plt.style.use('seaborn-paper')
import matplotlib as mpl
mpl.style.use('seaborn')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_columns', None)
# import seaborn as sns
# sns.set(style='ticks', context='talk')

from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

from scipy.optimize import minimize
import statsmodels.tsa.api as smt
import statsmodels.api as sm

def mean_absolute_percentage_error(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

import warnings
warnings.filterwarnings('ignore')
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

from pmdarima.arima import auto_arima
from pmdarima import tsdisplay
from pmdarima.arima.stationarity import ADFTest
from pmdarima.arima import StepwiseContext
from pmdarima.arima.utils import ndiffs, nsdiffs
from pmdarima.metrics import smape
import logging
from logging import debug, info, warning, error
logging.getLogger('matplotlib.font_manager').disabled = True
logging.getLogger().setLevel(logging.ERROR)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.stattools import adfuller

# import plotly.express as px

In [2]:
LEAGUE_DATAPATH = '/home/egkiastas/thesis/player-performance-prediction/data/all_players_league_match_info.csv'

league_data = pd.read_csv(LEAGUE_DATAPATH, index_col=['startTimestamp'], parse_dates=['startTimestamp', 'player_birth', 'previous_date'])
league_data = league_data[['player_name','age',\
    'fifa_rating','fifa_potential','after_injury','injury_days','rest_days',\
    'current_team_category','opponent_category','home_fixture',\
    'Performance']]
league_data

data = league_data[league_data['player_name']=='Debruyne']
data = data.drop(labels=['player_name'], axis=1, inplace=False)
data


Unnamed: 0_level_0,player_name,age,fifa_rating,fifa_potential,after_injury,injury_days,rest_days,current_team_category,opponent_category,home_fixture,Performance
startTimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2015-08-23 16:30:00,Messi,28.18,94,95,0,0,78.0,1,0,0,7.8
2015-08-29 18:30:00,Messi,28.20,94,95,0,0,6.0,1,0,1,9.4
2015-09-12 18:30:00,Messi,28.24,94,95,0,0,14.0,1,1,0,8.6
2015-09-20 18:30:00,Messi,28.26,94,95,0,0,4.0,1,0,1,10.0
2015-09-23 18:00:00,Messi,28.27,94,95,0,0,3.0,1,0,0,9.1
...,...,...,...,...,...,...,...,...,...,...,...
2022-01-15 12:30:00,Azpilicueta,32.41,83,83,0,0,3.0,1,1,0,7.2
2022-01-18 20:00:00,Azpilicueta,32.41,83,83,0,0,3.0,1,0,0,7.2
2022-01-23 16:30:00,Azpilicueta,32.43,83,83,0,0,5.0,1,1,1,7.4
2022-03-10 19:30:00,Azpilicueta,32.55,83,83,1,7,16.0,1,0,0,6.5


Unnamed: 0_level_0,age,fifa_rating,fifa_potential,after_injury,injury_days,rest_days,current_team_category,opponent_category,home_fixture,Performance
startTimestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015-08-16 13:30:00,24.15,86,89,0,0,30.0,0,0,1,6.3
2015-08-22 13:30:00,24.17,86,89,0,0,6.0,0,0,0,6.6
2015-09-12 14:00:00,24.22,86,89,0,0,21.0,1,0,0,7.7
2015-09-19 16:30:00,24.24,86,89,0,0,4.0,1,0,1,8.0
2015-09-26 11:45:00,24.26,86,89,0,0,7.0,1,1,0,5.7
...,...,...,...,...,...,...,...,...,...,...
2022-02-26 17:30:00,30.69,91,91,0,0,7.0,1,1,0,7.2
2022-03-06 16:30:00,30.71,91,91,0,0,8.0,1,1,1,9.9
2022-03-14 20:00:00,30.73,91,91,0,0,8.0,1,0,0,7.5
2022-04-02 14:00:00,30.78,91,91,0,0,13.0,1,0,0,7.6


## Helper functions

In [10]:
def get_metrics(test_y, pred_y):
    mae = mean_absolute_error(test_y, pred_y)
    mape = mean_absolute_percentage_error(test_y, pred_y)
    rmse = mean_squared_error(test_y, pred_y, squared=False)
    smaperror = smape(test_y, pred_y)
    r2 = r2_score(test_y, pred_y)
    return round(mae, 3), round(mape, 3), round(rmse, 3), round(smaperror, 3), round(r2, 3)

def print_metrics(mae, mape, rmse, smaperror, r2):
    print('Mean Absolute Error: %.3f' % mae)
    print('Mean Absolute Percentage Error: %.3f' % mape)
    print('Root Mean Squared Error: %.3f' % rmse)
    print('Symmetric Mean Absolute Percentage Error: %.3f' % smaperror)
    print('R^2: %.3f' % r2)

def get_percentage(test_y, pred_y, train_y_last):
    true_updowns, pred_updowns = [], []
    previous_perf = np.concatenate([train_y_last.reshape(-1,1), test_y.reshape(-1,1)])[:-1]
    previous_perf_pred = np.concatenate([train_y_last.reshape(-1,1), pred_y.reshape(-1,1)])[:-1]
    # compare previous_perf with test_y and pred_y
    for i in range(test_y.shape[0]):
        if previous_perf[i] < test_y[i]:
            true_updowns.append(1)
        elif previous_perf[i]> test_y[i]:
            true_updowns.append(0)
        elif previous_perf[i]== test_y[i]:
            true_updowns.append(2)
        else:
            pass
        if previous_perf_pred[i] < pred_y[i]:
            pred_updowns.append(1)
        elif previous_perf_pred[i]> pred_y[i]:
            pred_updowns.append(0)
        elif previous_perf_pred[i]== pred_y[i]:
            pred_updowns.append(2)
        else:
            pass
    # find percentage
    pred_corrects = 0
    for i in range(len(pred_updowns)):
        if pred_updowns[i] == true_updowns[i]:
            pred_corrects += 1
    return pred_corrects

In [11]:
def plot_preds_real(data, preds, train_size, conf_int, player_name, model_name):
    # plot forecasts against actual outcomes
    mpl.style.use('seaborn')
    plt.figure(figsize=(15,8), dpi=100);
    plt.grid(True)
    plt.ylim(5, 10.2)
    plt.plot(data[train_size:], alpha=0.85, linewidth=1.90, label="Target Values", color='blue', linestyle=(0, (5, 1)));
    plt.plot(preds, alpha=0.85, linewidth=1.90, label="Predicted Values", color='red');
    if conf_int is not None:
        plt.fill_between(data[-preds.shape[0]:].index,
                        conf_int[:, 0],
                        conf_int[:, 1],
                        alpha=0.2, color='g');
    plt.ylabel("Performance");
    plt.xlabel('Games');
    plt.title('Next '+str(len(preds)) + " games forecasts for "+player_name+' with '+model_name);
    plt.legend();
    plt.savefig('img/'+player_name+'_'+model_name+'_forecasts.jpg');
    # plt.show();
    plt.close();

def test_stationarity(data):
    X = data.values
    result = adfuller(X)
    info('ADF Statistic: %f' % result[0])
    info('p-value: %f' % result[1])
    info('Critical Values:')
    for key, value in result[4].items():
        info('\t%s: %.3f' % (key, value))
    if result[1] <= 0.05:
        info("Stationary Data")
        return True
    else:
        info("Non-Stationary Data")
        return False

def compute_differences(data):
    station_flag = test_stationarity(data)
    # Test whether we should difference at the alpha=0.05
    # significance level
    adf_test = ADFTest(alpha=0.05)
    # p_val, should_diff = adf_test.should_diff(data) # throws singular matrix exception
    # info("P-value:{}, should_diff:{}".format(p_val, should_diff))
    # Estimate the number of differences using an ADF test:
    d_adf = ndiffs(data, test='adf', alpha=0.05, max_d=3)
    # Or a KPSS test (auto_arima default):
    d_kpss = ndiffs(data, test='kpss', alpha=0.05, max_d=3)
    # estimate number of seasonal differences using a Canova-Hansen test
    D_1 = nsdiffs(data, m=7,  # commonly requires knowledge of dataset
                max_D=12, test='ch')
    # or use the OCSB test (by default)
    D_2 = nsdiffs(data, m=7, max_D=12, test='ocsb')
    info("d_adf:{}, d_kpss:{}, D_1:{}, D_2:{}".format(d_adf, d_kpss, D_1, D_2))

    n_diffs = max(d_adf, d_kpss)
    n_sdiffs = D_1
    # # Let's look at the series, its ACF plot, and a histogram of its values
    # tsdisplay(data, lag_max=90, title="Daily Flows", show=True)
    # test_seasonality(data)
    return n_diffs, n_sdiffs, station_flag

def check_arima_forecasting(data, train_size, player_name, model_name):
    data.reset_index(inplace=True)
    # check_lag_plot(data, path_figs+'check_lag_plots')
    n_diffs, n_sdiffs, station_flag = compute_differences(data['Performance'])

    #AUTO ARIMA
    train_df, test_df = data[0:train_size], data[train_size:data.shape[0]]
    train = train_df['Performance'].values
    train_exog = train_df.loc[:, train_df.columns != 'Performance'].values
    test = test_df['Performance'].values
    test_exog = test_df.loc[:, test_df.columns != 'Performance'].values

    # auto_reg(train,lags=16)
    
    # ARIMA
    with StepwiseContext(max_dur=300):
        arima_model = auto_arima(y=train, X=train_exog, start_p=1, d=n_diffs, start_q=1, max_p=7, max_d=n_diffs+2, max_q=7, start_P=1, D=1, start_Q=1,
                                max_P=4, max_D=n_sdiffs, max_Q=4, max_order=10, m=7, seasonal=False, stationary=station_flag,
                                n_jobs=-1, error_action="warn", trace=False, suppress_warnings=True, stepwise=True)
    # PREDICT with ARIMA MODEL
    preds, conf_int = arima_model.predict(n_periods=len(test), exogenous=test_exog, return_conf_int=True)
    # arima_model.plot_diagnostics(figsize=(15, 12))
    # plt.show()

    predictions = pd.DataFrame(preds)
    predictions.index = data[train_size:].index
    predictions = predictions.rename(columns={0: "Performance"})
    plot_preds_real(data['Performance'], predictions, train_size, None, player_name, model_name)

    return test, preds, train[-1]

## Run the model for each player and hold results/plots

In [12]:
%%time
model_evaluation = pd.DataFrame()

LEAGUE_DATAPATH = '/home/egkiastas/thesis/player-performance-prediction/data/all_players_league_match_info.csv'
league_data = pd.read_csv(LEAGUE_DATAPATH, parse_dates=['startTimestamp', 'player_birth', 'previous_date'])
league_data = league_data[['player_name','age',\
    'fifa_rating','fifa_potential','after_injury','injury_days','rest_days',\
    'current_team_category','opponent_category','home_fixture',\
    'Performance']]
# league_data
names_list = league_data['player_name'].unique().tolist()
# names_list=['Messi']
model_name='ARIMA'
for player_name in names_list:
    player_df = league_data[league_data['player_name']==player_name]
    player_df = player_df.drop(labels=['player_name'], axis=1, inplace=False)

    test_size = 10
    train_size = player_df.shape[0]-test_size

    player_evaluation = pd.DataFrame()

    test_y, pred_y, train_y_last = check_arima_forecasting(player_df, train_size, player_name, model_name)
    mae, mape, rmse, smaperror, r2 = get_metrics(test_y, pred_y)
    # print_metrics(mae, mape, rmse, smaperror, r2)
    if model_name != 'Naive':
        correct_pred_updowns = get_percentage(test_y, pred_y, train_y_last)
    else:
        correct_pred_updowns = 0
    
    results = [{'Player': player_name, model_name+'_MAE': mae, model_name+'_RMSE': rmse, model_name+'_R-squared': r2, model_name+'_correct_pred_updowns': correct_pred_updowns}]

    player_evaluation = player_evaluation.append(results, ignore_index=True, sort=False)
    player_evaluation = player_evaluation.apply(lambda x: pd.Series(x.dropna().values))
    player_evaluation.dropna(inplace=True)

    player_evaluation.to_csv('results/'+player_name+'_arima.csv')
    model_evaluation = pd.concat([model_evaluation, player_evaluation])

model_evaluation.to_csv('results/'+'model_evaluation_arima.csv')

CPU times: user 2min 16s, sys: 2min 27s, total: 4min 43s
Wall time: 2min 6s


In [13]:
for i in range(len(test_y)):
    print(test_y[i],pred_y[i])

7.2 7.257610449633604
6.5 7.130862532253191
7.2 7.164836626207556
7.4 7.2484374140034555
6.9 7.126443910937324
7.2 7.042451241619229
7.2 7.159873809673087
7.4 7.121770915673617
6.5 6.986162537149159
6.1 7.257400494051944
