# Performance Forecasting with various models

In [None]:
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
# plt.style.use('fivethirtyeight')
%matplotlib inline
import matplotlib as mpl
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and 3.8
# removed the old names, so fall back to the renamed style when the legacy
# name is no longer listed. The second style is applied on top of the first,
# exactly as the two consecutive style.use() calls did before.
for _style_name in ('seaborn-paper', 'seaborn'):
    if _style_name not in plt.style.available:
        _style_name = _style_name.replace('seaborn', 'seaborn-v0_8', 1)
    plt.style.use(_style_name)

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from pmdarima.metrics import smape
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
import xgboost
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import LSTM
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import GridSearchCV
import time

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
import os,sys
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

2022-07-18 14:05:13.729533: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-07-18 14:05:13.739176: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-18 14:05:13.739207: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Helper functions

In [2]:
def get_metrics(test_y, pred_y):
    """Compute the evaluation metrics for one set of forecasts.

    Parameters
    ----------
    test_y : array-like
        Observed target values.
    pred_y : array-like
        Predicted values, one per observation in ``test_y``.

    Returns
    -------
    tuple
        ``(mae, mape, rmse, smape, r2)``, each rounded to three decimals.
    """
    mae = mean_absolute_error(test_y, pred_y)
    mape = mean_absolute_percentage_error(test_y, pred_y)
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the square root of the per-output MSE and averaging yields the
    # same RMSE on every scikit-learn version.
    rmse = np.mean(np.sqrt(mean_squared_error(test_y, pred_y, multioutput='raw_values')))
    smaperror = smape(test_y, pred_y)
    r2 = r2_score(test_y, pred_y)
    return round(mae, 3), round(mape, 3), round(rmse, 3), round(smaperror, 3), round(r2, 3)

def print_metrics(mae, mape, rmse, smaperror, r2):
    """Print the five evaluation metrics, one per line, with three decimals."""
    report_lines = (
        ('Mean Absolute Error: %.3f', mae),
        ('Mean Absolute Percentage Error: %.3f', mape),
        ('Root Mean Squared Error: %.3f', rmse),
        ('Symmetric Mean Absolute Percentage Error: %.3f', smaperror),
        ('R^2: %.3f', r2),
    )
    for template, value in report_lines:
        print(template % value)

def _direction_codes(current, previous):
    """Encode each step as 1 (up), 0 (down), 2 (unchanged) or NaN (undetermined)."""
    codes = np.full(current.shape, np.nan)
    codes[current > previous] = 1
    codes[current < previous] = 0
    codes[current == previous] = 2
    return codes


def get_percentage(test_y, pred_y, train_y_last):
    """Count the games whose predicted direction matches the real direction.

    The real direction of a game is its value compared with the previous real
    value; the predicted direction is the prediction compared with the
    previous prediction. For the first game both series are compared with
    ``train_y_last``.

    Parameters
    ----------
    test_y : array-like
        Observed values, one per game (1-D or a single column).
    pred_y : array-like
        Predicted values, same number of games as ``test_y``.
    train_y_last : array-like
        Last observed training value, used as the "previous" value of the
        first test game.

    Returns
    -------
    int
        Number of games with matching direction. Despite the function name,
        this is a count, not a percentage.

    Raises
    ------
    ValueError
        If ``test_y`` and ``pred_y`` do not contain the same number of values.
    """
    actual = np.asarray(test_y, dtype=float).reshape(-1)
    predicted = np.asarray(pred_y, dtype=float).reshape(-1)
    last_seen = np.asarray(train_y_last, dtype=float).reshape(-1)
    if actual.size != predicted.size:
        raise ValueError('test_y and pred_y must have the same number of values')

    n_games = actual.size
    # Shift each series by one game, seeding the first slot with the last training value.
    previous_actual = np.concatenate([last_seen, actual])[:n_games]
    previous_predicted = np.concatenate([last_seen, predicted])[:n_games]

    true_updowns = _direction_codes(actual, previous_actual)
    pred_updowns = _direction_codes(predicted, previous_predicted)
    # NaN never equals NaN, so a game whose direction cannot be determined is
    # simply not counted; the two series always stay aligned game by game.
    return int(np.sum(true_updowns == pred_updowns))

# convert series to supervised learning
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of lags and leads.

    Parameters
    ----------
    data : list, array-like or DataFrame
        Observations ordered in time; one row per step, one column per variable.
    n_in : int
        Number of lagged steps (t-n_in, ..., t-1) to include.
    n_out : int
        Number of forecast steps (t, t+1, ..., t+n_out-1) to include.
    dropnan : bool
        Drop the rows that contain NaN values introduced by shifting.

    Returns
    -------
    pandas.DataFrame
        Columns are named ``var<j>(t-<i>)``, ``var<j>(t)`` and ``var<j>(t+<i>)``.
    """
    df = pd.DataFrame(data)
    # Count variables on the frame itself so that 1-D arrays, Series and nested
    # lists get the right number of column names.
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()
    return agg

def plot_train_test(data, train_size, player_name):
    """Save a line plot of a player's series, split into train and test parts.

    Parameters
    ----------
    data : sliceable sequence (the run cell passes a pandas Series)
        Performance values for all games of the player.
    train_size : int
        Number of leading values drawn as the training part; the rest is
        drawn dashed as the test part.
    player_name : str
        Used in the title and in the output file name
        ``img/<player_name>_train-test.jpg``.
    """
    # savefig does not create missing directories.
    os.makedirs('img', exist_ok=True)
    fig, ax = plt.subplots(figsize=(15, 8), dpi=100)
    try:
        ax.grid(True)
        ax.set_ylim(4, 10.2)
        ax.plot(data[:train_size], alpha=0.6, linewidth=1.90, label="Train Values", color='blue')
        ax.plot(data[train_size:], alpha=0.85, linewidth=1.90, label="Test Values", color='blue', linestyle=(0, (5, 1)))
        ax.set_ylabel("Performance")
        ax.set_xlabel('Games')
        ax.set_title('All '+ str(len(data)) + " games for "+ player_name)
        ax.legend()
        fig.savefig('img/'+player_name+'_train-test.jpg')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

def plot_preds_real(data, predictions, train_size, player_name, model_name):
    """Save a line plot of the forecasts against the actual test values.

    Parameters
    ----------
    data : sliceable sequence (the run cell passes a pandas Series)
        Performance values for all games; only ``data[train_size:]`` is drawn.
    predictions : plottable sequence (the run cell passes a DataFrame)
        Forecast values, indexed so that they line up with the test part.
    train_size : int
        Index at which the test part of ``data`` starts.
    player_name : str
        Used in the title and in the output file name.
    model_name : str
        Used in the title and in the output file name
        ``img/<player_name>_<model_name>_forecasts.jpg``.
    """
    # savefig does not create missing directories.
    os.makedirs('img', exist_ok=True)
    fig, ax = plt.subplots(figsize=(15, 8), dpi=100)
    try:
        ax.grid(True)
        ax.set_ylim(5, 10.2)
        ax.plot(data[train_size:], alpha=0.85, linewidth=1.90, label="Target Values", color='blue', linestyle=(0, (5, 1)))
        ax.plot(predictions, alpha=0.85, linewidth=1.90, label="Predicted Values", color='red')
        ax.set_ylabel("Performance")
        ax.set_xlabel('Games')
        ax.set_title('Next '+str(len(predictions)) + " games forecasts for "+player_name+' with '+model_name)
        ax.legend()
        fig.savefig('img/'+player_name+'_'+model_name+'_forecasts.jpg')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

### Multivariate LSTM Forecast Model

In [3]:
def lstm_train_predict(player_values=None, input_size = 3, output_size = 1, train_size=200, player_name=None):
	"""Train an encoder-decoder LSTM on one player's games and forecast the held-out games.

	Parameters
	----------
	player_values : 2-D array
		One row per game, one column per variable. The target is taken to be
		the 10th column (columns named 'var10(...)' after reframing).
	input_size : int
		Number of lagged games passed to series_to_supervised as n_in.
	output_size : int
		Number of forecast steps passed to series_to_supervised as n_out.
	train_size : int
		Number of leading reframed rows used for training; the remaining
		rows form the test set.
	player_name : str
		Used in the loss-plot title and file name; must be a string because
		it is concatenated into both.

	Returns
	-------
	tuple
		(test_y_inversed, yhat_inversed, values, train_y_last): the test
		targets and the forecasts in the original scale, the reframed value
		matrix, and the last training target row in the original scale.

	Side effects
	------------
	Saves the training/validation loss curve to
	'img/<player_name>_loss_LSTM.jpg'.
	"""
	# ensure all data is float
	player_values = player_values.astype('float32')
	# frame as supervised learning
	reframed = series_to_supervised(player_values, input_size, output_size, dropnan=True)
	# move the target columns (var10 at t and at every t+k) to the end so
	# that the last output_size columns are the targets
	matchers = ['var10(t)', 'var10(t+']
	matching = [s for s in reframed.columns.tolist() if any(xs in s for xs in matchers)]
	new_cols = [col for col in reframed.columns if col not in matching] + matching
	reframed = reframed[new_cols]
	# split into train and test sets
	values = reframed.values
	train = values[:train_size, :]
	test = values[train_size:, :]
	# split into input and outputs
	train_X, train_y = train[:, :-output_size], train[:, -output_size:]
	test_X, test_y = test[:, :-output_size], test[:, -output_size:]
	# scale X (the scaler is fitted on the training rows only)
	sc_x = MinMaxScaler()
	train_X = sc_x.fit_transform(train_X)
	test_X = sc_x.transform(test_X)
	# scale y (the scaler is fitted on the training rows only)
	if output_size==1:
		sc_y = MinMaxScaler()
		train_y = sc_y.fit_transform(train_y.reshape(-1, 1))
		test_y = sc_y.transform(test_y.reshape(-1, 1))
	else:
		sc_y = MinMaxScaler()
		train_y = sc_y.fit_transform(train_y)
		test_y = sc_y.transform(test_y)

	# reshape input to be 3D [samples, timesteps, features]; every sample is
	# a single timestep holding all input columns
	train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
	test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
	# reshape output into [samples, timesteps, features]
	train_y = train_y.reshape((train_y.shape[0], train_y.shape[1], 1))
	test_y = test_y.reshape((test_y.shape[0], test_y.shape[1], 1))

	# define model: LSTM encoder -> repeated context vector -> LSTM decoder
	# -> per-step dense layers producing one value per output step
	n_timesteps, n_features, n_outputs = train_X.shape[1], train_X.shape[2], train_y.shape[1]
	model = Sequential()
	model.add(LSTM(20, activation='relu', input_shape=(n_timesteps, n_features)))
	model.add(RepeatVector(n_outputs))
	model.add(LSTM(20, activation='relu', return_sequences=True))
	model.add(TimeDistributed(Dense(100, activation='relu')))
	model.add(TimeDistributed(Dense(1)))
	model.compile(loss='mse', optimizer='adam')

	# fit network without shuffling; validation_split=0.1 reserves 10% of
	# the training samples for the validation loss
	history = model.fit(train_X, train_y, epochs=20, batch_size=72, validation_split=0.1, verbose=0, shuffle=False)
	# plot the loss history and save it to disk (the figure is not displayed)
	plt.figure(figsize=(15,8), dpi=100, facecolor='white');
	plt.grid(False)
	plt.plot(history.history['loss'], label='train');
	plt.plot(history.history['val_loss'], label='validation', color='orange');
	plt.legend();
	plt.xlabel('Number of epochs')
	plt.ylabel('Loss values')
	plt.title(' Training and Validation Loss with LSTM for '+ player_name)
	plt.savefig('img/'+player_name+'_loss_LSTM.jpg');
	plt.close();

	# make a prediction
	yhat = model.predict(test_X)
	# drop the trailing feature axis: back to 2D [samples, output steps] so
	# that the target scaler can be inverted
	train_y = train_y.reshape((train_y.shape[0], train_y.shape[1]))
	test_y = test_y.reshape((test_y.shape[0], test_y.shape[1]))
	yhat = yhat.reshape((yhat.shape[0], yhat.shape[1]))

	# bring predictions and targets back to the original scale
	yhat_inversed = sc_y.inverse_transform(yhat)
	train_y_inversed = sc_y.inverse_transform(train_y)
	train_y_last = train_y_inversed[-1]
	test_y_inversed = sc_y.inverse_transform(test_y)
	return test_y_inversed, yhat_inversed, values, train_y_last


### ML Models

In [4]:
def _fit_best_estimator(estimator, param_grid, train_X, train_y):
    """Grid-search ``param_grid`` with 5-fold CV and return the best fitted model."""
    search = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
    search.fit(train_X, train_y.ravel())
    # With the default refit=True the search has already retrained the winning
    # configuration on the whole training set, so no second manual fit is needed.
    return search.best_estimator_


def train_predict(model = None, player_values=None, input_size = 3, output_size = 1, train_size=None, player_name=None):
    """Train one of the classical models on a player's games and forecast the held-out games.

    Parameters
    ----------
    model : str
        One of 'Naive', 'Linear Regression', 'Support Vector Machines',
        'Random Forest', 'XGBoost' or 'MLP'.
    player_values : 2-D array
        One row per game, one column per variable. The target is taken to be
        the 10th column (columns named 'var10(...)' after reframing).
    input_size : int
        Number of lagged games passed to series_to_supervised as n_in.
    output_size : int
        Number of forecast steps passed to series_to_supervised as n_out.
    train_size : int
        Number of leading reframed rows used for training; the remaining rows
        form the test set.
    player_name : str
        Only used to enable the non-negative linear regression for 'Pedri'.

    Returns
    -------
    tuple
        ``(test_y, yhat, values, train_y_last)``: the test targets and the
        forecasts in the original scale, the reframed value matrix, and the
        last training target row in the original scale. For 'Naive' the
        forecast is a 1-D array filled with the training mean.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    supported_models = ('Naive', 'Linear Regression', 'Support Vector Machines',
                        'Random Forest', 'XGBoost', 'MLP')
    if model not in supported_models:
        raise ValueError('Unknown model %r; expected one of: %s' % (model, ', '.join(supported_models)))

    # ensure all data is float
    player_values = player_values.astype('float32')
    # frame as supervised learning
    reframed = series_to_supervised(player_values, input_size, output_size, dropnan=True)
    # move the target columns (var10 at t and at every t+k) to the end so that
    # the last output_size columns are the targets
    matchers = ['var10(t)', 'var10(t+']
    matching = [s for s in reframed.columns.tolist() if any(xs in s for xs in matchers)]
    new_cols = [col for col in reframed.columns if col not in matching] + matching
    reframed = reframed[new_cols]
    # split into train and test sets
    values = reframed.values
    train = values[:train_size, :]
    test = values[train_size:, :]
    # split into input and outputs
    train_X, train_y = train[:, :-output_size], train[:, -output_size:]
    test_X, test_y = test[:, :-output_size], test[:, -output_size:]

    if model == 'Naive':
        # baseline: always predict the mean of the training targets (no scaling needed)
        yhat_inversed = np.full(test_y.shape[0], train_y.mean())
        return test_y, yhat_inversed, values, train_y[-1]

    # scale X and y; both scalers are fitted on the training rows only
    sc_x = StandardScaler()
    train_X = sc_x.fit_transform(train_X)
    test_X = sc_x.transform(test_X)
    sc_y = StandardScaler()
    train_y = sc_y.fit_transform(train_y)
    test_y = sc_y.transform(test_y)

    if model == 'Linear Regression':
        # positive=True for Pedri avoids negative predictions
        fitted = LinearRegression(positive=(player_name == 'Pedri')).fit(train_X, train_y)
    elif model == 'Support Vector Machines':
        grid = {
            'C': [0.5, 0.7, 1],
            'epsilon': [7e-2, 1e-1, 1.0],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        }
        fitted = _fit_best_estimator(SVR(), grid, train_X, train_y)
    elif model == 'Random Forest':
        grid = {
            # 1.0 (all features) is what 'auto' meant for regressors; the
            # 'auto' alias was removed in scikit-learn 1.3.
            'max_features': [1.0, 'sqrt', 'log2'],
            'min_samples_leaf': [1, 8, 15],
            'min_samples_split': [2, 8, 14],
            'n_estimators': [60, 80, 100, 120],
        }
        fitted = _fit_best_estimator(RandomForestRegressor(), grid, train_X, train_y)
    elif model == 'XGBoost':
        grid = {
            'subsample': [0.5, 1],
            'colsample_bytree': [0.5, 1],
            'max_depth': [5, 6, 7, 8],
            'min_child_weight': [1, 5],
            'learning_rate': [0.3, 0.09, 0.03],
        }
        fitted = _fit_best_estimator(xgboost.XGBRegressor(), grid, train_X, train_y)
    else:  # 'MLP'
        grid = {
            'hidden_layer_sizes': [(50,50,50), (50,100,50), (50,100), (100,50), (100,)],
            'activation': ['tanh', 'relu'],
            'solver': ['sgd', 'adam'],
            'alpha': [0.0001, 0.05, 0.09],
            'learning_rate': ['constant','adaptive'],
        }
        fitted = _fit_best_estimator(MLPRegressor(early_stopping=True), grid, train_X, train_y)

    # predict, then bring forecasts and targets back to the original scale
    yhat = fitted.predict(test_X)
    yhat_inversed = sc_y.inverse_transform(yhat.reshape(-1, output_size))
    train_y_inversed = sc_y.inverse_transform(train_y)
    test_y_inversed = sc_y.inverse_transform(test_y)
    train_y_last = train_y_inversed[-1]
    return test_y_inversed, yhat_inversed, values, train_y_last

## Run all models for each player and save the results and plots

In [5]:
%%time 
model_evaluation = pd.DataFrame()

LEAGUE_DATAPATH = '/.../data/all_players_league_match_info.csv'
league_data = pd.read_csv(LEAGUE_DATAPATH, parse_dates=['startTimestamp', 'player_birth', 'previous_date'])
league_data = league_data[['player_name','age',\
	'fifa_rating','fifa_potential','after_injury','injury_days','rest_days',\
	'current_team_category','opponent_category','home_fixture',\
	'Performance']]
# league_data
names_list = league_data['player_name'].unique().tolist()
# names_list=['Messi']

for player_name in names_list:
	player_df = league_data[league_data['player_name']==player_name]
	player_df = player_df.drop(labels=['player_name'], axis=1, inplace=False)
	# player_df
	player_values = player_df.values
	input_size = 1
	output_size = 1
	test_size = 10
	train_size = player_values.shape[0]-input_size-test_size

	player_evaluation = pd.DataFrame()

	for i, model_name in enumerate(['Naive', 'Linear Regression', 'Support Vector Machines', 'Random Forest', 'XGBoost', 'MLP', 'LSTM']):
		if model_name=='LSTM':
			test_y, pred_y, reframed_values, train_y_last = lstm_train_predict(player_values, input_size, output_size, train_size, player_name)
		else:
			test_y, pred_y, reframed_values, train_y_last = train_predict(model_name, player_values, input_size, output_size, train_size, player_name)
		mae, mape, rmse, smaperror, r2 = get_metrics(test_y, pred_y)
		# print_metrics(mae, mape, rmse, smaperror, r2)
		if model_name != 'Naive':
			correct_pred_updowns = get_percentage(test_y, pred_y, train_y_last)
		else:
			correct_pred_updowns = 0
		results = [{'Player': player_name, model_name+'_MAE': mae, model_name+'_RMSE': rmse, model_name+'_R-squared': r2, model_name+'_correct_pred_updowns': correct_pred_updowns}]
		player_evaluation = player_evaluation.append(results, ignore_index=True, sort=False)
		player_evaluation = player_evaluation.apply(lambda x: pd.Series(x.dropna().values))
		player_evaluation.dropna(inplace=True)

		values_y = reframed_values[:, -output_size:]
		data = pd.Series(values_y.flatten())
		predictions = pd.DataFrame(pred_y.flatten())
		predictions.index = data[train_size:].index
		predictions = predictions.rename(columns={0: "Performance"})

		#plot
		if i==0:
			# pass
			plot_train_test(data, train_size, player_name)
		plot_preds_real(data, predictions, train_size, player_name, model_name)
	print('Finished for: ', player_name)
		
	player_evaluation.to_csv('results/'+player_name+'.csv')
	model_evaluation = pd.concat([model_evaluation, player_evaluation])
	
model_evaluation.to_csv('results/'+'model_evaluation.csv')

2022-07-18 14:10:52.757584: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-07-18 14:10:52.757708: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-07-18 14:10:52.757768: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (aspire-a315-56): /proc/driver/nvidia/version does not exist
2022-07-18 14:10:52.760019: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Finished for:  Messi
Finished for:  Lewandowski
Finished for:  Jorginho
Finished for:  Benzema
Finished for:  Kante
Finished for:  Cristiano
Finished for:  Salah
Finished for:  Debruyne
Finished for:  Mbappe
Finished for:  Donnarumma
Finished for:  Haaland
Finished for:  Lukaku
Finished for:  Chiellini
Finished for:  Bonucci
Finished for:  Sterling
Finished for:  Neymar
Finished for:  Suarez
Finished for:  Kjaer
Finished for:  Mount
Finished for:  Mahrez
Finished for:  Bruno
Finished for:  Lautaro
Finished for:  Kane
Finished for:  Pedri
Finished for:  Foden
Finished for:  Moreno
Finished for:  Barella
Finished for:  Dias
Finished for:  Modric
Finished for:  Azpilicueta
CPU times: user 6h 23min 23s, sys: 6min 25s, total: 6h 29min 48s
Wall time: 2h 54min 7s


In [6]:
# Print actual vs. predicted value for each test game. test_y and pred_y are
# the values left over from the last player/model processed by the run cell.
for game_idx, actual in enumerate(test_y):
    print(actual, pred_y[game_idx])

[7.2] [7.119502]
[6.5] [7.260408]
[7.2] [7.2401104]
[7.4] [7.1220856]
[6.9] [7.2667465]
[7.2] [7.269112]
[7.2] [7.1855974]
[7.4] [7.1487603]
[6.5] [7.273168]
[6.1] [7.137644]
