In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import validation_curve

import utility

Every year I have to put the mean of the previous years to increase the accuracy of the regressors

## Load Datasets

In [3]:
# load and summarize the dataset
# load ratings dataset
urlT = 'output_rp.csv'
dataframe_ratings = pd.read_csv(urlT, header=0)
# summarize shape
print(dataframe_ratings.shape)
# summarize first few lines
print(dataframe_ratings.head(30))
# load prices datasets
urlT = 'player_prices.xlsx'
dataframe_prices = pd.read_excel('player_prices.xlsx')
# summarize shape
print(dataframe_prices.shape)
# summarize first few lines
print(dataframe_prices.head(30))

# load dataset squads
print("SQUADS")
urlSquads = 'squads.csv'
dataframe_squads = pd.read_csv(urlSquads, header=0, index_col=None)
dataframe_squads.set_index("Name", inplace=True)
squads_dict = dataframe_squads['Value'].to_dict()
# summarize first few lines
print("Summary Squads")
print(squads_dict)

(539, 26)
      Id Role          Name     Squad  Price  MyRating          Mate  \
0   4431    P   Carnesecchi  Atalanta     10       3.5         Musso   
1   2792    P         Musso  Atalanta      5       3.8   Carnesecchi   
2   2297    P      Rossi F.  Atalanta      1       3.2           NaN   
3    554    D    Zappacosta  Atalanta     15       4.1          Holm   
4   5067    D        Bakker  Atalanta     11       3.8       Ruggeri   
5   5526    D      Scalvini  Atalanta     10       4.0           NaN   
6   5678    D          Holm  Atalanta     10       3.9    Zappacosta   
7   2640    D     Kolasinac  Atalanta      9       3.9           NaN   
8    695    D         Toloi  Atalanta      8       3.4      Djimsiti   
9    787    D      Djimsiti  Atalanta      8       3.4         Toloi   
10  2181    D      Palomino  Atalanta      6       3.3           NaN   
11  2130    D      Hateboer  Atalanta      6       3.6           NaN   
12  4433    D        Zortea  Atalanta      5       3.3

## Create sub datasets for regressions

In [4]:
columns_to_keep = ["Role", "Name", "Squad", "Price", "MyRating", "Regularness", "FVM", "ExpectedMf", "mean", "std", "count"]

roles = ["P", "D", "C", "A"]

In [5]:
# Create the merged dataset
dataframe_merge = pd.merge(dataframe_ratings, dataframe_prices, on='Name', how='inner')
# Drop useless columns
dataframe_merge = dataframe_merge.drop(dataframe_merge.columns.difference(columns_to_keep), axis=1)
# Substitute Squads with their values
dataframe_merge['Squad'] = dataframe_merge['Squad'].map(squads_dict)

print("MERGED DATASET")
print(dataframe_merge)

# Create the subsets for Role
dataframe_A = dataframe_merge[dataframe_merge["Role"] == 'A']
dataframe_C = dataframe_merge[dataframe_merge["Role"] == 'C']
dataframe_D = dataframe_merge[dataframe_merge["Role"] == 'D']
dataframe_P = dataframe_merge[dataframe_merge["Role"] == 'P']

print(dataframe_A)

MERGED DATASET
    Role         Name  Squad  Price  MyRating  Regularness  FVM  ExpectedMf  \
0      P  Carnesecchi      4     10       3.5            3   10        4.71   
1      P        Musso      4      5       3.8            4   28        4.85   
2      P     Rossi F.      4      1       3.2            1    1        4.81   
3      D   Zappacosta      4     15       4.1            4   41        6.19   
4      D       Bakker      4     11       3.8            4   23        6.05   
..   ...          ...    ...    ...       ...          ...  ...         ...   
534    A        Henry      2     11       3.2            2    6        7.07   
535    A       Djuric      2     10       2.7            4   10        5.84   
536    A    Bonazzoli      2      8       3.2            3    8        7.03   
537    A        Braaf      2      1       2.5            1    1        6.33   
538    A       Kallon      2      1       2.7            1    2        6.36   

          mean       std  count  
0 

### Trial Regression with mean and std

**GPR**

After inspecting the behaviour of this model I concluded it does not fit our problem.

In [6]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define kernel
kernel = RationalQuadratic(length_scale=1)

# Fit the GP model
gp_model = GaussianProcessRegressor(kernel=kernel)

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(gp_model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

gp_model.fit(X, y)

# Predict
row = [
        [5, 37, 4.5, 5, 400]
    ]
predicted_mean, predicted_std = gp_model.predict(row, return_std=True)

print(f"Predicted mean: {predicted_mean}")


Mean MSE: 283.122 (287.092)
Predicted mean: [[190.77202634  23.00154827]]


**Ridge**

Has the behaviour we want and from previous takes it performs better than Linear Regression

In [7]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 3.5, 5, 420]]
model.predict(row)

Mean MSE: 139.384 (108.956)


array([[334.06547347,  42.42942749]])

**Lasso**

Has the behaviour we want and from previous takes it performs better than Linear Regression

In [53]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 40, 4.4, 5, 250]]
model.predict(row)

Mean MSE: 140.015 (111.970)


array([[135.08139252,  24.3643135 ]])

**Linear Regression**

In [440]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Create a linear regression model
model = LinearRegression()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 4.2, 4, 250]]
model.predict(row)

Mean MSE: 125.594 (97.359)


array([[167.70427997,  25.32717582]])

**SVM**

In [56]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Create an SVR model
svr = SVR(kernel='linear')  # You can choose different kernels like 'rbf', 'poly', etc.

# Wrap SVR into MultiOutputRegressor
model = MultiOutputRegressor(svr)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 50, 3.5, 5, 200]]
model.predict(row)

Mean MSE: 147.597 (130.852)


array([[92.28380825, 19.98913454]])

## GPR Regressors

**Attackers**

In [575]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define kernel
kernel = RationalQuadratic(length_scale=0.1)

# Fit the GP model
model = GaussianProcessRegressor(kernel=kernel)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 30, 4.6, 5, 200]]
row = scaler.transform(row)
model.predict(row)

temp = model.predict(X)
temp = np.round(temp)
temp = temp.astype(int)
#temp = np.concatenate((['price'] , temp), axis=0)
temp = np.vstack(temp)
tempJoint = np.hstack((data, temp))
print(tempJoint[20:80,:])
name = 'prices_A.csv'
print(name)
can
np.savetxt(name, tempJoint, delimiter=',', fmt='%s')
nameEx = 'prices_A_ex.csv'
np.savetxt(nameEx, tempJoint, delimiter=';', fmt='%s')
# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_A.pkl')

Mean MSE: 313.279 (334.262)
[['A' 'Beltran L.' 4 15 3.0 3 61 14.07142857142857 7.670766541921304 14
  14 8]
 ['A' 'Brekalo' 4 11 2.7 3 17 1.0 1.0 1 1 1]
 ['A' "Kouame'" 4 8 2.0 3 8 1.0 0.0 2 1 0]
 ['A' 'Cheddira' 2 10 2.0 5 28 5.538461538461538 4.858247019508591 13 6 5]
 ['A' 'Caso' 2 6 2.0 3 12 1.0 1.0 0 1 1]
 ['A' "Soule'" 2 4 2.0 4 14 1.571428571428571 0.9759000729485332 7 2 1]
 ['A' 'Cuni' 2 4 1.0 2 5 1.0 1.0 0 1 1]
 ['A' 'Kvernadze' 2 3 1.0 1 6 1.0 1.0 0 1 1]
 ['A' 'Kaio Jorge' 2 1 1.0 1 9 1.0 0.0 6 1 0]
 ['A' 'Bidaoui' 2 1 1.0 1 1 1.0 1.0 0 1 1]
 ['A' 'Retegui' 3 23 4.0 5 128 64.71428571428571 15.27908863285207 14 65
  15]
 ['A' 'Ekuban' 3 3 1.0 2 2 1.0 1.0 0 1 1]
 ['A' 'Puscas' 3 1 1.0 1 2 1.0 1.0 0 1 1]
 ['A' 'Yalcin' 3 1 1.0 1 1 1.0 1.0 0 1 1]
 ['A' 'Martinez L.' 5 40 4.8 5 436 331.6428571428572 38.15677607502447 14
  332 38]
 ['A' 'Thuram' 5 25 4.0 4 180 131.9285714285714 20.5817853965561 14 132
  21]
 ['A' 'Arnautovic' 5 23 3.6 3 122 32.78571428571428 15.37319631160563 14
  

NameError: name 'can' is not defined

## Ridge Regressors

**Attackers**

In [596]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 30, 4.6, 5, 200]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_A.pkl')

Mean MSE: 6.454 (1.305)


['ridge_regressor_model_A.pkl']

**Midfielders**

In [597]:
# Get the data to fit
data = dataframe_C.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=8, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 30, 3.5, 4, 150]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_C.pkl')

Mean MSE: 2.706 (0.587)


['ridge_regressor_model_C.pkl']

**Defenders**

In [598]:
# Get the data to fit
data = dataframe_D.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 15, 4, 5, 50]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_D.pkl')

Mean MSE: 2.063 (0.220)


['ridge_regressor_model_D.pkl']

**Keepers**

In [599]:
# Get the data to fit
data = dataframe_P.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 4, 4, 80]]
model.predict(row)
# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_P.pkl')

Mean MSE: 2.059 (0.423)


['ridge_regressor_model_P.pkl']

## Lasso Regressors

**Attackers**

In [58]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 30, 4.6, 5, 200]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_A.pkl')

Mean MSE: 6.492 (1.464)


['lasso_regressor_model_A.pkl']

**Midfielders**

In [59]:
# Get the data to fit
data = dataframe_C.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=8, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 30, 3.5, 4, 150]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_C.pkl')

Mean MSE: 2.706 (0.579)


['lasso_regressor_model_C.pkl']

**Defenders**

In [60]:
# Get the data to fit
data = dataframe_D.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 15, 4, 5, 50]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_D.pkl')

Mean MSE: 2.194 (0.286)


['lasso_regressor_model_D.pkl']

**Keepers**

In [61]:
# Get the data to fit
data = dataframe_P.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 4, 4, 80]]
model.predict(row)
# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_P.pkl')

Mean MSE: 1.884 (0.376)


['lasso_regressor_model_P.pkl']