In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import validation_curve

import utility

Every year I have to put the mean of the previous years to increase the accuracy of the regressors

In [35]:
current_season = "24-25"

## Load Datasets

In [36]:
# load and summarize the dataset
# load ratings dataset
urlT = current_season + '/output_rp.csv'
dataframe_ratings = pd.read_csv(urlT, header=0)
# summarize shape
print(dataframe_ratings.shape)
# summarize first few lines
print(dataframe_ratings.head(30))

# load dataset squads
print("SQUADS")
urlSquads = current_season + '/squads.csv'
dataframe_squads = pd.read_csv(urlSquads, header=0, index_col=None)
dataframe_squads.set_index("Name", inplace=True)
squads_dict = dataframe_squads['Value'].to_dict()
# summarize first few lines
print("Summary Squads")
print(squads_dict)

(519, 26)
      Id Role          Name     Squad  Price  Age  MyRating         Mate  \
0   4431    P   Carnesecchi  Atalanta     13   24       4.3          NaN   
1   2792    P         Musso  Atalanta      4   30       3.9          NaN   
2   2297    P      Rossi F.  Atalanta      1   29       3.4          NaN   
3   4887    D     Bellanova  Atalanta     13   24       4.3   Zappacosta   
4   5354    D       Ruggeri  Atalanta     12   22       4.0          NaN   
5    554    D    Zappacosta  Atalanta      9   32       3.8    Bellanova   
6   2640    D     Kolasinac  Atalanta      9   31       3.7          NaN   
7    787    D      Djimsiti  Atalanta      8   31       3.5      Godfrey   
8   5526    D      Scalvini  Atalanta      8   21       3.7          NaN   
9   6046    D          Hien  Atalanta      8   25       3.5          NaN   
10  5324    D       Godfrey  Atalanta      7   26       4.0     Djimsiti   
11   695    D         Toloi  Atalanta      4   34       3.8          NaN   
12

## Create sub datasets for regressions

In [37]:
columns_to_keep = ["Id", "Role", "Name", "Squad", "Price", "MyRating",  "Mate", "Regularness", "FVM", "ExpectedMf", "mean", "std"]
#columns_to_keep = ["Role", "Name", "Squad", "MyRating", "FVM", "mean", "std"]

roles = ["P", "D", "C", "A"]

In [38]:
# Load models (assuming they are already saved in the current directory)
model_mean_P = joblib.load('regressors/model_mean_P.joblib')
model_std_P = joblib.load('regressors/model_std_P.joblib')
model_mean_D = joblib.load('regressors/model_mean_D.joblib')
model_std_D = joblib.load('regressors/model_std_D.joblib')
model_mean_C = joblib.load('regressors/model_mean_C.joblib')
model_std_C = joblib.load('regressors/model_std_C.joblib')
model_mean_A = joblib.load('regressors/model_mean_A.joblib')
model_std_A = joblib.load('regressors/model_std_A.joblib')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [39]:
# Function to predict mean and std based on role
def predict_for_role(row):
    fvm_value = np.array([[row['FVM']]], dtype=np.float32)
    
    if row['Role'] == 'P':
        value = model_mean_P.predict(fvm_value)[0]
        mean = round(value) if value >= 1 else 1
        std = round(model_std_P.predict(fvm_value)[0])
    elif row['Role'] == 'D':
        value = model_mean_D.predict(fvm_value)[0]
        mean = round(value) if value >= 1 else 1
        std = round(model_std_D.predict(fvm_value)[0])
    elif row['Role'] == 'C':
        value = model_mean_C.predict(fvm_value)[0]
        mean = round(value) if value >= 1 else 1
        std = round(model_std_C.predict(fvm_value)[0])
    elif row['Role'] == 'A':
        value = model_mean_A.predict(fvm_value)[0]
        mean = round(value) if value >= 1 else 1
        std = round(model_std_A.predict(fvm_value)[0])
    else:
        mean = np.nan
        std = np.nan
    
    return pd.Series([mean, std], index=['mean', 'std'])

In [40]:
# Create the dataset with mean and std
dataframe_ratings[['mean', 'std']] = dataframe_ratings.apply(predict_for_role, axis=1)
# Drop useless columns
dataframe_merge = dataframe_ratings.drop(dataframe_ratings.columns.difference(columns_to_keep), axis=1)
# Substitute Squads with their values
#dataframe_merge['Squad'] = dataframe_merge['Squad'].map(squads_dict)

print("MERGED DATASET")
print(dataframe_merge)

# Create the subsets for Role
dataframe_A = dataframe_merge[dataframe_merge["Role"] == 'A']
dataframe_C = dataframe_merge[dataframe_merge["Role"] == 'C']
dataframe_D = dataframe_merge[dataframe_merge["Role"] == 'D']
dataframe_P = dataframe_merge[dataframe_merge["Role"] == 'P']

# Filter the dataframes to the ones having more than 10 FVM
dataframe_A = dataframe_A[dataframe_A['FVM'] >= 20]

print(dataframe_A)

MERGED DATASET
       Id Role              Name     Squad  Price  MyRating        Mate  \
0    4431    P       Carnesecchi  Atalanta     13       4.3         NaN   
1    2792    P             Musso  Atalanta      4       3.9         NaN   
2    2297    P          Rossi F.  Atalanta      1       3.4         NaN   
3    4887    D         Bellanova  Atalanta     13       4.3  Zappacosta   
4    5354    D           Ruggeri  Atalanta     12       4.0         NaN   
..    ...  ...               ...       ...    ...       ...         ...   
514  6801    A         Tengstedt    Verona     12       3.4         NaN   
515  6630    A          Mosquera    Verona     10       3.5         NaN   
516  6644    A  Rocha Livramento    Verona      6       3.2         NaN   
517  6558    A            Tavsan    Verona      4       3.0         NaN   
518  6490    A              Cruz    Verona      1       2.9         NaN   

     Regularness  FVM  ExpectedMf  mean  std  
0              4   55        5.28    

# Save without the stats columns

In [41]:
# Dictionary to rename
rename_dict = {
    "Id": "id",
    "Role": "role",
    "Name": "name",
    "Squad": "squad",
    "Price": "price",
    "MyRating": "myRating",
    "Mate": "mate",
    "Regularness": "regularness",
    "FVM": "fvm",
    "ExpectedMf": "expMf",
    "mean": "expPrice",
    "std": "expStd"
}
# Rename the columns
dataframe_merge = dataframe_merge.rename(columns=rename_dict)
# Save to file
dataframe_merge.to_csv('players23_24_nostats.csv', index=False, sep=',', encoding='utf-8')

### Trial Regression with mean and std

**GPR**

After inspecting the behaviour of this model I concluded it does not fit our problem.

In [12]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define kernel
kernel = RationalQuadratic(length_scale=1)

# Fit the GP model
gp_model = GaussianProcessRegressor(kernel=kernel)

# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(gp_model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

gp_model.fit(X, y)

# Predict
row = [
        [5, 37, 4.5, 5]
    ]
predicted_mean, predicted_std = gp_model.predict(row, return_std=True)

print(f"Predicted mean: {predicted_mean}")


Mean MSE: 423.472 (608.019)


ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  _check_optimize_result("lbfgs", opt_res)


ValueError: X has 4 features, but GaussianProcessRegressor is expecting 3 features as input.

**Ridge**

Has the behaviour we want and from previous takes it performs better than Linear Regression

In [7]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 3.5, 5, 420]]
model.predict(row)

Mean MSE: 139.384 (108.956)


array([[334.06547347,  42.42942749]])

**Lasso**

Has the behaviour we want and from previous takes it performs better than Linear Regression

In [53]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 40, 4.4, 5, 250]]
model.predict(row)

Mean MSE: 140.015 (111.970)


array([[135.08139252,  24.3643135 ]])

**Linear Regression**

In [440]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Create a linear regression model
model = LinearRegression()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 4.2, 4, 250]]
model.predict(row)

Mean MSE: 125.594 (97.359)


array([[167.70427997,  25.32717582]])

**SVM**

In [56]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Create an SVR model
svr = SVR(kernel='linear')  # You can choose different kernels like 'rbf', 'poly', etc.

# Wrap SVR into MultiOutputRegressor
model = MultiOutputRegressor(svr)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 50, 3.5, 5, 200]]
model.predict(row)

Mean MSE: 147.597 (130.852)


array([[92.28380825, 19.98913454]])

## GPR Regressors

**Attackers**

In [575]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-3], data[:, -3:-1]

# Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Define kernel
kernel = RationalQuadratic(length_scale=0.1)

# Fit the GP model
model = GaussianProcessRegressor(kernel=kernel)
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=8, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 30, 4.6, 5, 200]]
row = scaler.transform(row)
model.predict(row)

temp = model.predict(X)
temp = np.round(temp)
temp = temp.astype(int)
#temp = np.concatenate((['price'] , temp), axis=0)
temp = np.vstack(temp)
tempJoint = np.hstack((data, temp))
print(tempJoint[20:80,:])
name = 'prices_A.csv'
print(name)
can
np.savetxt(name, tempJoint, delimiter=',', fmt='%s')
nameEx = 'prices_A_ex.csv'
np.savetxt(nameEx, tempJoint, delimiter=';', fmt='%s')
# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_A.pkl')

Mean MSE: 313.279 (334.262)
[['A' 'Beltran L.' 4 15 3.0 3 61 14.07142857142857 7.670766541921304 14
  14 8]
 ['A' 'Brekalo' 4 11 2.7 3 17 1.0 1.0 1 1 1]
 ['A' "Kouame'" 4 8 2.0 3 8 1.0 0.0 2 1 0]
 ['A' 'Cheddira' 2 10 2.0 5 28 5.538461538461538 4.858247019508591 13 6 5]
 ['A' 'Caso' 2 6 2.0 3 12 1.0 1.0 0 1 1]
 ['A' "Soule'" 2 4 2.0 4 14 1.571428571428571 0.9759000729485332 7 2 1]
 ['A' 'Cuni' 2 4 1.0 2 5 1.0 1.0 0 1 1]
 ['A' 'Kvernadze' 2 3 1.0 1 6 1.0 1.0 0 1 1]
 ['A' 'Kaio Jorge' 2 1 1.0 1 9 1.0 0.0 6 1 0]
 ['A' 'Bidaoui' 2 1 1.0 1 1 1.0 1.0 0 1 1]
 ['A' 'Retegui' 3 23 4.0 5 128 64.71428571428571 15.27908863285207 14 65
  15]
 ['A' 'Ekuban' 3 3 1.0 2 2 1.0 1.0 0 1 1]
 ['A' 'Puscas' 3 1 1.0 1 2 1.0 1.0 0 1 1]
 ['A' 'Yalcin' 3 1 1.0 1 1 1.0 1.0 0 1 1]
 ['A' 'Martinez L.' 5 40 4.8 5 436 331.6428571428572 38.15677607502447 14
  332 38]
 ['A' 'Thuram' 5 25 4.0 4 180 131.9285714285714 20.5817853965561 14 132
  21]
 ['A' 'Arnautovic' 5 23 3.6 3 122 32.78571428571428 15.37319631160563 14
  

NameError: name 'can' is not defined

## Ridge Regressors

**Attackers**

In [191]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-2], data[:, -2:]

# Define model
model = Ridge(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_A.pkl')

Mean MSE: 1.612 (0.249)


['ridge_regressor_model_A.pkl']

In [193]:
row = [[5, 4.6, 100]]
print(model.predict(row))

[[50.70255599 13.5998253 ]]


**Midfielders**

In [99]:
# Get the data to fit
data = dataframe_C.values
X, y = data[:, 2:-2], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=8, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 4.5, 7.5]]
print(model.predict(row))

# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_C.pkl')

Mean MSE: 4.813 (0.741)
[[ 7.38371268 49.51152719]]


['ridge_regressor_model_C.pkl']

**Defenders**

In [84]:
# Get the data to fit
data = dataframe_D.values
X, y = data[:, 2:-2], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 4, 6.5]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_D.pkl')

Mean MSE: 3.028 (0.490)


['ridge_regressor_model_D.pkl']

**Keepers**

In [86]:
# Get the data to fit
data = dataframe_P.values
X, y = data[:, 2:-2], data[:, -3:-1]

# Define model
model = Ridge(alpha=1.5)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 4, 5.2]]
model.predict(row)
# Save the model to a file
joblib.dump(model, 'ridge_regressor_model_P.pkl')

Mean MSE: 3.787 (0.784)


['ridge_regressor_model_P.pkl']

## Lasso Regressors

**Attackers**

In [89]:
# Get the data to fit
data = dataframe_A.values
X, y = data[:, 2:-2], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[4, 4.1, 7.5]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_A.pkl')

Mean MSE: 14.626 (2.597)


['lasso_regressor_model_A.pkl']

In [91]:
row = [[4, 4.1, 7.5]]
model.predict(row)

array([[ 6.88475248, 85.51941681]])

**Midfielders**

In [96]:
# Get the data to fit
data = dataframe_C.values
X, y = data[:, 2:-2], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=8, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 5, 8.4]]
print(model.predict(row))

# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_C.pkl')

Mean MSE: 4.599 (0.861)
[[ 6.35587209 54.90152561]]


['lasso_regressor_model_C.pkl']

**Defenders**

In [60]:
# Get the data to fit
data = dataframe_D.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 15, 4, 5, 50]]
model.predict(row)

# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_D.pkl')

Mean MSE: 2.194 (0.286)


['lasso_regressor_model_D.pkl']

**Keepers**

In [61]:
# Get the data to fit
data = dataframe_P.values
X, y = data[:, 2:-4], data[:, -3:-1]

# Define model
model = Lasso(alpha=1)
# define model evaluation method
cv = RepeatedKFold(n_splits=4, n_repeats=3, random_state=42)
# evaluate model
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = np.abs(scores)
print('Mean MSE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))
model.fit(X, y)

row = [[5, 20, 4, 4, 80]]
model.predict(row)
# Save the model to a file
joblib.dump(model, 'lasso_regressor_model_P.pkl')

Mean MSE: 1.884 (0.376)


['lasso_regressor_model_P.pkl']