## Consolidating ML Algorithms for NAC Dataset

In [34]:
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, precision_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [25]:
NAC_data =  pd.read_csv('NAC_data.csv')
NAC_df =  pd.DataFrame(NAC_data)
NAC_df.head()

Unnamed: 0.1,Unnamed: 0,Player,Team,Team within selected timeframe,Position,Age,Market value,Contract expires,Matches played,Minutes played,...,Prevented goals per 90,Back passes received as GK per 90,Exits per 90,Aerial duels per 90.1,Free kicks per 90,Direct free kicks per 90,"Direct free kicks on target, %",Corners per 90,Penalties taken,"Penalty conversion, %"
0,0,P. Iemmello,Catanzaro,Catanzaro,CF,31.0,750000,,38,3109,...,,0.0,,,0.0,0.0,0.0,0.0,5,80.0
1,1,J. Petriccione,Crotone,Crotone,"DMF, RDMF, RCMF",28.0,700000,2024-06-30,36,3341,...,,0.32,,,1.29,0.11,25.0,1.97,0,0.0
2,2,T. Biasci,Catanzaro,Catanzaro,"CF, LWF, AMF",28.0,550000,2024-06-30,38,2488,...,,0.07,,,0.0,0.0,0.0,0.0,2,50.0
3,3,E. Volpicelli,Sangiuliano City,Sangiuliano City,"CF, RWF, AMF",30.0,500000,2024-06-30,34,2639,...,,0.1,,,2.28,0.55,50.0,3.04,0,0.0
4,4,A. Curcio,Catanzaro,Catanzaro,"CF, AMF, LWF",33.0,500000,2024-06-30,38,1456,...,,0.0,,,0.62,0.31,20.0,0.31,1,0.0


### Linear Regression

In [24]:
# I decided to exclude the players whose market value is 0 for this linear regression because
# I only want to focus on experienced players and who are still under contract/ were signed with a new team.
NAC_df_copy = NAC_df.copy()
NAC_df_copy = NAC_df_copy.dropna(subset='Market value')

# I decided to keep only certain columns that I consider important metrics for determining a player's expected goals.
columns_to_keep = ['Market value','Age', 'Minutes played', 'Aerial duels won, %', 'Goals', 'Assists', 'Shots on target, %', 'Goal conversion, %', 'Accurate passes, %', 'xG', 'xA']
NAC_df_copy.drop(columns=NAC_df_copy.columns.difference(columns_to_keep), inplace=True)

# Some columns containted NaN values. I used imputation to handle them.
from sklearn.impute import SimpleImputer
numeric_columns = NAC_df_copy.select_dtypes(include=['number']).columns
df_numeric = NAC_df_copy[numeric_columns]

# Imputing with the mean
imputer = SimpleImputer(strategy='mean')
df_numeric_imputed = pd.DataFrame(imputer.fit_transform(df_numeric), columns=df_numeric.columns)
df_imputed = pd.concat([df_numeric_imputed, NAC_df_copy[NAC_df_copy.columns.difference(numeric_columns)]], axis=1)

# Splitting the dataset into features and target variables
X_nac = df_imputed.drop('xG', axis=1)
y_nac = df_imputed['xG']

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_nac, y_nac, test_size=0.2, random_state=20)

# Creating a linear regression model
NAC_model =  LinearRegression()
NAC_model.fit(X_train, y_train)  # Fit the model using the training data

# Making predictions on the test set
y_pred_nac = NAC_model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred_nac)
rmse = mse ** 0.5  # Root Mean Squared Error
r2 = r2_score(y_test, y_pred_nac)

print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)
print("R squared:", r2)

Mean Squared Error: 1.0262954536534263
Root Mean Squared Error: 1.0130624135034456
R squared: 0.8558889521700708


### Logistic Regression

In [32]:
# Extracting features and target
X = NAC_df['Age'].values
y = NAC_df['Market value'].values
X = X.reshape(-1, 1)

# Impute NaN values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.4, random_state=42)

# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate the model
logreg = LogisticRegression(max_iter=10000)

# Fit the model
logreg.fit(X_train, y_train)

# Predict probabilities
y_pred = logreg.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(accuracy)
print(conf_matrix)
print(class_report)

0.18869065618385245
[[950   0   0 ...   0   0   0]
 [ 19   0   0 ...   0   0   0]
 [ 56   0   0 ...   0   0   0]
 ...
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]
 [  1   0   0 ...   0   0   0]]
              precision    recall  f1-score   support

           0       0.25      0.87      0.39      1092
       10000       0.00      0.00      0.00        28
       25000       0.00      0.00      0.00        70
       50000       0.00      0.00      0.00       198
       75000       0.00      0.00      0.00       182
      100000       0.00      0.00      0.00       341
      125000       0.00      0.00      0.00       139
      150000       0.00      0.00      0.00       453
      175000       0.00      0.00      0.00       178
      200000       0.10      0.53      0.17       566
      225000       0.00      0.00      0.00        84
      250000       0.00      0.00      0.00       420
      275000       0.00      0.00      0.00        59
      300000       0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Clustering

In [35]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Step 1: Preprocess the data
X = NAC_df.drop(['Position'], axis=1)
X_encoded = pd.get_dummies(X)

# Handle NaN values using a simple imputer
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X_encoded), columns=X_encoded.columns)

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

# Step 2: Apply k-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  
cluster_labels = kmeans.fit_predict(X_scaled)

# Step 3: Add cluster labels to the original DataFrame
NAC_df['Cluster_KMeans'] = cluster_labels

# Display the resulting DataFrame with cluster labels
print(NAC_df.head())

  super()._check_params_vs_input(X, default_n_init=10)


   Unnamed: 0          Player              Team  \
0           0     P. Iemmello         Catanzaro   
1           1  J. Petriccione           Crotone   
2           2       T. Biasci         Catanzaro   
3           3   E. Volpicelli  Sangiuliano City   
4           4       A. Curcio         Catanzaro   

  Team within selected timeframe         Position   Age  Market value  \
0                      Catanzaro               CF  31.0        750000   
1                        Crotone  DMF, RDMF, RCMF  28.0        700000   
2                      Catanzaro     CF, LWF, AMF  28.0        550000   
3               Sangiuliano City     CF, RWF, AMF  30.0        500000   
4                      Catanzaro     CF, AMF, LWF  33.0        500000   

  Contract expires  Matches played  Minutes played  ...  \
0              NaN              38            3109  ...   
1       2024-06-30              36            3341  ...   
2       2024-06-30              38            2488  ...   
3       2024-06-30

### Random forests

In [39]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Perform one-hot encoding on other categorical columns
NAC_df_encoded = pd.get_dummies(NAC_df)

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean') 
X_imputed = imputer.fit_transform(NAC_df_encoded.drop('xG', axis=1))

# Splitting data
X = X_imputed
y = NAC_df_encoded['xG'].values

# i.e. 70 % training dataset and 30 % test datasets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) 

# creating a RF regressor 
regressor = RandomForestRegressor(n_estimators=100, random_state=42)   
  
# Training the model on the training dataset 
regressor.fit(X_train, y_train) 
  
# performing predictions on the test dataset 
y_pred = regressor.predict(X_test) 

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R squared:", r2)

Mean Squared Error: 0.0627261076839347
R squared: 0.9900369783578327


### Gradient boosting

In [48]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

NAC_df_encoded = pd.get_dummies(NAC_df)

NAC_df_encoded['On loan_yes'] = label_encoder.fit_transform(NAC_df_encoded['On loan_yes'])

X = NAC_df_encoded.drop(["On loan_yes"], axis=1).values
y = NAC_df_encoded['On loan_yes'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from xgboost import XGBClassifier

classifier = XGBClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(accuracy)
print(conf_matrix)
print(class_report)

1.0
[[3108    0]
 [   0  199]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3108
           1       1.00      1.00      1.00       199

    accuracy                           1.00      3307
   macro avg       1.00      1.00      1.00      3307
weighted avg       1.00      1.00      1.00      3307

