In [1]:
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Step 1: Read the merged-data CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/merged_data_with_whoscored.csv')

In [4]:
# Step 1b: Delete irrelevant columns
columns_to_delete = [
    'Player_URL', 'Player_x', 'Player1', 'Player_URL2', 'Player3', 'Player4',
    'Player5', 'Apps', 'Mins', 'Goals', 'Assists', 'Yel', 'SpG', 'PS',
    'AerialsWon', 'MotM', 'Red', 'Unnamed: 0.1', 'Player_y', 'Squad', 'Age',
    'Born', '90s', 'Based', 'Position', 'player_code', 'Unnamed: 0',
    'player_id', 'name', 'country_of_birth', 'date_of_birth', 'foot',
    'height_in_cm', 'contract_expiration_date', 'date', 'market_value_in_eur',
]

# Drop everything in one call. A misspelled or already-removed label raises a
# single KeyError before any column is touched, instead of failing midway
# through a loop and leaving the frame partially modified.
df = df.drop(columns=columns_to_delete)

In [5]:
# Step 1c: Delete the stats that are excluded from the candidate features
stats_to_exclude = ['Gls', 'Sh', 'SoT', 'SoT%', 'Dist', 'FK', 'PK', 'PKatt']

# One atomic drop: if any label is missing, a KeyError is raised and `df` is
# left unchanged, so the cell never leaves the frame half-processed.
df = df.drop(columns=stats_to_exclude)

In [6]:
# Keep only the rows whose 'sub_position' is exactly 'Centre-Back'
df_centre_backs = df.loc[df['sub_position'].eq('Centre-Back')]

In [7]:
# Preview the first five centre-back rows
df_centre_backs.head(n=5)

Unnamed: 0,Rating,Sh/90,SoT/90,G/Sh,G/SoT,xG,npxG,npxG/Sh,G-xG,np:G-xG,...,Ast,xAG,xA,A-xAG,KP,3-Jan,PPA,CrsPA,PrgP,sub_position
16,6.35,0.51,0.13,0.13,0.5,0.0,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0,0,0,0,0,Centre-Back
84,6.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.4,0.8,-0.4,4,88,13,2,104,Centre-Back
86,6.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0.4,0.5,-0.4,2,51,6,0,57,Centre-Back
134,6.66,0.12,0.0,0.0,0.0,0.1,0.1,0.03,-0.1,-0.1,...,0,0.8,0.8,-0.8,12,111,9,2,117,Centre-Back
135,6.57,0.15,0.0,0.0,0.0,0.1,0.1,0.04,-0.1,-0.1,...,0,0.8,0.5,-0.8,9,62,10,2,90,Centre-Back


In [8]:
# Target: the 'Rating' column
y_centre_backs = df_centre_backs['Rating']
# Predictors: every other column except 'sub_position'
X_centre_backs = df_centre_backs.drop(['Rating', 'sub_position'], axis=1)

In [9]:
# Preview the first five rows of the predictor matrix
X_centre_backs.head(n=5)

Unnamed: 0,Sh/90,SoT/90,G/Sh,G/SoT,xG,npxG,npxG/Sh,G-xG,np:G-xG,Tkl,...,Long Cmp%,Ast,xAG,xA,A-xAG,KP,3-Jan,PPA,CrsPA,PrgP
16,0.51,0.13,0.13,0.5,0.0,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,0.0,0,0,0,0,0
84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,...,70.7,0,0.4,0.8,-0.4,4,88,13,2,104
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18,...,63.8,0,0.4,0.5,-0.4,2,51,6,0,57
134,0.12,0.0,0.0,0.0,0.1,0.1,0.03,-0.1,-0.1,68,...,56.7,0,0.8,0.8,-0.8,12,111,9,2,117
135,0.15,0.0,0.0,0.0,0.1,0.1,0.04,-0.1,-0.1,43,...,50.0,0,0.8,0.5,-0.8,9,62,10,2,90


In [10]:
# Preview the first five target values
y_centre_backs.head(n=5)

16     6.35
84     6.43
86     6.39
134    6.66
135    6.57
Name: Rating, dtype: float64

In [11]:
# Forward sequential feature selection around a linear regression model:
# select 5 features, scoring candidate subsets by negative mean squared error.
lreg = LinearRegression()
sfs1_centre_backs = sfs(
    estimator=lreg,
    k_features=5,
    forward=True,
    verbose=2,
    scoring='neg_mean_squared_error',
)

In [12]:
# Run the selection on the centre-back predictors and ratings
sfs1_centre_backs = sfs1_centre_backs.fit(X=X_centre_backs, y=y_centre_backs)

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s

[2024-05-05 21:52:01] Features: 1/5 -- score: -0.025564448414405756[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.7s

[2024-05-05 21:52:01] Features: 2/5 -- score: -0.02138625078881321[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.8s

[2024-05-05 21:52:02] Features: 3/5 -- score: -0.019051186261374215[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.8s

[2024-05-05 21:52:04] Features: 4/5 -- score: -0.01827259165710484[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.8s

[2024-05-05 21:52:04] Features: 5/5 -- score: -0.017457307611124438

In [13]:
# Collect the names of the selected features into a list and print it
feat_names = [*sfs1_centre_backs.k_feature_names_]
print(feat_names)

['Sh/90', 'G-xG', 'Lost', 'Tkl+Int', 'Total Att']
