In [114]:
import pandas as pd
import numpy as np

# Load the raw player table, then report its column labels and dimensions.
df = pd.read_csv("data.csv")
n_players, n_features = df.shape
print(df.columns)
print(f"Data on {n_players} players, {n_features} features")

Index(['name', 'full_name', 'birth_date', 'age', 'height_cm', 'weight_kgs',
       'positions', 'nationality', 'overall_rating', 'potential', 'value_euro',
       'wage_euro', 'preferred_foot', 'international_reputation(1-5)',
       'weak_foot(1-5)', 'skill_moves(1-5)', 'body_type',
       'release_clause_euro', 'national_team', 'national_rating',
       'national_team_position', 'national_jersey_number', 'crossing',
       'finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'dribbling', 'curve', 'freekick_accuracy', 'long_passing',
       'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
       'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
       'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
       'composure', 'marking', 'standing_tackle', 'sliding_tackle'],
      dtype='object')
Data on 17954 players, 51 features


In [115]:
# Columns removed before modelling (identifiers, dates and free-text categories).
columns_to_drop = [
    "name",
    "full_name",
    "birth_date",
    "national_team",
    "national_team_position",
    "body_type",
    "nationality",
]
df = df.drop(columns=columns_to_drop)
print(df.columns)

Index(['age', 'height_cm', 'weight_kgs', 'positions', 'overall_rating',
       'potential', 'value_euro', 'wage_euro', 'preferred_foot',
       'international_reputation(1-5)', 'weak_foot(1-5)', 'skill_moves(1-5)',
       'release_clause_euro', 'national_rating', 'national_jersey_number',
       'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'dribbling', 'curve', 'freekick_accuracy', 'long_passing',
       'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
       'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
       'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
       'composure', 'marking', 'standing_tackle', 'sliding_tackle'],
      dtype='object')


Transformation de la feature positions pour ne garder que la première position

In [116]:
# Keep only the first listed position of each player ("CF,RW,ST" -> "CF").
# The vectorized .str accessor replaces the Python-level loop and propagates
# a missing entry as NaN instead of raising AttributeError on a non-string.
df["positions"] = df["positions"].str.split(",").str[0]
print(df["positions"])

0         CF
1        CAM
2         CM
3         LW
4         CB
        ... 
17949     RM
17950     LB
17951     GK
17952     ST
17953     ST
Name: positions, Length: 17954, dtype: object


In [117]:
# Position codes in the order used for the numeric encoding.
positions = [
    "GK",
    "CB", "LB", "LWB", "RB", "RWB",
    "CDM", "LM", "RM", "CM", "CAM",
    "RW", "LW", "CF", "ST",
]
# Spread the codes evenly over [0, 1]: the first code maps to 0.0, the last to 1.0.
position_values = np.linspace(0, 1, len(positions))
positions_dict = dict(zip(positions, position_values))

{'GK': 0.0, 'CB': 0.07142857142857142, 'LB': 0.14285714285714285, 'LWB': 0.21428571428571427, 'RB': 0.2857142857142857, 'RWB': 0.3571428571428571, 'CDM': 0.42857142857142855, 'LM': 0.5, 'RM': 0.5714285714285714, 'CM': 0.6428571428571428, 'CAM': 0.7142857142857142, 'RW': 0.7857142857142857, 'LW': 0.8571428571428571, 'CF': 0.9285714285714285, 'ST': 1.0}


In [118]:
# Encode each position code with its numeric value from positions_dict.
# Series.map turns every value that is not a key of the dict into NaN, so
# fail loudly rather than silently corrupting the column (this also catches
# an accidental re-run of the cell on already-encoded values).
encoded_positions = df["positions"].map(positions_dict)
unmapped_positions = df.loc[encoded_positions.isna(), "positions"].unique()
if len(unmapped_positions) > 0:
    raise ValueError(
        f"positions not found in positions_dict: {list(unmapped_positions)}"
    )
df["positions"] = encoded_positions

0        0.928571
1        0.714286
2        0.642857
3        0.857143
4        0.071429
           ...   
17949    0.571429
17950    0.142857
17951    0.000000
17952    1.000000
17953    1.000000
Name: positions, Length: 17954, dtype: float64


In [119]:
# Binary-encode the preferred foot: "Left" becomes 0 and "Right" becomes 1.
foot_codes = {"Left": 0, "Right": 1}
df["preferred_foot"] = df["preferred_foot"].map(foot_codes)

0        0
1        1
2        1
3        1
4        1
        ..
17949    1
17950    0
17951    1
17952    1
17953    1
Name: preferred_foot, Length: 17954, dtype: int64


Standardisation (moyenne 0, écart-type 1)

In [127]:
from sklearn.preprocessing import StandardScaler
# Standardize every column of df, then rebuild a DataFrame that carries the
# original column labels.
# NOTE(review): the scaler is fitted on all rows and on every column of df,
# including the target "overall_rating", so downstream errors and predictions
# are expressed in standardized units and the test rows influence the scaling.
scaler = StandardScaler().fit(df)
X_scaled = scaler.transform(df)
df_scaled = pd.DataFrame(data=X_scaled, columns=df.columns)

Régression

In [128]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Separate the regression target from the predictor columns.
target_column = "overall_rating"
y = df_scaled[target_column]
X = df_scaled.drop(columns=[target_column])

In [129]:
# Drop every column of the scaled frame that still contains a missing value.
# The columns are taken from df_scaled itself (not from df), so the cell can
# be re-run without a KeyError on columns that were already removed.
columns_with_nan = df_scaled.columns[df_scaled.isnull().any()]
df_scaled = df_scaled.drop(columns=columns_with_nan)

# Sanity check on the frame that is actually used for training: it must not
# contain any NaN at this point. (The previous check was inverted and looked
# at the unscaled df, so it could never detect a leftover NaN.)
if df_scaled.isnull().values.any():
    raise ValueError("still null values")

# Rebuild the predictors and the target from the cleaned frame.
X = df_scaled.drop(["overall_rating"], axis=1)
y = df_scaled["overall_rating"]

In [137]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split

# Ordinary least-squares fit on the training part only.
model = LinearRegression()
model.fit(X_train, y_train)

In [143]:
# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)

# Both metrics take (true values, predictions) in that order.
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.0816215162074169
R^2 Score: 0.920805771350298


Exemple avec Messi

In [144]:
# Single-player example on the first row of the data.
# NOTE(review): row 0 is taken to be Messi per the section title; the name
# columns were dropped earlier, so this cannot be checked from here. The row
# may also belong to the training split.
messi = X.values[0]
messi_true_rating = y.values[0]
print(messi,messi_true_rating)
messi = messi.reshape(1, -1)
# Predict on a one-row DataFrame slice rather than on the bare array: the
# model was fitted on a DataFrame, and passing an ndarray triggers
# scikit-learn's "X does not have valid feature names" warning.
messi_predicted_rating = model.predict(X.iloc[[0]])
print(f"prediction = {messi_predicted_rating[0]}, true = {messi_true_rating}")

[ 1.15491792 -0.33978909 -0.45190277  1.43983616  3.68103828 -1.81725626
  9.91522846  1.58859173  2.14748869  1.96411457  2.52755259  1.01886029
  2.24951039  2.4291497   2.1853364   2.4761865   2.9212345   2.35602748
  2.23855057  1.75347675  1.43975093  2.00063951  3.65582144  2.19632616
  1.71059007  0.26011798  0.55702055  0.06710083  2.42668374 -0.44939219
 -1.1881023   2.24145576  2.86767181  1.68513713  3.21024678 -0.70684287
 -0.91043202 -0.92580278] 3.9864562009997275
prediction = 4.142105087176791, true = 3.9864562009997275


