In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Load players data

In [34]:
# Read the player spreadsheet into a DataFrame and preview its first rows.
DATA_PATH = 'fifa22.xlsx'
data = pd.read_excel(DATA_PATH)
data.head()

Unnamed: 0,Name,Age,Nationality,Overall,Potential,Club,Position,Crossing,Finishing,HeadingAccuracy,...,Penalties,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,Value
0,L. Messi,31,Argentina,94,94,FC Barcelona,RF,84.0,95.0,70.0,...,75.0,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,110.5
1,Cristiano Ronaldo,33,Portugal,94,94,Juventus,ST,84.0,94.0,89.0,...,85.0,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,77.0
2,Neymar Jr,26,Brazil,92,93,Paris Saint-Germain,LW,79.0,87.0,62.0,...,81.0,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,118.5
3,De Gea,27,Spain,91,93,Manchester United,GK,17.0,13.0,21.0,...,40.0,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,72.0
4,K. De Bruyne,27,Belgium,91,92,Manchester City,RCM,93.0,82.0,55.0,...,79.0,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,102.0


# Preprocessing

In [35]:
# Number of missing values in each column (summing the boolean mask down the rows).
missing_mask = data.isnull()
missing_mask.sum(axis=0)

Name                0
Age                 0
Nationality         0
Overall             0
Potential           0
Club                0
Position           48
Crossing           48
Finishing          48
HeadingAccuracy    48
ShortPassing       48
Volleys            48
Dribbling          48
Curve              48
FKAccuracy         48
LongPassing        48
BallControl        48
Acceleration       48
SprintSpeed        48
Agility            48
Reactions          48
Balance            48
ShotPower          48
Jumping            48
Stamina            48
Strength           48
LongShots          48
Aggression         48
Interceptions      48
Positioning        48
Vision             48
Penalties          48
Composure          48
Marking            48
StandingTackle     48
SlidingTackle      48
GKDiving           48
GKHandling         48
GKKicking          48
GKPositioning      48
Value               0
dtype: int64

Drop the rows that contain missing values

how='any': If any value is missing (NaN) in a row, that row will be dropped.

how='all': Only rows in which every value is missing (NaN) will be dropped.

In [36]:
# Drop every row that has at least one missing value.
# Rebinding the name (instead of inplace=True) leaves the originally loaded
# object untouched, so the cell is safe to re-run and no earlier display of
# the frame silently goes stale; inplace=True brings no performance benefit.
data = data.dropna(how='any')

In [37]:
# Total number of missing cells left in the whole frame
# (per-column counts first, then their grand total).
nulls_per_column = data.isnull().sum()
nulls_per_column.sum()

0

In [38]:
# Features: every column except the player's name (an identifier, not a
# predictor) and the target column.
# Selecting by label rather than by position keeps the target out of X even if
# the spreadsheet's column order changes, and drop() returns a new frame, so
# the column assignments done later by Feature_Encoder never write into a
# slice of `data` (the situation pandas flags with SettingWithCopyWarning).
X = data.drop(columns=['Name', 'Value'])  # Features
Y = data['Value']  # Label (the final cell reports this value in millions)

In [39]:
def Feature_Encoder(X, cols):
    """Replace each listed column of ``X`` with integer codes from a LabelEncoder.

    A fresh encoder is fitted per column on that column's own values and the
    column is then overwritten with the transformed codes.

    Parameters
    ----------
    X : DataFrame whose columns are encoded. It is modified in place.
    cols : iterable of column names to encode (e.g. nationality, club, position).

    Returns
    -------
    The same ``X`` object that was passed in, with the listed columns encoded.
    """
    for column in cols:
        raw_values = list(X[column].values)
        # Fit and transform stay separate calls on the same value list.
        encoder = LabelEncoder().fit(raw_values)
        X[column] = encoder.transform(raw_values)
    return X

In [40]:
# First four nationalities, still as text, before encoding.
nationality_raw = X['Nationality']
nationality_raw[:4]

0    Argentina
1     Portugal
2       Brazil
3        Spain
Name: Nationality, dtype: object

In [41]:
# Text-valued columns that must become numeric before the model can use them.
# NOTE(review): integer label codes impose an arbitrary ordering on these
# nominal columns, which a linear model will treat as numeric magnitude —
# consider one-hot encoding instead.
cols = (
    'Nationality',
    'Club',
    'Position',
)

X = Feature_Encoder(X, cols)

In [42]:
# Same four rows after encoding: the country names are now integer codes.
nationality_encoded = X['Nationality']
nationality_encoded[:4]

0      6
1    122
2     20
3    138
Name: Nationality, dtype: int32

Split the data into training and testing sets

In [43]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(17907, 39)

In [44]:
# Hold out 30% of the players for testing.
# A fixed random_state makes the shuffle — and therefore the split sizes'
# contents, the fitted coefficients and the reported error — identical on
# every Restart & Run All. Without it each run evaluates a different split.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, shuffle=True, random_state=RANDOM_STATE
)

X_train.shape, X_test.shape

((12534, 39), (5373, 39))

# Model

In [45]:
# Fit a linear regression on the training split.
# (`cls` is the name later cells use for the fitted model.)
cls = linear_model.LinearRegression()

cls.fit(X=X_train, y=y_train)

# Evaluation

In [46]:
# Predicted values for every player in the held-out test split.
prediction = cls.predict(X=X_test)

In [47]:
# Print (predicted, actual) pairs for the first ten test players.
for predicted_value, actual_value in zip(prediction[:10], y_test[:10]):
    print((predicted_value, actual_value))

(0.5044534513116332, 0.35)
(-1.167395234571547, 0.2)
(6.996903042559495, 5.0)
(1.609532497321993, 0.525)
(13.249855937279172, 27.0)
(-2.091994127367844, 0.12)
(3.463981018971907, 3.9)
(3.7711956078319133, 1.7)
(-2.246431775227297, 0.12)
(8.944649346732035, 9.0)


In [48]:
# Report the fitted parameters, then the mean squared error on the test split.
print('Co-efficient of linear regression', cls.coef_)
print('Intercept of linear regression model', cls.intercept_)
test_mse = metrics.mean_squared_error(y_test, prediction)
print('Mean Square Error', test_mse)

Co-efficient of linear regression [-2.82782455e-01 -6.80026485e-04  5.74726638e-01  6.55343797e-02
  4.10063194e-04 -1.13877637e-02 -6.05576699e-03  1.83473187e-02
 -1.24447214e-02 -1.45801830e-02  3.46807934e-02 -1.46355337e-02
 -1.19232995e-03  9.17507036e-03  7.59066721e-03 -3.97480430e-02
 -6.90472094e-03 -6.10183508e-03 -1.75215473e-02  5.41377879e-02
  1.28853673e-02 -2.43053381e-02 -1.87086693e-04 -1.70327014e-03
 -1.17192948e-02 -2.12310531e-02 -8.43358293e-03 -9.83167064e-03
  1.31584700e-02  2.13498638e-02  1.83535882e-03  4.91289356e-03
  5.59488126e-03  6.72341706e-03 -7.14186647e-03 -2.55578092e-02
 -9.50749715e-03  1.51717513e-02 -2.77709657e-02]
Intercept of linear regression model -30.942540537831455
Mean Square Error 14.295184964767152


In [49]:
# Compare actual and predicted value for the first player of the test split.
# y_test keeps its original (shuffled) index labels, so it is converted to an
# array to take the first element by position.
y_test_values = np.asarray(y_test)
true_player_value = y_test_values[0]
predicted_player_value = prediction[0]

print('True value for the first player in the test set in millions is : ', true_player_value, sep='')
print('Predicted value for the first player in the test set in millions is : ', predicted_player_value, sep='')

True value for the first player in the test set in millions is : 0.35
Predicted value for the first player in the test set in millions is : 0.5044534513116332
