# FIONA HARIA 60009220048 D1-1 <br> ML MODELLING <BR> Predicting Football Player Price Prediction

## Importing the Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error,accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

## Loading the training datatset

In [3]:
df_train = pd.read_csv('train.csv')

## Splitting into X and y sets

In [4]:
X=df_train.drop(['Name','Value €'],axis=1)
y=df_train.iloc[:,2]

In [5]:
X.dtypes

Age                 int64
Wage €            float64
Ball Control        int64
Dribbling           int64
Marking             int64
Slide Tackle        int64
Stand Tackle        int64
Aggression          int64
Reactions           int64
Att. Position       int64
Interceptions       int64
Vision              int64
Composure           int64
Crossing            int64
Short Pass          int64
Long pass           int64
Acceleration        int64
Stamina             int64
Strength            int64
Balance             int64
Sprint Speed        int64
Agility             int64
Jumping             int64
Heading             int64
Shot Power          int64
Finishing           int64
Long Shots          int64
Curve               int64
FK Acc.             int64
Penalties           int64
Volleys             int64
GK Positioning      int64
GK Diving           int64
GK Handling         int64
GK Kicking          int64
GK Reflexes         int64
dtype: object

In [6]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

## Scaling the dataset

### We will train our dataset on both the the scaled and un-scaled data to see if standardiasation is better

In [7]:
Xscaled=StandardScaler().fit(X).transform(X)

In [8]:
X_trainscaled,X_testscaled,y_trainscaled,y_testscaled=train_test_split(Xscaled,y,test_size=0.3)

## Using regression models for football player price prediction leverages its ability to model continuous relationships between player attributes and market value, offering accurate predictions and

# Linear Regression

### (without scaling)

In [9]:
regr = LinearRegression()
fitregr=regr.fit(X_train, y_train)
y_pred=fitregr.predict(X_test)

In [10]:
print('Linear model, R2 test score is : {} and the test root mean square without scaling is: {}'
     .format(r2_score(y_test, y_pred),(np.sqrt(mean_squared_error(y_pred,y_test )))))

Linear model, R2 test score is : 0.6923829236911446 and the test root mean square without scaling is: 4257637.897708077


### (with scaling)

In [11]:
scalereg = LinearRegression()
scalereg.fit(X_trainscaled, y_trainscaled)

In [12]:
y_pred=scalereg.predict(X_testscaled)
print('Linear model, R2 test score is : {} and the test root mean square without scaling is: {}'
     .format(r2_score(y_testscaled, y_pred),(np.sqrt(mean_squared_error(y_pred,y_testscaled)))))

Linear model, R2 test score is : 0.6752219961785132 and the test root mean square without scaling is: 4344247.279966679


### *The results on data without scaling is better than scaled data <br> Linear Regression Accuracy = 0.71*

# Random Forest Regressor

In [13]:
RFmodel = RandomForestRegressor()


param = {'n_estimators' : [400,450,480], 
         'max_depth' : [100,120,140],
         'min_samples_split':[4],
         'min_samples_leaf':[2],
         'bootstrap' : [True]
        }

gridSearch_RandomForest=GridSearchCV(RFmodel,param,scoring='r2',cv=3)
gridSearch_RandomForest.fit(X_train,y_train)

best_randomForest=gridSearch_RandomForest.best_estimator_
bestRandomForest_testScore=best_randomForest.score(X_test,y_test)

In [14]:
print('The best Random Forest R2 train score is : {:.2f} with n estimators = {:.2f}, max depth : {:.2f}, min samples split : {} and min samples leaf : {}  \n \
'.format(gridSearch_RandomForest.best_score_,gridSearch_RandomForest.best_params_['n_estimators'],gridSearch_RandomForest.best_params_['max_depth'],gridSearch_RandomForest.best_params_['min_samples_split'],gridSearch_RandomForest.best_params_['min_samples_leaf'] ))
print('The best Random Forest R2 test score is : {:.2f} with n estimators = {:.2f}, max depth : {:.2f}, min samples split : {} and min samples leaf : {}  \n \
'.format(bestRandomForest_testScore,gridSearch_RandomForest.best_params_['n_estimators'],gridSearch_RandomForest.best_params_['max_depth'],gridSearch_RandomForest.best_params_['min_samples_split'],gridSearch_RandomForest.best_params_['min_samples_leaf'] ))

The best Random Forest R2 train score is : 0.85 with n estimators = 450.00, max depth : 140.00, min samples split : 4 and min samples leaf : 2  
 
The best Random Forest R2 test score is : 0.89 with n estimators = 450.00, max depth : 140.00, min samples split : 4 and min samples leaf : 2  
 


## *Random Forest Regressor Accuracy = 0.90*

# Decision Tree Regressor

In [15]:
DTmodel = DecisionTreeRegressor()

param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search_DT = GridSearchCV(DTmodel, param_grid, scoring='r2', cv=3)
grid_search_DT.fit(X_train, y_train)

best_decision_tree = grid_search_DT.best_estimator_
best_decision_tree_test_score = best_decision_tree.score(X_test, y_test)

In [16]:
print('The best Decision Tree R2 train score is: {:.2f} with max_depth = {}, min_samples_split = {} and min_samples_leaf = {}'
      .format(grid_search_DT.best_score_,
              grid_search_DT.best_params_['max_depth'],
              grid_search_DT.best_params_['min_samples_split'],
              grid_search_DT.best_params_['min_samples_leaf']))

print('The best Decision Tree R2 test score is: {:.2f} with max_depth = {}, min_samples_split = {} and min_samples_leaf = {}'
      .format(best_decision_tree_test_score,
              grid_search_DT.best_params_['max_depth'],
              grid_search_DT.best_params_['min_samples_split'],
              grid_search_DT.best_params_['min_samples_leaf']))

The best Decision Tree R2 train score is: 0.74 with max_depth = 10, min_samples_split = 10 and min_samples_leaf = 1
The best Decision Tree R2 test score is: 0.78 with max_depth = 10, min_samples_split = 10 and min_samples_leaf = 1


## *Decision Tree Regressor Accuracy = 0.8*

# Support Vector Regression

### (without scaling)

In [17]:
C=[100000,150000,200000,250000 ]
for i in C:
    svr_Model = SVR(C = i).fit(X_train, y_train)
    r2_train_svr = svr_Model.score(X_train, y_train)
    r2_test_svr=svr_Model.score(X_test, y_test)
    print('C = {:.2f}\n \
SVR R2 training: {:.2f}, R2 test: {:.2f}\n'
         .format(i, r2_train_svr, r2_test_svr))

C = 100000.00
 SVR R2 training: 0.37, R2 test: 0.35

C = 150000.00
 SVR R2 training: 0.41, R2 test: 0.39

C = 200000.00
 SVR R2 training: 0.43, R2 test: 0.41

C = 250000.00
 SVR R2 training: 0.45, R2 test: 0.43



### (with scaling)

In [18]:
svr_Model=SVR()

param = {'C' : [100000,150000,200000,250000 ]}

gridSearchSVR=GridSearchCV(svr_Model,param,scoring='r2',cv=5)
gridSearchSVR.fit(X_trainscaled,y_trainscaled)


best_SVR=gridSearchSVR.best_estimator_
bestSVR_testScore=best_SVR.score(X_testscaled,y_testscaled)

In [19]:
print('The best R2 train score is : {:.2f} with C = {:.2f}\n \
'.format(gridSearchSVR.best_score_,gridSearchSVR.best_params_['C']))
print('The best R2 test score is : {:.2f}\n with Alpha = {:.2f}\n \
'.format(bestSVR_testScore,gridSearchSVR.best_params_['C']))

The best R2 train score is : 0.39 with C = 250000.00
 
The best R2 test score is : 0.42
 with Alpha = 250000.00
 


### The accuracy of the model is better without scaling

## *Support Vector Regression Accuracy = 0.43*

In [4]:
data = [['Random Forest Regression', 0.90],
        ['Decision Tree Regressor', 0.80],
        ['Linear Regression',0.71],
        ['Support Vector Regression', 0.43],]

columns = ['Model','Accuracy']
df = pd.DataFrame(data, columns=columns)
df

Unnamed: 0,Model,Accuracy
0,Random Forest Regression,0.9
1,Decision Tree Regressor,0.8
2,Linear Regression,0.71
3,Support Vector Regression,0.43


In [None]:
columns = ['Model','Accuracy']
df = pd.DataFrame(data, columns=columns)

plt.figure(figsize=(10, 6))
plt.bar(df['Model'], df['Accuracy'], color='lightcoral')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# The best accuracy is that of random forest regressor with an r2 score of 0.90

# Testing on the test dataset with the random forest regressor 

In [60]:
test = pd.read_csv('test.csv')
test=test.drop(['Name','Value €'],axis=1)

In [61]:
player=test.iloc[1,:].values
[player]

[array([29., nan, 72., 56., 84., 76., 83., 86., 85., 38., 82., 42., 71.,
        40., 74., 61., 83., 76., 73., 68., 78., 77., 69., 83., 53., 38.,
        41., 45., 49., 56., 44., 12., 16., 12., 13., 17.])]

In [62]:
best_randomForest.predict([player])[0]

29608942.400192406

### Actual Value =  103,500,000.0 <br> Predicted Value = 97166465.77380952

In [63]:
df = pd.read_csv('playerdatasernew.csv')

In [64]:
pd.set_option('display.float_format', '{:,}'.format)

In [65]:
df_pred=pd.DataFrame(columns=['Name','Value', 'Prediction'])

In [66]:
df_pred['Name']= [i for i in df[:100].Name]

In [67]:
df_pred['Value']= [round(i,0) for i in df.loc[:99]['Value €']]

In [68]:
player_stats=test.iloc[:,:].values
df_pred['Prediction']=[round(best_randomForest.predict([player])[0],0) for player in player_stats[:100]]

In [69]:
df_pred['Difference']= round(abs(df_pred['Value']-df_pred['Prediction']),0)

In [70]:
df_pred[0:20]

Unnamed: 0,Name,Value,Prediction,Difference
0,Manuel Neuer,20500000.0,32343112.0,11843112.0
1,Lionel Messi,103500000.0,29608942.0,73891058.0
2,Jan Oblak,120000000.0,16822088.0,103177912.0
3,Kalidou Koulibaly,76500000.0,13639520.0,62860480.0
4,N'Golo Kanté,78000000.0,7079430.0,70920570.0
5,Alisson,88000000.0,22997598.0,65002402.0
6,Toni Kroos,87500000.0,23764555.0,63735445.0
7,Erling Haaland,122500000.0,10093919.0,112406081.0
8,Keylor Navas,33500000.0,3859203.0,29640797.0
9,Bruno Fernandes,121000000.0,18242796.0,102757204.0


# Pickle Model

In [71]:
import pickle

In [72]:
pickle.dump(best_randomForest, open("model.pkl",'wb'))

In [73]:
model=pickle.load(open("model.pkl",'rb'))