In [1]:
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import joblib

In [2]:
# Read the player spreadsheet into a DataFrame; the path is relative to the
# notebook's working directory.
filepath = 'Updated2.xlsx'
df = pd.read_excel(filepath)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Player_index,Player,Current_Age,Starts,Min,Gls,Ast,CrdY,CrdR,SoT,...,POS,grade_value,POS1,POS2,COUNTRY,Current_Club,League,League_num,Club_num,Market_val
0,1,Aaron Connolly,21,8.33,747.33,2.0,0.67,0.33,0.0,8.67,...,3,0.37,F,CF,Ireland,Brighton,Premier League,1,20,4500000
1,2,Aaron Cresswell,31,30.33,2691.33,1.0,3.33,3.67,0.0,3.33,...,1,0.49,D,LB,England,West Ham,Premier League,1,96,8000000
2,3,Aaron Hickey,19,10.0,782.0,0.0,0.0,3.0,1.0,1.0,...,1,0.38,D,LB,Scotland,Bologna,Series A,5,15,5000000
3,4,Aaron Ramsey,30,19.0,1797.5,5.0,4.0,1.5,0.0,15.5,...,2,3.09,M,CM,Wales,Juventus,Series A,5,43,25000000
4,5,Aaron Wan-Bissaka,23,45.67,4083.0,0.67,4.33,6.0,0.33,2.33,...,1,1.51,D,RB,England,Manchester United,Premier League,1,58,40000000


In [3]:
# Predictor columns, listed in the order they are handed to the model.
features = [
    'Current_Age', 'Starts', 'Min', 'Gls', 'Ast', 'CrdY', 'CrdR', 'SoT',
    'G_Sh', 'Pass_Att', 'Cmp_per', 'TklW', 'Blocks', 'Int', 'Clr',
    'Dribble_Att', 'Dribble_Succ_per', 'Carries', 'Targ', 'Rec_per',
    'POS', 'grade_value', 'League_num', 'Club_num',
]
# Column holding the value to be predicted.
target = 'Market_val'

# Feature matrix (DataFrame) and target (Series) taken from the loaded frame.
dataX = df.loc[:, features]
dataY = df.loc[:, target]


In [4]:
# Flatten the target into a 1-D NumPy array for the estimator calls below.
dataY = np.ravel(dataY)

In [5]:
# Hyper-parameters for the forest, gathered in one place; random_state pins
# the seed so repeated fits produce the same model.
rf_params = {
    "n_estimators": 200,
    "criterion": "squared_error",
    "max_depth": None,
    "max_features": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": 2,
}
rf2 = RandomForestRegressor(**rf_params)

In [6]:
# Train the forest on the full feature matrix and target (no hold-out split).
rf2.fit(X=dataX, y=dataY)

In [8]:
# Persist the fitted forest to disk with joblib.
# NOTE(review): this is an absolute, user-specific path; the notebook will not
# run on another machine until it is pointed at a project-relative location.
model_path = "/Users/ervinballa/Desktop/ML_Grad/footy/RF_Model_2.pkl"
joblib.dump(rf2, model_path)

['/Users/ervinballa/Desktop/ML_Grad/footy/RF_Model_2.pkl']

In [9]:
# R² for each of 5 cross-validation folds, then their average.
fold_r2 = cross_val_score(rf2, dataX, dataY, cv=5, scoring='r2')
mod_score = fold_r2.mean()

# R² of the already-fitted model on the same data it was trained on
# (an in-sample figure, so expect it to be higher than the CV mean).
r2 = rf2.score(dataX, dataY)

In [10]:
# Mean R² across the 5 cross-validation folds.
mod_score

0.7053286436560876

In [11]:
# R² measured on the data the model was fitted on.
r2

0.9593735543139753

In [7]:
# Predict the target for every row of dataX, i.e. the same rows the model
# was fitted on (in-sample predictions).
pred = rf2.predict(dataX)
# Disabled: export of the predictions to a spreadsheet.
#pred = pd.DataFrame(pred)
#pred.to_excel("/Users/ervinballa/Desktop/ML_Grad/footy/Prediction2.xlsx",index=False)

In [8]:
# Mean squared error of the in-sample predictions, and its square root so the
# error is expressed in the same units as the target.
mse = mean_squared_error(y_true=dataY, y_pred=pred)
rmse = mse ** 0.5
print(mse, rmse, sep="\n")

11376426538012.357
3372895.8682432454


In [13]:
# Impurity-based importances from the fitted forest, one per entry of `features`.
importances = list(rf2.feature_importances_)

# Rank on the raw values. Sorting after rounding would leave every feature that
# rounds to the same two decimals in plain column order instead of true rank.
ranked = sorted(zip(features, importances), key=lambda pair: pair[1], reverse=True)

# (feature, importance rounded to 2 decimals) tuples, most important first.
feature_importances = [(feature, round(importance, 2)) for feature, importance in ranked]

# Plain loop for the printing: a list comprehension used only for its side
# effect builds a throwaway list of None.
for pair in feature_importances:
    print('Variable: {:20} Importance: {}'.format(*pair))

Variable: Targ                 Importance: 0.32
Variable: Current_Age          Importance: 0.15
Variable: grade_value          Importance: 0.1
Variable: SoT                  Importance: 0.08
Variable: Gls                  Importance: 0.06
Variable: Dribble_Att          Importance: 0.05
Variable: Ast                  Importance: 0.04
Variable: G_Sh                 Importance: 0.02
Variable: TklW                 Importance: 0.02
Variable: Blocks               Importance: 0.02
Variable: League_num           Importance: 0.02
Variable: Starts               Importance: 0.01
Variable: Min                  Importance: 0.01
Variable: CrdY                 Importance: 0.01
Variable: Pass_Att             Importance: 0.01
Variable: Cmp_per              Importance: 0.01
Variable: Int                  Importance: 0.01
Variable: Clr                  Importance: 0.01
Variable: Dribble_Succ_per     Importance: 0.01
Variable: Carries              Importance: 0.01
Variable: Rec_per              Importance

In [14]:
# Permutation importance of each feature, measured on the same data the model
# was fitted on; each column is shuffled 10 times with a fixed seed.
result = permutation_importance(
    estimator=rf2,
    X=dataX,
    y=dataY,
    n_repeats=10,
    random_state=42,
)

# Average score drop per feature across the repeats.
feature_importances_2 = result.importances_mean

In [15]:
# Mean permutation importance, one value per column of dataX.
feature_importances_2

array([0.37039758, 0.01028539, 0.01554761, 0.05160855, 0.0318702 ,
       0.00739221, 0.00283823, 0.10230762, 0.01049134, 0.00618648,
       0.00880896, 0.01432003, 0.01486952, 0.00840275, 0.01063085,
       0.03766876, 0.007942  , 0.01503495, 0.46312063, 0.00595036,
       0.00064687, 0.22155634, 0.03100232, 0.01129248])

In [17]:
# One hand-written player row; values are positional and must follow the exact
# order of `features`.
prediction_example = [[25, 57, 4117, 28, 11, 8, 0, 63, 0.21, 947, 74.55, 10, 40, 12, 14, 89, 44.95, 687, 1452, 58.5, 3, 4.75, 5, 42]]

# The model was fitted on a DataFrame with column names, so predict on one too.
# A bare nested list triggers scikit-learn's "X does not have valid feature
# names" warning, and building the frame also fails fast if the number of
# values does not match the number of features.
example_result = rf2.predict(pd.DataFrame(prediction_example, columns=features))



In [18]:
# Model output for the example row.
example_result

array([81170000.])

In [9]:
# Literal copy of the permutation-importance values displayed earlier in the
# notebook, kept as a plain list.
feature_importances_2 = [
    0.37039758, 0.01028539, 0.01554761, 0.05160855, 0.0318702,
    0.00739221, 0.00283823, 0.10230762, 0.01049134, 0.00618648,
    0.00880896, 0.01432003, 0.01486952, 0.00840275, 0.01063085,
    0.03766876, 0.007942, 0.01503495, 0.46312063, 0.00595036,
    0.00064687, 0.22155634, 0.03100232, 0.01129248,
]
# Emit one value per line.
print(*feature_importances_2, sep="\n")

0.37039758
0.01028539
0.01554761
0.05160855
0.0318702
0.00739221
0.00283823
0.10230762
0.01049134
0.00618648
0.00880896
0.01432003
0.01486952
0.00840275
0.01063085
0.03766876
0.007942
0.01503495
0.46312063
0.00595036
0.00064687
0.22155634
0.03100232
0.01129248
