In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from skopt.space import Real,Integer, Categorical
from skopt.utils import use_named_args
from skopt import gp_minimize
pd.set_option('max_columns',1000)

In [2]:
rf_train_gk = pd.read_csv('../training_data/train_data_gk.csv')
rf_train_def = pd.read_csv('../training_data/train_data_def.csv')
rf_train_mid = pd.read_csv('../training_data/train_data_mid.csv')
rf_train_fwd = pd.read_csv('../training_data/train_data_fwd.csv')
rf_train_gk.drop(['Players','club','Position','league'], axis = 1, inplace = True)
rf_train_def.drop(['Players','club','Position','league'], axis = 1, inplace = True)
rf_train_mid.drop(['Players','club','Position','league'], axis = 1, inplace = True)
rf_train_fwd.drop(['Players','club','Position','league'], axis = 1, inplace = True)
rf_train_gk.describe()

Unnamed: 0,short_passing,dribbling,long_passing,sprint_speed,interceptions,gk_diving,Goals-16/17,A-16/17,CS-16/17,Yellow-16/17,Red-16/17,Off-16/17,Pen_SV-16/17,Pen_M-16/17,Goals_conceded-16/17,OG-16/17,Apps-16/17,Form-16/17,Goals-15/16,A-15/16,CS-15/16,Yellow-15/16,Red-15/16,Off-15/16,Pen_SV-15/16,Pen_M-15/16,Goals_conceded-15/16,OG-15/16,Apps-15/16
count,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0,122.0
mean,32.098361,14.483607,30.983607,46.680328,20.754098,78.213115,0.0,0.04918,5.934426,1.02459,0.057377,0.147541,0.532787,0.0,28.737705,0.065574,21.147541,2.322994,0.0,0.04918,6.819672,1.106557,0.098361,0.188525,0.45082,0.0,27.614754,0.081967,22.237705
std,8.525518,3.757601,8.46674,8.595854,4.26268,5.266814,0.0,0.252343,4.752968,1.326545,0.233521,0.356107,0.804783,0.0,19.123696,0.248556,12.871695,1.289888,0.0,0.252343,5.487802,1.389496,0.325496,0.432788,0.761713,0.0,18.330504,0.303973,13.575655
min,12.0,6.0,12.0,17.0,10.0,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,26.25,12.0,25.0,42.0,18.0,74.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,13.25,0.0,9.25,1.750458,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,7.25
50%,33.0,14.0,32.0,47.0,22.0,78.0,0.0,0.0,5.0,1.0,0.0,0.0,0.0,0.0,28.0,0.0,23.5,2.357143,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,29.0,0.0,28.5
75%,36.0,16.0,35.0,52.0,23.0,82.0,0.0,0.0,10.0,2.0,0.0,0.0,1.0,0.0,45.75,0.0,34.0,3.019737,0.0,0.0,11.0,2.0,0.0,0.0,1.0,0.0,42.0,0.0,34.0
max,56.0,30.0,59.0,65.0,30.0,91.0,0.0,2.0,18.0,5.0,1.0,1.0,4.0,0.0,82.0,1.0,38.0,8.5,0.0,2.0,24.0,6.0,2.0,2.0,4.0,0.0,67.0,2.0,38.0


In [3]:
# Reading target data
y_gk = pd.read_csv('../targets/gk_targets.csv')
y_def = pd.read_csv('../targets/def_targets.csv')
y_mid = pd.read_csv('../targets/mid_targets.csv')
y_fwd = pd.read_csv('../targets/fwd_targets.csv')

In [4]:
# Splitting Data into train and test
x_train_gk, x_test_gk, y_train_gk, y_test_gk = train_test_split(rf_train_gk, y_gk, test_size = 0.2, random_state = 42)
x_train_def, x_test_def, y_train_def, y_test_def = train_test_split(rf_train_def, y_def, test_size = 0.2, random_state = 42)
x_train_mid, x_test_mid, y_train_mid, y_test_mid = train_test_split(rf_train_mid, y_mid, test_size = 0.2, random_state = 42)
x_train_fwd, x_test_fwd, y_train_fwd, y_test_fwd = train_test_split(rf_train_fwd, y_fwd, test_size = 0.2, random_state = 42)

In [5]:
# Standard scaling the train data
stand_scale_gk = preprocessing.StandardScaler()
stand_scale_def = preprocessing.StandardScaler()
stand_scale_mid = preprocessing.StandardScaler()
stand_scale_fwd = preprocessing.StandardScaler()
x_train_gk_scaled = stand_scale_gk.fit_transform(x_train_gk)
x_train_def_scaled = stand_scale_def.fit_transform(x_train_def)
x_train_mid_scaled = stand_scale_mid.fit_transform(x_train_mid)
x_train_fwd_scaled = stand_scale_fwd.fit_transform(x_train_fwd)
x_train_gk_scaled = pd.DataFrame(x_train_gk_scaled)
x_train_def_scaled = pd.DataFrame(x_train_def_scaled)
x_train_mid_scaled = pd.DataFrame(x_train_mid_scaled)
x_train_fwd_scaled = pd.DataFrame(x_train_fwd_scaled)
x_train_gk_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,-2.300936,-0.904073,-0.257651,-1.690361,-0.809619,-0.852758,0.0,-0.195515,-0.052095,2.911877,-0.278887,2.250000,-0.702486,0.0,0.675462,-0.299813,0.264769,-0.445654,0.0,-0.220564,-0.167992,2.062023,2.547100,1.913459,0.665308,0.0,0.490840,-0.286618,0.453335
1,0.591635,-0.076761,0.334580,1.167417,0.122524,0.317773,0.0,-0.195515,0.579556,-0.794843,-0.278887,-0.444444,2.023158,0.0,0.061039,-0.299813,0.509364,0.641486,0.0,-0.220564,0.779407,0.607360,-0.325792,-0.406610,0.665308,0.0,0.709995,-0.286618,0.833072
2,0.591635,0.474781,-0.849881,0.691121,-0.576583,1.098127,0.0,-0.195515,0.579556,0.687845,-0.278887,-0.444444,-0.702486,0.0,0.005183,-0.299813,0.427832,0.325542,0.0,-0.220564,0.968887,-0.119972,-0.325792,-0.406610,-0.625389,0.0,0.271685,-0.286618,0.757125
3,-1.491016,-0.628303,-0.612989,-2.166657,0.355560,0.122685,0.0,-0.195515,1.211207,1.429189,-0.278887,-0.444444,0.660336,0.0,-0.162387,-0.299813,0.427832,0.795784,0.0,-0.220564,-0.925912,-0.119972,-0.325792,-0.406610,0.665308,0.0,-0.878880,-0.286618,-0.913717
4,0.475932,1.026322,0.334580,1.286491,0.355560,1.683393,0.0,-0.195515,0.579556,2.170533,-0.278887,-0.444444,0.660336,0.0,1.010601,3.335416,1.243151,-0.028727,0.0,-0.220564,1.537326,2.789355,-0.325792,-0.406610,0.665308,0.0,0.271685,-0.286618,0.984967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92,0.475932,-0.628303,0.334580,-0.737768,1.054667,0.317773,0.0,-0.195515,0.790107,-0.053499,-0.278887,-0.444444,-0.702486,0.0,1.345741,-0.299813,1.080087,-0.101034,0.0,-0.220564,0.400447,-0.119972,-0.325792,1.913459,-0.625389,0.0,1.641404,-0.286618,0.909020
93,-0.681096,0.474781,0.097687,1.405565,0.122524,0.512862,0.0,-0.195515,-1.315397,-0.794843,-0.278887,-0.444444,-0.702486,0.0,-1.335375,-0.299813,-1.610465,-1.540728,0.0,-0.220564,-0.546952,-0.847304,-0.325792,-0.406610,-0.625389,0.0,-0.111837,-0.286618,-0.458033
94,0.823041,-0.352532,0.571472,-1.094991,-0.809619,-0.072404,0.0,3.597485,-0.262645,-0.053499,-0.278887,-0.444444,0.660336,0.0,0.507892,-0.299813,-0.224423,-0.225135,0.0,3.345227,-0.357472,1.334691,2.547100,1.913459,-0.625389,0.0,0.381262,-0.286618,0.073598
95,-0.333988,-0.628303,-0.612989,1.643713,-0.343548,1.098127,0.0,-0.195515,1.842859,-0.053499,-0.278887,2.250000,0.660336,0.0,-0.385814,-0.299813,0.672428,1.157282,0.0,-0.220564,3.242645,-0.847304,-0.325792,-0.406610,-0.625389,0.0,-0.604936,-0.286618,1.136862


In [6]:
# Standard scaling the test data
x_test_gk_scaled = stand_scale_gk.transform(x_test_gk)
x_test_def_scaled = stand_scale_def.transform(x_test_def)
x_test_mid_scaled = stand_scale_mid.transform(x_test_mid)
x_test_fwd_scaled = stand_scale_fwd.transform(x_test_fwd)

In [7]:
y_train_gk.isnull().any()

overall    False
dtype: bool

In [8]:
# # Tuning hyperparameters using cross validation
# parameters = {'n_estimators' : list(range(50,1000,50)),
#               'bootstrap' : [True,False],
#               'max_depth' : list(range(10,110,10)),
#               'min_samples_leaf': [1, 2, 4],
#               'min_samples_split': [2, 5, 10]}
# gs = GridSearchCV(estimator = RandomForestRegressor(),
#                   param_grid = parameters,
#                   cv = 5,
#                   scoring = 'r2')
# gk_gs_result = gs.fit(x_train_gk_scaled,y_train_gk.values.ravel())
# gs = GridSearchCV(estimator = RandomForestRegressor(),
#                   param_grid = parameters,
#                   cv = 5,
#                   scoring = 'r2')
# def_gs_result = gs.fit(x_train_def_scaled,y_train_def.values.ravel())
# gs = GridSearchCV(estimator = RandomForestRegressor(),
#                   param_grid = parameters,
#                   cv = 5,
#                   scoring = 'r2')
# mid_gs_result = gs.fit(x_train_mid_scaled,y_train_mid.values.ravel())
# gs = GridSearchCV(estimator = RandomForestRegressor(),
#                   param_grid = parameters,
#                   cv = 5,
#                   scoring = 'r2')
# fwd_gs_result = gs.fit(x_train_fwd_scaled,y_train_fwd.values.ravel())

In [9]:
# Hyperparameter tuning for gk model
rf_gk = RandomForestRegressor(random_state = 43)
space = [Integer(50,500, name = 'n_estimators'),
         Categorical(['True','False'], name = 'bootstrap'),
         Integer(2,20, name = 'max_depth'),
         Integer(1,200, name = "min_samples_leaf"),
         Integer(2,10, name = "min_samples_split")]

@use_named_args(space)
def objective(**params):
    rf_gk.set_params(**params)

    score = np.mean(cross_val_score(rf_gk, x_train_gk_scaled.to_numpy().astype('float64'), y_train_gk.values.ravel(), cv=5, n_jobs=-1,
                                    scoring="r2"))
    print(score)
    return score

In [10]:
rf_gk_opt = gp_minimize(objective, space, n_calls=50, random_state=43)
print(f"Best Params: n_estimators: {rf_gk_opt.x[0]}, bootstrap: {rf_gk_opt.x[1]}, max_depth: {rf_gk_opt.x[2]}, min_samples_leaf: {rf_gk_opt.x[3]}, min_samples_split: {rf_gk_opt.x[4]}")

-0.1918911651963334
-0.18958695981360357
-0.19149058459371676
-0.190691607113044
-0.19098178706047278
-0.19184743821947495
-0.19168936557645022
0.17731214949538493
-0.19156861488730229
-0.18966794610981624
-0.1910342025327716
-0.18974368009227743
0.6657564993099712
-0.189267194635732
-0.1915563335854364
-0.19003073138235105
-0.18974368009227743
-0.18974368009227743
-0.18974368009227743
-0.189267194635732
-0.189267194635732
-0.189267194635732
-0.1914312909157761
-0.18974368009227743
-0.18974368009227743
-0.18974368009227743
-0.189267194635732
-0.18974368009227743
-0.18974368009227743
-0.18974368009227743
-0.189267194635732
-0.18980596898314833
-0.189267194635732
-0.18974368009227743
-0.189267194635732
-0.189267194635732
-0.189267194635732
-0.189267194635732
-0.189267194635732
-0.18974368009227743
-0.189267194635732
-0.189267194635732
-0.18974368009227743
-0.189267194635732
-0.189267194635732
-0.18974368009227743
-0.189267194635732
-0.189267194635732
-0.18974368009227743
-0.1892671946357

In [None]:
rf_def = RandomForestRegressor(random_state = 43)
space = [Integer(50,500, name = 'n_estimators'),
         Categorical(['True','False'], name = 'bootstrap'),
         Integer(2,20, name = 'max_depth'),
         Integer(1,200, name = "min_samples_leaf"),
         Integer(2,10, name = "min_samples_split")]

@use_named_args(space)
def objective(**params):
    rf_def.set_params(**params)

    score = np.mean(cross_val_score(rf_gk, x_train_def_scaled.to_numpy().astype('float64'), y_train_def.values.ravel(), cv=5, n_jobs=-1,
                                    scoring="r2"))
    print(score)
    return score

In [None]:
rf_def_opt = gp_minimize(objective, space, n_calls=50, random_state=43)
print(f"Best Params: n_estimators: {rf_def_opt.x[0]}, bootstrap: {rf_def_opt.x[1]}, max_depth: {rf_def_opt.x[2]}, min_samples_leaf: {rf_def_opt.x[3]}, min_samples_split: {rf_def_opt.x[4]}")

In [None]:
rf_mid = RandomForestRegressor(random_state = 43)
space = [Integer(50,500, name = 'n_estimators'),
         Categorical(['True','False'], name = 'bootstrap'),
         Integer(2,20, name = 'max_depth'),
         Integer(1,200, name = "min_samples_leaf"),
         Integer(2,10, name = "min_samples_split")]

@use_named_args(space)
def objective(**params):
    rf_mid.set_params(**params)

    score = np.mean(cross_val_score(rf_gk, x_train_mid_scaled.to_numpy().astype('float64'), y_train_mid.values.ravel(), cv=5, n_jobs=-1,
                                    scoring="r2"))
    print(score)
    return score

In [None]:
rf_mid_opt = gp_minimize(objective, space, n_calls=50, random_state=43)
print(f"Best Params: n_estimators: {rf_mid_opt.x[0]}, bootstrap: {rf_mid_opt.x[1]}, max_depth: {rf_mid_opt.x[2]}, min_samples_leaf: {rf_mid_opt.x[3]}, min_samples_split: {rf_mid_opt.x[4]}")

In [None]:
rf_fwd = RandomForestRegressor(random_state = 43)
space = [Integer(50,500, name = 'n_estimators'),
         Categorical(['True','False'], name = 'bootstrap'),
         Integer(2,20, name = 'max_depth'),
         Integer(1,200, name = "min_samples_leaf"),
         Integer(2,10, name = "min_samples_split")]

@use_named_args(space)
def objective(**params):
    rf_fwd.set_params(**params)

    score = np.mean(cross_val_score(rf_gk, x_train_fwd_scaled.to_numpy().astype('float64'), y_train_fwd.values.ravel(), cv=5, n_jobs=-1,
                                    scoring="r2"))
    print(score)
    return score

In [None]:
rf_fwd_opt = gp_minimize(objective, space, n_calls=50, random_state=43)
print(f"Best Params: n_estimators: {rf_fwd_opt.x[0]}, bootstrap: {rf_fwd_opt.x[1]}, max_depth: {rf_fwd_opt.x[2]}, min_samples_leaf: {rf_fwd_opt.x[3]}, min_samples_split: {rf_fwd_opt.x[4]}")

In [None]:
# print(gk_gs_result.best_params_)
# print(def_gs_result.best_params_)
# print(mid_gs_result.best_params_)
# print(fwd_gs_result.best_params_)

In [None]:
new_rf_gk = RandomForestRegressor(n_estimators = rf_gk_opt.x[0], 
                                  bootstrap = rf_gk_opt.x[1],
                                  max_depth = rf_gk_opt.x[2],
                                  min_samples_leaf = rf_gk_opt.x[3],
                                  min_samples_split = rf_gk_opt.x[4])
new_rf_def = RandomForestRegressor(n_estimators = rf_def_opt.x[0], 
                                  bootstrap = rf_def_opt.x[1],
                                  max_depth = rf_def_opt.x[2],
                                  min_samples_leaf = rf_def_opt.x[3],
                                  min_samples_split = rf_def_opt.x[4])
new_rf_mid = RandomForestRegressor(n_estimators = rf_mid_opt.x[0], 
                                  bootstrap = rf_mid_opt.x[1],
                                  max_depth = rf_mid_opt.x[2],
                                  min_samples_leaf = rf_mid_opt.x[3],
                                  min_samples_split = rf_mid_opt.x[4])
new_rf_fwd = RandomForestRegressor(n_estimators = rf_fwd_opt.x[0], 
                                  bootstrap = rf_fwd_opt.x[1],
                                  max_depth = rf_fwd_opt.x[2],
                                  min_samples_leaf = rf_fwd_opt.x[3],
                                  min_samples_split = rf_fwd_opt.x[4])

new_rf_gk = new_rf_gk.fit(x_train_gk_scaled,y_train_gk.values.ravel())
new_rf_def = new_rf_def.fit(x_train_def_scaled,y_train_def.values.ravel())
new_rf_mid = new_rf_mid.fit(x_train_mid_scaled,y_train_mid.values.ravel())
new_rf_fwd = new_rf_fwd.fit(x_train_fwd_scaled,y_train_fwd.values.ravel())

In [None]:
# Calculating variance score where 1 signifies a perfect prediction
print(f"GK score: {new_rf_gk.score(x_test_gk_scaled, y_test_gk)}")
print(f"DEF score: {new_rf_def.score(x_test_def_scaled, y_test_def)}")
print(f"MID score: {new_rf_mid.score(x_test_mid_scaled, y_test_mid)}")
print(f"FWD score: {new_rf_fwd.score(x_test_fwd_scaled, y_test_fwd)}")

In [None]:
# Calculating variance score where 1 signifies a perfect prediction
print(f"GK score: {new_rf_gk.score(x_train_gk_scaled, y_train_gk)}")
print(f"DEF score: {new_rf_def.score(x_train_def_scaled, y_train_def)}")
print(f"MID score: {new_rf_mid.score(x_train_mid_scaled, y_train_mid)}")
print(f"FWD score: {new_rf_fwd.score(x_train_fwd_scaled, y_train_fwd)}")