In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from skopt.space import Real,Integer, Categorical
from skopt import BayesSearchCV
pd.set_option('max_columns',1000)

In [2]:
ada_train_ovr = pd.read_csv('../training_data/train_data.csv')
ada_train_gk = pd.read_csv('../training_data/train_data_gk.csv')
ada_train_def = pd.read_csv('../training_data/train_data_def.csv')
ada_train_mid = pd.read_csv('../training_data/train_data_mid.csv')
ada_train_fwd = pd.read_csv('../training_data/train_data_fwd.csv')
ada_train_ovr.drop(['Players','club','Position'], axis = 1, inplace = True)
ada_train_gk.drop(['Players','club','Position'], axis = 1, inplace = True)
ada_train_def.drop(['Players','club','Position'], axis = 1, inplace = True)
ada_train_mid.drop(['Players','club','Position'], axis = 1, inplace = True)
ada_train_fwd.drop(['Players','club','Position'], axis = 1, inplace = True)
ada_train_gk.head()

Unnamed: 0,age,short_passing,dribbling,long_passing,ball_control,interceptions,positioning,vision,marking,gk_handling,gk_reflexes,Goals-16/17,A-16/17,CS-16/17,Yellow-16/17,Red-16/17,Off-16/17,Pen_SV-16/17,Pen_M-16/17,Goals_conceded-16/17,OG-16/17,Apps-16/17,Form-16/17,Goals-15/16,A-15/16,CS-15/16,Yellow-15/16,Red-15/16,Off-15/16,Pen_SV-15/16,Pen_M-15/16,Goals_conceded-15/16,OG-15/16,Apps-15/16,Form-15/16
0,32,28,19,25,27,22,16,63,20,69,73,0,0,1,1,0,0,0,0,3,0,2,2.5,0,0,0,0,0,1,0,0,16,0,6,0.333333
1,18,52,14,38,10,17,4,45,5,77,79,0,0,10,4,0,0,1,0,41,0,36,2.722222,0,0,7,0,0,1,0,0,27,0,24,2.791667
2,39,27,18,26,23,19,10,49,12,76,72,0,0,3,2,0,0,0,0,66,0,29,0.965517,0,0,11,2,0,0,2,0,43,0,35,2.942857
3,25,26,12,22,23,11,11,38,13,72,72,0,0,0,0,0,1,0,0,5,0,2,-0.5,0,0,0,0,0,0,0,0,5,0,1,-2.0
4,28,20,12,13,11,20,11,25,11,68,72,0,0,0,1,0,0,0,0,5,0,3,1.0,0,0,3,0,0,0,0,0,5,0,6,3.666667


In [3]:
# Reading target data
y_ovr = pd.read_csv('../targets/targets.csv')
y_gk = pd.read_csv('../targets/gk_targets.csv')
y_def = pd.read_csv('../targets/def_targets.csv')
y_mid = pd.read_csv('../targets/mid_targets.csv')
y_fwd = pd.read_csv('../targets/fwd_targets.csv')

In [4]:
# Splitting Data into train and test
x_train_ovr, x_test_ovr, y_train_ovr, y_test_ovr = train_test_split(ada_train_ovr, y_ovr, test_size = 0.2, random_state = 42)
x_train_gk, x_test_gk, y_train_gk, y_test_gk = train_test_split(ada_train_gk, y_gk, test_size = 0.2, random_state = 42)
x_train_def, x_test_def, y_train_def, y_test_def = train_test_split(ada_train_def, y_def, test_size = 0.2, random_state = 42)
x_train_mid, x_test_mid, y_train_mid, y_test_mid = train_test_split(ada_train_mid, y_mid, test_size = 0.2, random_state = 42)
x_train_fwd, x_test_fwd, y_train_fwd, y_test_fwd = train_test_split(ada_train_fwd, y_fwd, test_size = 0.2, random_state = 42)

In [5]:
# Standard scaling the train data
stand_scale_ovr = preprocessing.StandardScaler()
stand_scale_gk = preprocessing.StandardScaler()
stand_scale_def = preprocessing.StandardScaler()
stand_scale_mid = preprocessing.StandardScaler()
stand_scale_fwd = preprocessing.StandardScaler()
x_train_ovr_scaled = stand_scale_ovr.fit_transform(x_train_ovr)
x_train_gk_scaled = stand_scale_gk.fit_transform(x_train_gk)
x_train_def_scaled = stand_scale_def.fit_transform(x_train_def)
x_train_mid_scaled = stand_scale_mid.fit_transform(x_train_mid)
x_train_fwd_scaled = stand_scale_fwd.fit_transform(x_train_fwd)
x_train_ovr_scaled = pd.DataFrame(x_train_ovr_scaled)
x_train_gk_scaled = pd.DataFrame(x_train_gk_scaled)
x_train_def_scaled = pd.DataFrame(x_train_def_scaled)
x_train_mid_scaled = pd.DataFrame(x_train_mid_scaled)
x_train_fwd_scaled = pd.DataFrame(x_train_fwd_scaled)

In [6]:
# Standard scaling the test data
x_test_ovr_scaled = stand_scale_ovr.transform(x_test_ovr)
x_test_gk_scaled = stand_scale_gk.transform(x_test_gk)
x_test_def_scaled = stand_scale_def.transform(x_test_def)
x_test_mid_scaled = stand_scale_mid.transform(x_test_mid)
x_test_fwd_scaled = stand_scale_fwd.transform(x_test_fwd)

In [14]:
# Finding the best hyperparameters using cross validation
parameters = {'n_estimators' : list(range(50,500,50)),
              'learning_rate' : list(np.arange(0.1,2,0.1))}
gs = GridSearchCV(estimator = AdaBoostRegressor(),
                  param_grid = parameters,
                  cv = 5,
                  scoring = 'r2')
ovr_gs_result = gs.fit(x_train_ovr_scaled,y_train_ovr.values.ravel())
gs = GridSearchCV(estimator = AdaBoostRegressor(),
                  param_grid = parameters,
                  cv = 5,
                  scoring = 'r2')
gk_gs_result = gs.fit(x_train_gk_scaled,y_train_gk.values.ravel())
gs = GridSearchCV(estimator = AdaBoostRegressor(),
                  param_grid = parameters,
                  cv = 5,
                  scoring = 'r2')
def_gs_result = gs.fit(x_train_def_scaled,y_train_def.values.ravel())
gs = GridSearchCV(estimator = AdaBoostRegressor(),
                  param_grid = parameters,
                  cv = 5,
                  scoring = 'r2')
mid_gs_result = gs.fit(x_train_mid_scaled,y_train_mid.values.ravel())
gs = GridSearchCV(estimator = AdaBoostRegressor(),
                  param_grid = parameters,
                  cv = 5,
                  scoring = 'r2')
fwd_gs_result = gs.fit(x_train_fwd_scaled,y_train_fwd.values.ravel())

In [152]:
print(ovr_gs_result.best_params_)
print(gk_gs_result.best_params_)
print(def_gs_result.best_params_)
print(mid_gs_result.best_params_)
print(fwd_gs_result.best_params_)

{'learning_rate': 1.9000000000000001, 'n_estimators': 400}
{'learning_rate': 1.9000000000000001, 'n_estimators': 200}
{'learning_rate': 1.8000000000000003, 'n_estimators': 350}
{'learning_rate': 1.8000000000000003, 'n_estimators': 450}
{'learning_rate': 1.3000000000000003, 'n_estimators': 150}


In [149]:
ada_ovr = AdaBoostRegressor(n_estimators = ovr_gs_result.best_params_['n_estimators'], 
                           learning_rate = ovr_gs_result.best_params_['learning_rate'])
ada_gk = AdaBoostRegressor(n_estimators = gk_gs_result.best_params_['n_estimators'], 
                           learning_rate = gk_gs_result.best_params_['learning_rate'])
ada_def = AdaBoostRegressor(n_estimators = def_gs_result.best_params_['n_estimators'], 
                           learning_rate = def_gs_result.best_params_['learning_rate'])
ada_mid = AdaBoostRegressor(n_estimators = mid_gs_result.best_params_['n_estimators'], 
                           learning_rate = mid_gs_result.best_params_['learning_rate'])
ada_fwd = AdaBoostRegressor(n_estimators = fwd_gs_result.best_params_['n_estimators'], 
                           learning_rate = fwd_gs_result.best_params_['learning_rate'])
ada_ovr = ada_ovr.fit(x_train_ovr_scaled,y_train_ovr.values.ravel())
ada_gk = ada_gk.fit(x_train_gk_scaled,y_train_gk.values.ravel())
ada_def = ada_def.fit(x_train_def_scaled,y_train_def.values.ravel())
ada_mid = ada_mid.fit(x_train_mid_scaled,y_train_mid.values.ravel())
ada_fwd = ada_fwd.fit(x_train_fwd_scaled,y_train_fwd.values.ravel())

In [150]:
# Calculating variance score where 1 signifies a perfect prediction
print(f"Overall score: {ada_ovr.score(x_test_ovr_scaled, y_test_ovr)}")
print(f"GK score: {ada_gk.score(x_test_gk_scaled, y_test_gk)}")
print(f"DEF score: {ada_def.score(x_test_def_scaled, y_test_def)}")
print(f"MID score: {ada_mid.score(x_test_mid_scaled, y_test_mid)}")
print(f"FWD score: {ada_fwd.score(x_test_fwd_scaled, y_test_fwd)}")

Overall score: 0.7936859478691971
GK score: 0.9236187029850723
DEF score: 0.8066482875156337
MID score: 0.8439845210633834
FWD score: 0.8789497978317387
