In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
# Show up to 1000 columns when a DataFrame is displayed.
# The fully qualified key is required: option names are resolved by regex, and
# the bare 'max_columns' is ambiguous on pandas versions that also register
# 'styler.render.max_columns' (OptionError: Pattern matched multiple keys).
pd.set_option('display.max_columns', 1000)

In [2]:
# Identifier columns that are removed before modelling.
_ID_COLUMNS = ['Players', 'club', 'Position']


def _load_features(csv_path):
    """Read one training CSV and return only the columns used as features.

    The identifier columns in ``_ID_COLUMNS`` must be present and are dropped.
    'Unnamed: 0' is the row index that was saved into the CSV (it shows up as
    the first column of the preview); it is dropped as well so the tree cannot
    split on row order. ``errors='ignore'`` keeps the loader working for files
    that were written without that index column.
    """
    frame = pd.read_csv(csv_path).drop(columns=_ID_COLUMNS)
    return frame.drop(columns=['Unnamed: 0'], errors='ignore')


# One feature frame for all players plus one per position group.
dt_train_ovr = _load_features('../training_data/train_data.csv')
dt_train_gk = _load_features('../training_data/train_data_gk.csv')
dt_train_def = _load_features('../training_data/train_data_def.csv')
dt_train_mid = _load_features('../training_data/train_data_mid.csv')
dt_train_fwd = _load_features('../training_data/train_data_fwd.csv')
dt_train_ovr.head()

Unnamed: 0,age,short_passing,dribbling,long_passing,ball_control,interceptions,positioning,vision,marking,gk_handling,gk_reflexes,Goals-16/17,A-16/17,CS-16/17,Yellow-16/17,Red-16/17,Off-16/17,Pen_SV-16/17,Pen_M-16/17,Goals_conceded-16/17,OG-16/17,Apps-16/17,Form-16/17,Goals-15/16,A-15/16,CS-15/16,Yellow-15/16,Red-15/16,Off-15/16,Pen_SV-15/16,Pen_M-15/16,Goals_conceded-15/16,OG-15/16,Apps-15/16,Form-15/16
0,27,72,76,68,75,71,63,68,76,7,12,0,2,5,3,0,3,0,0,37,0,26,2.269231,2,4,11,1,0,1,0,0,50,0,37,3.108108
1,30,78,75,68,80,55,75,74,28,12,15,4,2,0,3,0,7,0,1,0,0,22,2.590909,1,2,0,2,0,7,0,0,0,1,22,2.136364
2,30,76,82,54,80,42,71,73,40,7,11,0,1,0,0,0,5,0,0,0,0,11,1.818182,5,1,0,1,0,8,0,0,0,0,25,2.76
3,26,84,81,80,82,69,82,80,60,11,8,1,4,0,3,0,3,0,0,0,0,23,2.173913,5,5,0,4,0,2,0,0,0,0,31,3.096774
4,24,82,78,74,80,78,73,77,70,12,14,1,0,0,4,0,3,0,0,0,0,20,1.75,2,4,0,6,0,3,0,0,0,0,31,2.279167


In [3]:
# Load the regression targets: overall first, then one file per position
# group, in the same order as the feature frames.
# NOTE(review): the training CSV preview shows a saved row index
# ('Unnamed: 0'); if the target files were written the same way, each y_*
# carries that column as an extra target -- confirm.
y_ovr, y_gk, y_def, y_mid, y_fwd = (
    pd.read_csv(target_path)
    for target_path in (
        '../targets/targets.csv',
        '../targets/gk_targets.csv',
        '../targets/def_targets.csv',
        '../targets/mid_targets.csv',
        '../targets/fwd_targets.csv',
    )
)

In [4]:
def _holdout_split(features, targets):
    """Split features/targets 80/20 into train and test with a fixed seed.

    Returns the four pieces in ``train_test_split`` order:
    (x_train, x_test, y_train, y_test).
    """
    return train_test_split(features, targets, test_size=0.2, random_state=42)


# Same split settings for the overall data and for every position group.
x_train_ovr, x_test_ovr, y_train_ovr, y_test_ovr = _holdout_split(dt_train_ovr, y_ovr)
x_train_gk, x_test_gk, y_train_gk, y_test_gk = _holdout_split(dt_train_gk, y_gk)
x_train_def, x_test_def, y_train_def, y_test_def = _holdout_split(dt_train_def, y_def)
x_train_mid, x_test_mid, y_train_mid, y_test_mid = _holdout_split(dt_train_mid, y_mid)
x_train_fwd, x_test_fwd, y_train_fwd, y_test_fwd = _holdout_split(dt_train_fwd, y_fwd)

In [5]:
# Standardise the training features. Each dataset gets its own scaler, fitted
# on the training split only; the fitted scalers are kept so the test split
# can be transformed with the same statistics. The scaled arrays are wrapped
# in DataFrames (default integer column labels).

# Overall
stand_scale_ovr = preprocessing.StandardScaler()
x_train_ovr_scaled = pd.DataFrame(stand_scale_ovr.fit_transform(x_train_ovr))

# Goalkeepers
stand_scale_gk = preprocessing.StandardScaler()
x_train_gk_scaled = pd.DataFrame(stand_scale_gk.fit_transform(x_train_gk))

# Defenders
stand_scale_def = preprocessing.StandardScaler()
x_train_def_scaled = pd.DataFrame(stand_scale_def.fit_transform(x_train_def))

# Midfielders
stand_scale_mid = preprocessing.StandardScaler()
x_train_mid_scaled = pd.DataFrame(stand_scale_mid.fit_transform(x_train_mid))

# Forwards
stand_scale_fwd = preprocessing.StandardScaler()
x_train_fwd_scaled = pd.DataFrame(stand_scale_fwd.fit_transform(x_train_fwd))

In [6]:
# Scale each test split with the scaler already fitted on its training split
# (transform only -- nothing is re-fitted on test data).
(
    x_test_ovr_scaled,
    x_test_gk_scaled,
    x_test_def_scaled,
    x_test_mid_scaled,
    x_test_fwd_scaled,
) = (
    fitted_scaler.transform(held_out)
    for fitted_scaler, held_out in (
        (stand_scale_ovr, x_test_ovr),
        (stand_scale_gk, x_test_gk),
        (stand_scale_def, x_test_def),
        (stand_scale_mid, x_test_mid),
        (stand_scale_fwd, x_test_fwd),
    )
)

In [7]:
# Cross-validating to tune the tree depth.
# Candidate depths 2..5 are tried for every dataset.
parameters = {'max_depth': list(range(2,6))}


def _tune_tree_depth(features, targets):
    """Grid-search ``max_depth`` for a decision tree using 5-fold CV on R^2.

    Parameters
    ----------
    features : training feature matrix handed to ``GridSearchCV.fit``.
    targets : training targets, row-aligned with ``features``.

    Returns
    -------
    The fitted ``GridSearchCV`` object; ``best_params_['max_depth']`` is the
    candidate depth with the best mean cross-validated R^2.
    """
    search = GridSearchCV(
        # The tree permutes features at every split, so without a seed the
        # selected depth can change from run to run. 42 matches the seed
        # used for the train/test split.
        estimator=DecisionTreeRegressor(random_state=42),
        param_grid=parameters,
        cv=5,
        scoring='r2',
    )
    return search.fit(features, targets)


# One independent search per dataset.
ovr_gs_result = _tune_tree_depth(x_train_ovr_scaled, y_train_ovr)
gk_gs_result = _tune_tree_depth(x_train_gk_scaled, y_train_gk)
def_gs_result = _tune_tree_depth(x_train_def_scaled, y_train_def)
mid_gs_result = _tune_tree_depth(x_train_mid_scaled, y_train_mid)
fwd_gs_result = _tune_tree_depth(x_train_fwd_scaled, y_train_fwd)

In [8]:
# Best max_depth found by each grid search, one per line, in the order
# overall, GK, DEF, MID, FWD.
print(
    *(
        search.best_params_['max_depth']
        for search in (
            ovr_gs_result,
            gk_gs_result,
            def_gs_result,
            mid_gs_result,
            fwd_gs_result,
        )
    ),
    sep='\n',
)


5
4
4
5
4


In [9]:
def _fit_final_tree(search_result, features, targets):
    """Fit a decision tree at the depth selected by a grid search.

    Parameters
    ----------
    search_result : fitted ``GridSearchCV`` whose ``best_params_`` contains
        ``'max_depth'``.
    features : scaled training features.
    targets : training targets, row-aligned with ``features``.

    Returns
    -------
    The fitted ``DecisionTreeRegressor``.
    """
    tree = DecisionTreeRegressor(
        max_depth=search_result.best_params_['max_depth'],
        # Seeded so the fitted tree, and therefore the reported test score,
        # is the same on every run; 42 matches the train/test split seed.
        random_state=42,
    )
    return tree.fit(features, targets)


# Final model per dataset, trained on the full training split.
dt_ovr = _fit_final_tree(ovr_gs_result, x_train_ovr_scaled, y_train_ovr)
dt_gk = _fit_final_tree(gk_gs_result, x_train_gk_scaled, y_train_gk)
dt_def = _fit_final_tree(def_gs_result, x_train_def_scaled, y_train_def)
dt_mid = _fit_final_tree(mid_gs_result, x_train_mid_scaled, y_train_mid)
dt_fwd = _fit_final_tree(fwd_gs_result, x_train_fwd_scaled, y_train_fwd)

In [10]:
# Evaluate every model on its held-out test split. For a regressor, .score()
# returns R^2 (coefficient of determination); 1 signifies a perfect prediction.
r2_ovr = dt_ovr.score(x_test_ovr_scaled, y_test_ovr)
r2_gk = dt_gk.score(x_test_gk_scaled, y_test_gk)
r2_def = dt_def.score(x_test_def_scaled, y_test_def)
r2_mid = dt_mid.score(x_test_mid_scaled, y_test_mid)
r2_fwd = dt_fwd.score(x_test_fwd_scaled, y_test_fwd)

print(f"Overall score: {r2_ovr}")
print(f"GK score: {r2_gk}")
print(f"DEF score: {r2_def}")
print(f"MID score: {r2_mid}")
print(f"FWD score: {r2_fwd}")

Overall score: 0.6217771352208914
GK score: 0.8392048142520195
DEF score: 0.6228681718226028
MID score: 0.6827211310883187
FWD score: 0.8231665648390526
