In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
# Show up to 1000 columns when displaying wide frames. The fully qualified
# option name is used because the short form 'max_columns' is resolved by
# pattern matching and is ambiguous on pandas versions that also define
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', 1000)

In [2]:
def _load_features(csv_path):
    """Read one training CSV and return only the columns used as model features.

    The identifier columns 'Players', 'club' and 'Position' are removed.
    An 'Unnamed: 0' column, when present, is removed as well: it shows up in
    the position-specific files as a plain row counter (presumably an index
    that was saved along with the data) and must not be fed to the model.
    """
    features = pd.read_csv(csv_path).drop(columns=['Players', 'club', 'Position'])
    # errors='ignore' because not every file carries the stray index column.
    return features.drop(columns=['Unnamed: 0'], errors='ignore')


lr_train_ovr = _load_features('../training_data/train_data.csv')
lr_train_gk = _load_features('../training_data/train_data_gk.csv')
lr_train_def = _load_features('../training_data/train_data_def.csv')
lr_train_mid = _load_features('../training_data/train_data_mid.csv')
lr_train_fwd = _load_features('../training_data/train_data_fwd.csv')
lr_train_gk.head()

Unnamed: 0,age,short_passing,dribbling,long_passing,ball_control,interceptions,positioning,vision,marking,gk_handling,gk_reflexes,Goals-16/17,A-16/17,CS-16/17,Yellow-16/17,Red-16/17,Off-16/17,Pen_SV-16/17,Pen_M-16/17,Goals_conceded-16/17,OG-16/17,Apps-16/17,Form-16/17,Goals-15/16,A-15/16,CS-15/16,Yellow-15/16,Red-15/16,Off-15/16,Pen_SV-15/16,Pen_M-15/16,Goals_conceded-15/16,OG-15/16,Apps-15/16,Form-15/16
0,32,28,19,25,27,22,16,63,20,69,73,0,0,1,1,0,0,0,0,3,0,2,2.5,0,0,0,0,0,1,0,0,16,0,6,0.333333
1,18,52,14,38,10,17,4,45,5,77,79,0,0,10,4,0,0,1,0,41,0,36,2.722222,0,0,7,0,0,1,0,0,27,0,24,2.791667
2,39,27,18,26,23,19,10,49,12,76,72,0,0,3,2,0,0,0,0,66,0,29,0.965517,0,0,11,2,0,0,2,0,43,0,35,2.942857
3,25,26,12,22,23,11,11,38,13,72,72,0,0,0,0,0,1,0,0,5,0,2,-0.5,0,0,0,0,0,0,0,0,5,0,1,-2.0
4,28,20,12,13,11,20,11,25,11,68,72,0,0,0,1,0,0,0,0,5,0,3,1.0,0,0,3,0,0,0,0,0,5,0,6,3.666667


In [3]:
# Load the target tables: one for the complete data set and one per position group
y_ovr, y_gk, y_def, y_mid, y_fwd = map(
    pd.read_csv,
    (
        '../targets/targets.csv',
        '../targets/gk_targets.csv',
        '../targets/def_targets.csv',
        '../targets/mid_targets.csv',
        '../targets/fwd_targets.csv',
    ),
)

In [4]:
# Hold out 20% of every data set for testing; the fixed seed keeps the splits repeatable
split_options = {'test_size': 0.2, 'random_state': 42}
x_train_ovr, x_test_ovr, y_train_ovr, y_test_ovr = train_test_split(
    lr_train_ovr, y_ovr, **split_options)
x_train_gk, x_test_gk, y_train_gk, y_test_gk = train_test_split(
    lr_train_gk, y_gk, **split_options)
x_train_def, x_test_def, y_train_def, y_test_def = train_test_split(
    lr_train_def, y_def, **split_options)
x_train_mid, x_test_mid, y_train_mid, y_test_mid = train_test_split(
    lr_train_mid, y_mid, **split_options)
x_train_fwd, x_test_fwd, y_train_fwd, y_test_fwd = train_test_split(
    lr_train_fwd, y_fwd, **split_options)

In [5]:
# Fit one StandardScaler per data set on its training features and keep the
# scaled result as a DataFrame (default integer column labels and index).
stand_scale_ovr = preprocessing.StandardScaler()
x_train_ovr_scaled = pd.DataFrame(stand_scale_ovr.fit_transform(x_train_ovr))

stand_scale_gk = preprocessing.StandardScaler()
x_train_gk_scaled = pd.DataFrame(stand_scale_gk.fit_transform(x_train_gk))

stand_scale_def = preprocessing.StandardScaler()
x_train_def_scaled = pd.DataFrame(stand_scale_def.fit_transform(x_train_def))

stand_scale_mid = preprocessing.StandardScaler()
x_train_mid_scaled = pd.DataFrame(stand_scale_mid.fit_transform(x_train_mid))

stand_scale_fwd = preprocessing.StandardScaler()
x_train_fwd_scaled = pd.DataFrame(stand_scale_fwd.fit_transform(x_train_fwd))

In [6]:
# Apply each already-fitted scaler to its matching test features
# (transform only, so the test data never influences the scaling statistics).
(x_test_ovr_scaled,
 x_test_gk_scaled,
 x_test_def_scaled,
 x_test_mid_scaled,
 x_test_fwd_scaled) = (
    scaler.transform(test_features)
    for scaler, test_features in (
        (stand_scale_ovr, x_test_ovr),
        (stand_scale_gk, x_test_gk),
        (stand_scale_def, x_test_def),
        (stand_scale_mid, x_test_mid),
        (stand_scale_fwd, x_test_fwd),
    )
)

In [7]:
# Tune the Ridge regularisation strength with 5-fold cross-validation,
# running an independent search for every data set.
def _ridge_alpha_search():
    """Return a fresh, unfitted grid search over Ridge's alpha, scored by r2."""
    return GridSearchCV(
        estimator=Ridge(),
        param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0]},
        cv=5,
        scoring='r2',
        n_jobs=-1,
    )


ovr_gs_result = _ridge_alpha_search().fit(x_train_ovr_scaled, y_train_ovr)
gk_gs_result = _ridge_alpha_search().fit(x_train_gk_scaled, y_train_gk)
def_gs_result = _ridge_alpha_search().fit(x_train_def_scaled, y_train_def)
mid_gs_result = _ridge_alpha_search().fit(x_train_mid_scaled, y_train_mid)
fwd_gs_result = _ridge_alpha_search().fit(x_train_fwd_scaled, y_train_fwd)

# `gs` stays bound to the most recent (forward) search object.
gs = fwd_gs_result


In [8]:
# Report the winning hyperparameters of every search, in the order
# overall, goalkeepers, defenders, midfielders, forwards.
for search in (ovr_gs_result, gk_gs_result, def_gs_result, mid_gs_result, fwd_gs_result):
    print(search.best_params_)

{'alpha': 1}
{'alpha': 1}
{'alpha': 1}
{'alpha': 1}
{'alpha': 1}


In [9]:
# Train the final Ridge model for each data set on its full training split.
# The regularisation strength is taken from the matching grid search's
# best_params_ instead of being hard-coded, so the models cannot silently
# drift from the tuning result if the data or the grid changes.
lin_reg_ovr = Ridge(**ovr_gs_result.best_params_).fit(x_train_ovr_scaled, y_train_ovr)
lin_reg_gk = Ridge(**gk_gs_result.best_params_).fit(x_train_gk_scaled, y_train_gk)
lin_reg_def = Ridge(**def_gs_result.best_params_).fit(x_train_def_scaled, y_train_def)
lin_reg_mid = Ridge(**mid_gs_result.best_params_).fit(x_train_mid_scaled, y_train_mid)
lin_reg_fwd = Ridge(**fwd_gs_result.best_params_).fit(x_train_fwd_scaled, y_train_fwd)

In [10]:
# Evaluate every model on its held-out test split. `score` on a regressor
# reports R^2 (coefficient of determination); 1 means a perfect prediction.
for label, model, test_features, test_target in (
    ("Overall", lin_reg_ovr, x_test_ovr_scaled, y_test_ovr),
    ("GK", lin_reg_gk, x_test_gk_scaled, y_test_gk),
    ("DEF", lin_reg_def, x_test_def_scaled, y_test_def),
    ("MID", lin_reg_mid, x_test_mid_scaled, y_test_mid),
    ("FWD", lin_reg_fwd, x_test_fwd_scaled, y_test_fwd),
):
    print(f"{label} score: {model.score(test_features, test_target)}")

Overall score: 0.6299310445097301
GK score: 0.9469372943782862
DEF score: 0.539816139446528
MID score: 0.6879849644116449
FWD score: 0.8898657196039338


In [12]:
# Inspect the overall model: pair each feature column with its learned weight.
# coef_ is indexed with [0] to take the coefficient row of the first target.
ovr_feature_names = lr_train_ovr.columns.tolist()
ovr_weights = lin_reg_ovr.coef_[0].tolist()
for position in range(min(len(ovr_feature_names), len(ovr_weights))):
    print(ovr_feature_names[position], ":", ovr_weights[position])

age : 0.3386291729547622
short_passing : 1.5903121906628865
dribbling : -0.44742064427413025
long_passing : -0.14128652450055806
ball_control : 2.2626373434115443
interceptions : 1.1404060040612516
positioning : -0.6806252675557879
vision : 0.4473542437643282
marking : -0.135770483375483
gk_handling : 1.329285408285954
gk_reflexes : 2.065905583187292
Goals-16/17 : 0.6172779123162258
A-16/17 : 0.37568386204262066
CS-16/17 : 0.90970932837866
Yellow-16/17 : 0.08575150395870303
Red-16/17 : 0.017915578947070406
Off-16/17 : 0.1150248335234144
Pen_SV-16/17 : 0.1397422228596003
Pen_M-16/17 : 9.580322315394429e-05
Goals_conceded-16/17 : -0.5452899468066389
OG-16/17 : 0.01868641902820568
Apps-16/17 : 0.35141533651867674
Form-16/17 : 0.6096160541720865
Goals-15/16 : 0.43624614071511203
A-15/16 : 0.2260413340044155
CS-15/16 : 0.4008392778133993
Yellow-15/16 : 0.15625652234880827
Red-15/16 : 0.05204909865429564
Off-15/16 : -0.2909625792702088
Pen_SV-15/16 : 0.17120516381824052
Pen_M-15/16 : 0.04843