In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
pd.set_option('max_columns',1000)

In [2]:
lr_train_gk = pd.read_csv('../training_data/train_data_gk.csv')
lr_train_def = pd.read_csv('../training_data/train_data_def.csv')
lr_train_mid = pd.read_csv('../training_data/train_data_mid.csv')
lr_train_fwd = pd.read_csv('../training_data/train_data_fwd.csv')
lr_train_gk.drop(['Players','club','Position','league'], axis = 1, inplace = True)
lr_train_def.drop(['Players','club','Position','league'], axis = 1, inplace = True)
lr_train_mid.drop(['Players','club','Position','league'], axis = 1, inplace = True)
lr_train_fwd.drop(['Players','club','Position','league'], axis = 1, inplace = True)
lr_train_gk.head()

Unnamed: 0,short_passing,dribbling,long_passing,sprint_speed,interceptions,gk_diving,Goals-16/17,A-16/17,CS-16/17,Yellow-16/17,Red-16/17,Off-16/17,Pen_SV-16/17,Pen_M-16/17,Goals_conceded-16/17,OG-16/17,Apps-16/17,Form-16/17,Goals-15/16,A-15/16,CS-15/16,Yellow-15/16,Red-15/16,Off-15/16,Pen_SV-15/16,Pen_M-15/16,Goals_conceded-15/16,OG-15/16,Apps-15/16
0,28,19,25,48,22,73,0,0,1,1,0,0,0,0,3,0,2,2.5,0,0,0,0,0,1,0,0,16,0,6
1,52,14,38,45,17,78,0,0,10,4,0,0,1,0,41,0,36,2.722222,0,0,7,0,0,1,0,0,27,0,24
2,27,18,26,45,19,74,0,0,3,2,0,0,0,0,66,0,29,0.965517,0,0,11,2,0,0,2,0,43,0,35
3,26,12,22,33,11,67,0,0,0,0,0,1,0,0,5,0,2,-0.5,0,0,0,0,0,0,0,0,5,0,1
4,20,12,13,42,20,68,0,0,0,1,0,0,0,0,5,0,3,1.0,0,0,3,0,0,0,0,0,5,0,6


In [3]:
# Reading target data
y_gk = pd.read_csv('../targets/gk_targets.csv')
y_def = pd.read_csv('../targets/def_targets.csv')
y_mid = pd.read_csv('../targets/mid_targets.csv')
y_fwd = pd.read_csv('../targets/fwd_targets.csv')

In [4]:
# Splitting Data into train and test
x_train_gk, x_test_gk, y_train_gk, y_test_gk = train_test_split(lr_train_gk, y_gk, test_size = 0.2, random_state = 42)
x_train_def, x_test_def, y_train_def, y_test_def = train_test_split(lr_train_def, y_def, test_size = 0.2, random_state = 42)
x_train_mid, x_test_mid, y_train_mid, y_test_mid = train_test_split(lr_train_mid, y_mid, test_size = 0.2, random_state = 42)
x_train_fwd, x_test_fwd, y_train_fwd, y_test_fwd = train_test_split(lr_train_fwd, y_fwd, test_size = 0.2, random_state = 42)

In [5]:
# Standard scaling the train data
stand_scale_gk = preprocessing.StandardScaler()
stand_scale_def = preprocessing.StandardScaler()
stand_scale_mid = preprocessing.StandardScaler()
stand_scale_fwd = preprocessing.StandardScaler()
x_train_gk_scaled = stand_scale_gk.fit_transform(x_train_gk)
x_train_def_scaled = stand_scale_def.fit_transform(x_train_def)
x_train_mid_scaled = stand_scale_mid.fit_transform(x_train_mid)
x_train_fwd_scaled = stand_scale_fwd.fit_transform(x_train_fwd)
x_train_gk_scaled = pd.DataFrame(x_train_gk_scaled)
x_train_def_scaled = pd.DataFrame(x_train_def_scaled)
x_train_mid_scaled = pd.DataFrame(x_train_mid_scaled)
x_train_fwd_scaled = pd.DataFrame(x_train_fwd_scaled)

In [6]:
# Standard scaling the test data
x_test_gk_scaled = stand_scale_gk.transform(x_test_gk)
x_test_def_scaled = stand_scale_def.transform(x_test_def)
x_test_mid_scaled = stand_scale_mid.transform(x_test_mid)
x_test_fwd_scaled = stand_scale_fwd.transform(x_test_fwd)

In [7]:
# # Training a regression model with L2 regularisation
# lin_reg_gk = Ridge(alpha = 1.0)
# lin_reg_def = Ridge(alpha = 1.0)
# lin_reg_mid = Ridge(alpha = 1.0)
# lin_reg_fwd = Ridge(alpha = 1.0)
# # lin_reg_gk = lin_reg_gk.fit(x_train_gk_scaled,y_train_gk)
# # lin_reg_def = lin_reg_def.fit(x_train_def_scaled,y_train_def)
# # lin_reg_mid = lin_reg_mid.fit(x_train_mid_scaled,y_train_mid)
# # lin_reg_fwd = lin_reg_fwd.fit(x_train_fwd_scaled,y_train_fwd)

In [8]:
# # Cross validation
# gk_scores = cross_val_score(lin_reg_gk,x_train_gk, y_train_gk, cv = 5)
# def_scores = cross_val_score(lin_reg_def,x_train_def, y_train_def, cv = 5)
# mid_scores = cross_val_score(lin_reg_mid,x_train_mid, y_train_mid, cv = 5)
# fwd_scores = cross_val_score(lin_reg_fwd,x_train_fwd, y_train_fwd, cv = 5)

In [9]:
# Cross-validation to tune hyperparameter
gs = GridSearchCV(estimator = Ridge(),
                  param_grid = {'alpha' : [1,0.1,0.01,0.001,0]},
                  cv = 5,
                  scoring = 'r2',
                  n_jobs = -1)
gk_gs_result = gs.fit(x_train_gk_scaled,y_train_gk)
gs = GridSearchCV(estimator = Ridge(),
                  param_grid = {'alpha' : [1,0.1,0.01,0.001,0]},
                  cv = 5,
                  scoring = 'r2',
                  n_jobs = -1)
def_gs_result = gs.fit(x_train_def_scaled,y_train_def)
gs = GridSearchCV(estimator = Ridge(),
                  param_grid = {'alpha' : [1,0.1,0.01,0.001,0]},
                  cv = 5,
                  scoring = 'r2',
                  n_jobs = -1)
mid_gs_result = gs.fit(x_train_mid_scaled,y_train_mid)
gs = GridSearchCV(estimator = Ridge(),
                  param_grid = {'alpha' : [1,0.1,0.01,0.001,0]},
                  cv = 5,
                  scoring = 'r2',
                  n_jobs = -1)
fwd_gs_result = gs.fit(x_train_fwd_scaled,y_train_fwd)


In [10]:
print(gk_gs_result.best_params_)
print(def_gs_result.best_params_)
print(mid_gs_result.best_params_)
print(fwd_gs_result.best_params_)

{'alpha': 1}
{'alpha': 1}
{'alpha': 1}
{'alpha': 1}


In [11]:
# Training data on the best parameter
lin_reg_gk = Ridge(alpha = 1.0)
lin_reg_def = Ridge(alpha = 1.0)
lin_reg_mid = Ridge(alpha = 1.0)
lin_reg_fwd = Ridge(alpha = 1.0)
lin_reg_gk = lin_reg_gk.fit(x_train_gk_scaled,y_train_gk)
lin_reg_def = lin_reg_def.fit(x_train_def_scaled,y_train_def)
lin_reg_mid = lin_reg_mid.fit(x_train_mid_scaled,y_train_mid)
lin_reg_fwd = lin_reg_fwd.fit(x_train_fwd_scaled,y_train_fwd)

In [12]:
# print(f"GK:{gk_scores}")
# print(f"DEF:{def_scores}")
# print(f"MID:{mid_scores}")
# print(f"FWD:{fwd_scores}")

In [13]:
# Calculating variance score where 1 signifies a perfect prediction
print(f"GK score: {lin_reg_gk.score(x_test_gk_scaled, y_test_gk)}")
print(f"DEF score: {lin_reg_def.score(x_test_def_scaled, y_test_def)}")
print(f"MID score: {lin_reg_mid.score(x_test_mid_scaled, y_test_mid)}")
print(f"FWD score: {lin_reg_fwd.score(x_test_fwd_scaled, y_test_fwd)}")

GK score: 0.852797888418026
DEF score: 0.5397202923511721
MID score: 0.6620779821330262
FWD score: 0.8529675098249898


In [None]:
print(lin_reg_gk.coef_)