In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import lightgbm

In [2]:
# Directory (relative to this notebook) that holds the per-position feature CSVs.
# The trailing slash matters: file names are appended by plain string concatenation.
datapath = "../data/"

### Feature Sync

In [3]:
# Window-size suffix of the feature files to load (e.g. 'gks_6.csv').
# Kept as a string because it is concatenated directly into the file names.
window_size = "6"

In [4]:
# Columns retained from every position file; 'Target_Output' is the value the models predict.
keep = ['assists', 'clean_sheets',
       'creativity', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'penalties_saved', 'red_cards', 'saves', 'threat', 'total_points',
       'yellow_cards', 'was_home','opponent_team', 'Target_Output']


def _load_position_features(position):
    """Read one position's feature CSV and prepare it for modelling.

    Parameters
    ----------
    position : str
        File-name prefix of the position group ('gks', 'defs', 'fwds' or 'mids').
        The file read is ``datapath + position + '_' + window_size + '.csv'``.

    Returns
    -------
    pandas.DataFrame
        An independent frame holding only the ``keep`` columns, with
        ``opponent_team`` converted to strings.
    """
    raw = pd.read_csv(datapath + position + '_' + window_size + '.csv')

    # .copy() detaches the column subset from `raw`, so the assignment below
    # writes to a real frame and cannot trigger SettingWithCopyWarning.
    features = raw[keep].copy()

    # Treating opponent team as a categorical variable for now.
    # This week's matchup difficulty cannot currently be calculated because
    # team_h_score and team_a_score are averaged.
    # int -> str so that the ids become string labels rather than numeric values.
    features['opponent_team'] = features['opponent_team'].astype('int').apply(str)
    return features


gk_df = _load_position_features('gks')
def_df = _load_position_features('defs')
fwd_df = _load_position_features('fwds')
mid_df = _load_position_features('mids')

In [5]:
# Sanity check: the goalkeeper frame should expose exactly the columns listed in `keep`.
gk_df.columns

Index(['assists', 'clean_sheets', 'creativity', 'goals_conceded',
       'goals_scored', 'ict_index', 'influence', 'penalties_saved',
       'red_cards', 'saves', 'threat', 'total_points', 'yellow_cards',
       'was_home', 'opponent_team', 'Target_Output'],
      dtype='object')

In [6]:
# Peek at the first five goalkeeper rows to eyeball the feature values.
gk_df.head(n=5)

Unnamed: 0,assists,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,threat,total_points,yellow_cards,was_home,opponent_team,Target_Output
0,0.0,0.166667,0.0,1.333333,0.0,1.466667,14.5,0.0,0.0,2.0,0.0,2.0,0.0,True,3,6.0
1,0.0,0.333333,0.0,1.0,0.0,1.15,11.266667,0.0,0.0,1.5,0.0,6.0,0.0,False,18,10.0
2,0.0,0.5,0.0,0.833333,0.0,1.683333,16.733333,0.0,0.0,2.333333,0.0,10.0,0.0,True,12,10.0
3,0.0,0.666667,0.0,0.666667,0.0,1.9,18.8,0.0,0.0,2.666667,0.0,10.0,0.0,False,5,6.0
4,0.0,0.833333,0.0,0.166667,0.0,1.9,18.933333,0.0,0.0,2.666667,0.0,6.0,0.0,True,1,3.0


### Modeling

In [7]:
# Hyper-parameters passed to every lightgbm.LGBMRegressor built below.
params = dict(
    objective='regression',
    n_estimators=200,
    num_leaves=8,
    max_depth=4,
    learning_rate=0.01,
    reg_alpha=0.01,   # L1 regularisation strength
    reg_lambda=0.01,  # L2 regularisation strength
    verbose=-1,
)

#### Goalkeepers

In [8]:
# Fresh, unfitted LightGBM regressor for the goalkeeper data, configured from `params`.
model = lightgbm.LGBMRegressor(**params)

In [9]:
# One-hot encode the string-valued opponent_team column (one indicator column per
# opponent); get_dummies leaves the numeric and boolean columns as they are.
gk_df = pd.get_dummies(data=gk_df)

In [10]:
# Separate the regression target from the predictors.
y = gk_df['Target_Output']
X = gk_df.drop(columns='Target_Output')

# Two-stage split giving Train/Valid/Test = 0.6/0.2/0.2.
# Stage 1 sets aside the 20% test set; it stays held out and won't be used
# until writing the report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Stage 2: 25% of the remaining 80% (= 20% of all rows) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [11]:
# Fit the boosted regressor on the training split only.
model.fit(X=X_train, y=y_train)

# Predict on both splits so the train/validation gap is visible.
train_preds = model.predict(X_train)
preds = model.predict(X_val)

print('lightgbm Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('lightgbm Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

lightgbm Train MSE: 3.007662
lightgbm Validation MSE: 3.484219


In [12]:
# L1-regularised linear baseline to compare against the boosted model.
# Lasso.fit returns the estimator itself, so construction and fitting can be chained.
clf = linear_model.Lasso(alpha=0.01).fit(X_train, y_train)

train_preds = clf.predict(X_train)
preds = clf.predict(X_val)

print('linear Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('linear Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

# Row 0 holds the feature names, row 1 the fitted coefficients
# (a coefficient of 0 means the feature does not contribute to the prediction).
pd.DataFrame(data=[clf.feature_names_in_, clf.coef_])

linear Train MSE: 3.507056
linear Validation MSE: 3.498940


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,assists,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,...,opponent_team_19,opponent_team_2,opponent_team_20,opponent_team_3,opponent_team_4,opponent_team_5,opponent_team_6,opponent_team_7,opponent_team_8,opponent_team_9
1,0.0,2.592953,0.156794,0.537518,0.0,0.0,0.210124,-0.0,-0.0,-1.083213,...,0.275904,-0.0,-0.162122,0.0,-0.0,-0.0,0.061717,0.0,-0.0,-0.094067


In [13]:
# Naive baseline: use the total_points feature itself as the prediction of the target.
print('Heuristic MSE: %f' % mean_squared_error(y_val, X_val['total_points']))

Heuristic MSE: 6.674487


#### Defenders

In [14]:
# Fresh, unfitted LightGBM regressor for the defender data, configured from `params`.
# Rebinding `model` discards the estimator fitted in the previous section.
model = lightgbm.LGBMRegressor(**params)

In [15]:
# One-hot encode the string-valued opponent_team column (one indicator column per
# opponent); get_dummies leaves the numeric and boolean columns as they are.
def_df = pd.get_dummies(data=def_df)

In [16]:
# Separate the regression target from the predictors.
y = def_df['Target_Output']
X = def_df.drop(columns='Target_Output')

# Two-stage split giving Train/Valid/Test = 0.6/0.2/0.2.
# Stage 1 sets aside the 20% test set; it stays held out and won't be used
# until writing the report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Stage 2: 25% of the remaining 80% (= 20% of all rows) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [17]:
# Fit the boosted regressor on the defenders' training split only.
model.fit(X=X_train, y=y_train)

# Predict on both splits so the train/validation gap is visible.
train_preds = model.predict(X_train)
preds = model.predict(X_val)

print('lightgbm Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('lightgbm Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

lightgbm Train MSE: 4.411185
lightgbm Validation MSE: 4.369393


In [18]:
# L1-regularised linear baseline for defenders.
# Lasso.fit returns the estimator itself, so construction and fitting can be chained.
clf = linear_model.Lasso(alpha=0.01).fit(X_train, y_train)

train_preds = clf.predict(X_train)
preds = clf.predict(X_val)

print('linear Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('linear Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

# Row 0 holds the feature names, row 1 the fitted coefficients
# (a coefficient of 0 means the feature does not contribute to the prediction).
pd.DataFrame(data=[clf.feature_names_in_, clf.coef_])

linear Train MSE: 4.765697
linear Validation MSE: 4.521456


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,assists,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,...,opponent_team_19,opponent_team_2,opponent_team_20,opponent_team_3,opponent_team_4,opponent_team_5,opponent_team_6,opponent_team_7,opponent_team_8,opponent_team_9
1,-0.0,0.313182,0.049491,0.0,-0.0,0.0,0.076523,0.0,-0.0,0.0,...,0.0,-0.085163,0.0,0.196909,0.001808,-0.0,0.038692,0.075034,0.0,-0.108624


In [19]:
# Naive baseline: use the total_points feature itself as the prediction of the target.
print('Heuristic MSE: %f' % mean_squared_error(y_val, X_val['total_points']))

Heuristic MSE: 8.003342


#### Forwards

In [20]:
# Fresh, unfitted LightGBM regressor for the forward data, configured from `params`.
# Rebinding `model` discards the estimator fitted in the previous section.
model = lightgbm.LGBMRegressor(**params)

In [21]:
# One-hot encode the string-valued opponent_team column (one indicator column per
# opponent); get_dummies leaves the numeric and boolean columns as they are.
fwd_df = pd.get_dummies(data=fwd_df)

In [22]:
# Separate the regression target from the predictors.
y = fwd_df['Target_Output']
X = fwd_df.drop(columns='Target_Output')

# Two-stage split giving Train/Valid/Test = 0.6/0.2/0.2.
# Stage 1 sets aside the 20% test set; it stays held out and won't be used
# until writing the report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Stage 2: 25% of the remaining 80% (= 20% of all rows) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [23]:
# Fit the boosted regressor on the forwards' training split only.
model.fit(X=X_train, y=y_train)

# Predict on both splits so the train/validation gap is visible.
train_preds = model.predict(X_train)
preds = model.predict(X_val)

print('lightgbm Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('lightgbm Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

lightgbm Train MSE: 4.887702
lightgbm Validation MSE: 6.015776


In [24]:
# L1-regularised linear baseline for forwards.
# Lasso.fit returns the estimator itself, so construction and fitting can be chained.
clf = linear_model.Lasso(alpha=0.01).fit(X_train, y_train)

train_preds = clf.predict(X_train)
preds = clf.predict(X_val)

print('linear Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('linear Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

# Row 0 holds the feature names, row 1 the fitted coefficients
# (a coefficient of 0 means the feature does not contribute to the prediction).
pd.DataFrame(data=[clf.feature_names_in_, clf.coef_])

linear Train MSE: 5.574827
linear Validation MSE: 6.227419


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,assists,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,...,opponent_team_19,opponent_team_2,opponent_team_20,opponent_team_3,opponent_team_4,opponent_team_5,opponent_team_6,opponent_team_7,opponent_team_8,opponent_team_9
1,-0.0,0.0,0.056386,0.589428,0.0,0.0,0.00118,0.0,-0.0,0.0,...,0.024098,-0.0,-0.034499,0.560914,0.0,0.331073,-0.0,-0.174466,-0.0,-0.542257


In [25]:
# Naive baseline: use the total_points feature itself as the prediction of the target.
print('Heuristic MSE: %f' % mean_squared_error(y_val, X_val['total_points']))

Heuristic MSE: 9.268949


#### Midfielders

In [26]:
# Fresh, unfitted LightGBM regressor for the midfielder data, configured from `params`.
# Rebinding `model` discards the estimator fitted in the previous section.
model = lightgbm.LGBMRegressor(**params)

In [27]:
# One-hot encode the string-valued opponent_team column (one indicator column per
# opponent); get_dummies leaves the numeric and boolean columns as they are.
mid_df = pd.get_dummies(data=mid_df)

In [28]:
# Separate the regression target from the predictors.
y = mid_df['Target_Output']
X = mid_df.drop(columns='Target_Output')

# Two-stage split giving Train/Valid/Test = 0.6/0.2/0.2.
# Stage 1 sets aside the 20% test set; it stays held out and won't be used
# until writing the report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Stage 2: 25% of the remaining 80% (= 20% of all rows) becomes the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [29]:
# Fit the boosted regressor on the midfielders' training split only.
model.fit(X=X_train, y=y_train)

# Predict on both splits so the train/validation gap is visible.
train_preds = model.predict(X_train)
preds = model.predict(X_val)

print('lightgbm Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('lightgbm Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

lightgbm Train MSE: 4.229285
lightgbm Validation MSE: 5.060418


In [30]:
# L1-regularised linear baseline for midfielders.
# Lasso.fit returns the estimator itself, so construction and fitting can be chained.
clf = linear_model.Lasso(alpha=0.01).fit(X_train, y_train)

train_preds = clf.predict(X_train)
preds = clf.predict(X_val)

print('linear Train MSE: %f' % mean_squared_error(y_true=y_train, y_pred=train_preds))
print('linear Validation MSE: %f' % mean_squared_error(y_true=y_val, y_pred=preds))

# Row 0 holds the feature names, row 1 the fitted coefficients
# (a coefficient of 0 means the feature does not contribute to the prediction).
pd.DataFrame(data=[clf.feature_names_in_, clf.coef_])

linear Train MSE: 4.569375
linear Validation MSE: 5.270811


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,assists,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,penalties_saved,red_cards,saves,...,opponent_team_19,opponent_team_2,opponent_team_20,opponent_team_3,opponent_team_4,opponent_team_5,opponent_team_6,opponent_team_7,opponent_team_8,opponent_team_9
1,-0.0,0.646223,0.027473,0.240216,-0.0,0.0,0.036586,0.0,0.0,0.0,...,0.0,-0.027442,-0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,-0.0


In [31]:
# Naive baseline: use the total_points feature itself as the prediction of the target.
print('Heuristic MSE: %f' % mean_squared_error(y_val, X_val['total_points']))

Heuristic MSE: 8.478620
