# Train simple baseline models on merged (full) seasons data, split by position, with historical rolling features and next-opponent team stats

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from src.data.data_loader import load_master_team_list
from src.data.data_loader import load_understat_team_stats
from src.features.data_engineering import preprocess_seasons_data, reverse_processing, get_oponent_team_stats
from src.features.utils import idx_to_team_name, str_date_days_forward

In [2]:
from src.data.data_loader import load_average_pts

# Load the average-points data through the project loader.
# NOTE(review): `average_pts` is not referenced again in the visible cells of this
# notebook — confirm it is still needed.
average_pts = load_average_pts()

In [10]:
import os

# Load the per-position datasets from <project root>/data/intermediate/separate/.
# The project root is taken to be the parent of the current working directory.
# Paths are built with os.path.join instead of hard-coded '\\' separators so the
# notebook also runs on Linux/macOS. The trailing '' keeps a trailing separator on
# `data_intermediate_path`, so any code that still concatenates onto it keeps working.
data_intermediate_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'intermediate', '')
data_separate_path = os.path.join(data_intermediate_path, 'separate')

data_gk = pd.read_csv(os.path.join(data_separate_path, 'gk.csv'))
data_def = pd.read_csv(os.path.join(data_separate_path, 'def.csv'))
data_mid = pd.read_csv(os.path.join(data_separate_path, 'mid.csv'))
data_fwd = pd.read_csv(os.path.join(data_separate_path, 'fwd.csv'))

In [11]:
# Columns passed as `rolling_columns` to preprocess_seasons_data, one list per position.
rolling_columns_gk = [
    'bonus', 'bps', 'clean_sheets', 'goals_conceded', 'influence',
    'minutes', 'penalties_saved', 'saves', 'selected',
    'player_team_score', 'opponent_team_score', 'total_points',
    'transfers_in', 'transfers_out', 'value',
]

rolling_columns_def = [
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
    'goals_conceded', 'goals_scored', 'ict_index', 'influence',
    'minutes', 'selected', 'player_team_score', 'opponent_team_score',
    'threat', 'total_points', 'transfers_in', 'transfers_out',
    'value', 'yellow_cards',
]

# Midfielders use exactly the defenders' column set (independent copy).
rolling_columns_mid = list(rolling_columns_def)

rolling_columns_fwd = [
    'assists', 'bonus', 'bps', 'creativity', 'goals_scored',
    'ict_index', 'influence', 'minutes', 'selected',
    'player_team_score', 'opponent_team_score', 'threat',
    'total_points', 'transfers_in', 'transfers_out',
    'value', 'yellow_cards',
]

# Values passed as `rolling_times` to preprocess_seasons_data.
times = ['all', 6, 3]

# Pairs of [season, list of integers] passed as `test_subset` to preprocess_seasons_data.
# NOTE(review): the integers look like gameweek numbers — confirm against preprocess_seasons_data.
test_subset_XL = (
    ['2016-17', [4, 8, 9, 14, 18, 20, 21, 26, 32]],
    ['2018-19', list(range(1, 11))],
    ['2021-22', list(range(17, 27))],
)

In [12]:
# Keyword arguments shared by all four per-position preprocessing calls; only the
# input frame and its rolling-column list differ between positions.
_preprocess_options = dict(
    random_split=True,
    test_subset=test_subset_XL,
    rolling_features=True,
    rolling_times=times,
    opponent_team_stats=True,
)

# Each call is unpacked as:
#   (x_train, y_train), (x_test, y_test), (x_train_target, x_test_target), x_scaler
(x_train_gk, y_train_gk), (x_test_gk, y_test_gk), (x_train_target_gk, x_test_target_gk), x_scaler_gk = \
    preprocess_seasons_data(data_gk, rolling_columns=rolling_columns_gk, **_preprocess_options)

(x_train_def, y_train_def), (x_test_def, y_test_def), (x_train_target_def, x_test_target_def), x_scaler_def = \
    preprocess_seasons_data(data_def, rolling_columns=rolling_columns_def, **_preprocess_options)

(x_train_mid, y_train_mid), (x_test_mid, y_test_mid), (x_train_target_mid, x_test_target_mid), x_scaler_mid = \
    preprocess_seasons_data(data_mid, rolling_columns=rolling_columns_mid, **_preprocess_options)

(x_train_fwd, y_train_fwd), (x_test_fwd, y_test_fwd), (x_train_target_fwd, x_test_target_fwd), x_scaler_fwd = \
    preprocess_seasons_data(data_fwd, rolling_columns=rolling_columns_fwd, **_preprocess_options)

## Train baseline Dummy model

In [19]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

In [20]:
# Mean-strategy dummy regressors, one per position, as a floor for the real models.
# `fit` returns the estimator itself, so construction and fitting are chained.
dummy_regr_gk = DummyRegressor(strategy="mean").fit(x_train_gk, y_train_gk)
dummy_regr_def = DummyRegressor(strategy="mean").fit(x_train_def, y_train_def)
dummy_regr_mid = DummyRegressor(strategy="mean").fit(x_train_mid, y_train_mid)
dummy_regr_fwd = DummyRegressor(strategy="mean").fit(x_train_fwd, y_train_fwd)

# Echo the last fitted estimator so the cell still displays its repr.
dummy_regr_fwd

DummyRegressor()

In [21]:
# Dummy-baseline predictions on each position's test split.
y_pred_gk = dummy_regr_gk.predict(X=x_test_gk)
y_pred_def = dummy_regr_def.predict(X=x_test_def)
y_pred_mid = dummy_regr_mid.predict(X=x_test_mid)
y_pred_fwd = dummy_regr_fwd.predict(X=x_test_fwd)

In [22]:
# Dummy baseline test MSE for goalkeepers. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_gk, y_pred_gk)

4.898789656592843

In [23]:
# Dummy baseline test MSE for defenders. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_def, y_pred_def)

6.263342308849872

In [24]:
# Dummy baseline test MSE for midfielders. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_mid, y_pred_mid)

5.9069129491351

In [25]:
# Dummy baseline test MSE for forwards. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_fwd, y_pred_fwd)

7.027382892899582

## Train baseline MLP model

In [27]:
import tensorflow as tf

### MLP GK

In [28]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and therefore the
# reported loss) is repeatable across runs. This also reseeds the global
# `random` / `numpy.random` state for any later cells.
tf.keras.utils.set_random_seed(42)

# Goalkeeper MLP: 128 -> 256 -> 64 ReLU hidden units and a single linear output.
model_mlp_gk = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(x_train_gk.shape[1],)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])


In [29]:
# Adam optimiser with mean-squared-error loss.
model_mlp_gk.compile(loss='mse', optimizer='adam')

In [30]:
# Train the goalkeeper MLP; validation_split=0.3 reserves 30% of the training data
# for validation. The returned object is kept in `history_gk`.
history_gk = model_mlp_gk.fit(
    x=x_train_gk,
    y=y_train_gk,
    epochs=15,
    batch_size=16,
    validation_split=0.3,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [31]:
# Loss (MSE, as compiled) of the goalkeeper MLP on the held-out test split.
score_gk = model_mlp_gk.evaluate(x=x_test_gk, y=y_test_gk, verbose=0)
print('Test loss:', score_gk)

Test loss: 2.7315902709960938


### MLP DEF

In [32]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and therefore the
# reported loss) is repeatable across runs. This also reseeds the global
# `random` / `numpy.random` state for any later cells.
tf.keras.utils.set_random_seed(42)

# Defender MLP: 128 -> 256 -> 64 ReLU hidden units and a single linear output.
model_mlp_def = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(x_train_def.shape[1],)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])


In [33]:
# Adam optimiser with mean-squared-error loss.
model_mlp_def.compile(loss='mse', optimizer='adam')

In [34]:
# Train the defender MLP; validation_split=0.3 reserves 30% of the training data
# for validation. The returned object is kept in `history_def`.
history_def = model_mlp_def.fit(
    x=x_train_def,
    y=y_train_def,
    epochs=15,
    batch_size=16,
    validation_split=0.3,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [35]:
# Loss (MSE, as compiled) of the defender MLP on the held-out test split.
score_def = model_mlp_def.evaluate(x=x_test_def, y=y_test_def, verbose=0)
print('Test loss:', score_def)

Test loss: 4.776841640472412


### MLP MID

In [36]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and therefore the
# reported loss) is repeatable across runs. This also reseeds the global
# `random` / `numpy.random` state for any later cells.
tf.keras.utils.set_random_seed(42)

# Midfielder MLP: 128 -> 256 -> 64 ReLU hidden units and a single linear output.
model_mlp_mid = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(x_train_mid.shape[1],)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])


In [37]:
# Adam optimiser with mean-squared-error loss.
model_mlp_mid.compile(loss='mse', optimizer='adam')

In [38]:
# Train the midfielder MLP; validation_split=0.3 reserves 30% of the training data
# for validation. The returned object is kept in `history_mid`.
history_mid = model_mlp_mid.fit(
    x=x_train_mid,
    y=y_train_mid,
    epochs=15,
    batch_size=16,
    validation_split=0.3,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [39]:
# Loss (MSE, as compiled) of the midfielder MLP on the held-out test split.
score_mid = model_mlp_mid.evaluate(x=x_test_mid, y=y_test_mid, verbose=0)
print('Test loss:', score_mid)

Test loss: 4.210040092468262


### MLP FWD

In [40]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and therefore the
# reported loss) is repeatable across runs. This also reseeds the global
# `random` / `numpy.random` state for any later cells.
tf.keras.utils.set_random_seed(42)

# Forward MLP: 128 -> 256 -> 64 ReLU hidden units and a single linear output.
model_mlp_fwd = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(x_train_fwd.shape[1],)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])


In [41]:
# Adam optimiser with mean-squared-error loss.
model_mlp_fwd.compile(loss='mse', optimizer='adam')

In [42]:
# Train the forward MLP; validation_split=0.3 reserves 30% of the training data
# for validation. The returned object is kept in `history_fwd`.
history_fwd = model_mlp_fwd.fit(
    x=x_train_fwd,
    y=y_train_fwd,
    epochs=15,
    batch_size=16,
    validation_split=0.3,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [43]:
# Loss (MSE, as compiled) of the forward MLP on the held-out test split.
score_fwd = model_mlp_fwd.evaluate(x=x_test_fwd, y=y_test_fwd, verbose=0)
print('Test loss:', score_fwd)

Test loss: 5.272583484649658


## Train baseline XGBoost model

In [44]:
import xgboost as xgb

### XGB GK

In [45]:
# XGBoost regressor for goalkeepers; no hyperparameters are passed, so the
# library defaults apply.
model_xgb_gk = xgb.XGBRegressor()

In [46]:
# Fit on the goalkeeper training split; the fitted estimator is echoed as cell output.
model_xgb_gk.fit(X=x_train_gk, y=y_train_gk)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [47]:
# Wrap the raw predictions in a Series that reuses y_test_gk's index, so predictions
# and targets stay aligned row by row.
y_pred_xgb_gk = pd.Series(
    data=model_xgb_gk.predict(x_test_gk),
    index=y_test_gk.index,
    name='predicted_total_points_next_gameweek',
)

In [48]:
# XGBoost test MSE for goalkeepers. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_gk, y_pred_xgb_gk)

3.1491762980915836

### XGB DEF

In [49]:
# XGBoost regressor for defenders; no hyperparameters are passed, so the
# library defaults apply.
model_xgb_def = xgb.XGBRegressor()

In [50]:
# Fit on the defender training split; the fitted estimator is echoed as cell output.
model_xgb_def.fit(X=x_train_def, y=y_train_def)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [51]:
# Wrap the raw predictions in a Series that reuses y_test_def's index, so predictions
# and targets stay aligned row by row.
y_pred_xgb_def = pd.Series(
    data=model_xgb_def.predict(x_test_def),
    index=y_test_def.index,
    name='predicted_total_points_next_gameweek',
)

In [52]:
# XGBoost test MSE for defenders. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_def, y_pred_xgb_def)

4.881946248012564

### XGB MID

In [53]:
# XGBoost regressor for midfielders; no hyperparameters are passed, so the
# library defaults apply.
model_xgb_mid = xgb.XGBRegressor()

In [54]:
# Fit on the midfielder training split; the fitted estimator is echoed as cell output.
model_xgb_mid.fit(X=x_train_mid, y=y_train_mid)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [55]:
# Wrap the raw predictions in a Series that reuses y_test_mid's index, so predictions
# and targets stay aligned row by row.
y_pred_xgb_mid = pd.Series(
    data=model_xgb_mid.predict(x_test_mid),
    index=y_test_mid.index,
    name='predicted_total_points_next_gameweek',
)

In [56]:
# XGBoost test MSE for midfielders. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_mid, y_pred_xgb_mid)

4.638671892398152

### XGB FWD

In [57]:
# XGBoost regressor for forwards; no hyperparameters are passed, so the
# library defaults apply.
model_xgb_fwd = xgb.XGBRegressor()

In [58]:
# Fit on the forward training split; the fitted estimator is echoed as cell output.
model_xgb_fwd.fit(X=x_train_fwd, y=y_train_fwd)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [59]:
# Wrap the raw predictions in a Series that reuses y_test_fwd's index, so predictions
# and targets stay aligned row by row.
y_pred_xgb_fwd = pd.Series(
    data=model_xgb_fwd.predict(x_test_fwd),
    index=y_test_fwd.index,
    name='predicted_total_points_next_gameweek',
)

In [60]:
# XGBoost test MSE for forwards. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_fwd, y_pred_xgb_fwd)

5.72651565642211

## Separate models for field players and goalkeepers

In [61]:
# Load the goalkeeper and combined field-player datasets from
# <project root>/data/intermediate/separate/. The project root is taken to be the
# parent of the current working directory.
# Paths are built with os.path.join instead of hard-coded '\\' separators so the
# notebook also runs on Linux/macOS. The trailing '' keeps a trailing separator on
# `data_intermediate_path`, so any code that still concatenates onto it keeps working.
data_intermediate_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'intermediate', '')
data_separate_path = os.path.join(data_intermediate_path, 'separate')

data_gk = pd.read_csv(os.path.join(data_separate_path, 'gk.csv'))
data_field = pd.read_csv(os.path.join(data_separate_path, 'field.csv'))

In [62]:
# Columns passed as `rolling_columns` when preprocessing the combined field-player data.
rolling_columns_field = [
    'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
    'goals_conceded', 'goals_scored', 'ict_index', 'influence',
    'minutes', 'selected', 'player_team_score', 'opponent_team_score',
    'threat', 'total_points', 'transfers_in', 'transfers_out',
    'value', 'yellow_cards',
]

In [None]:
# Keyword arguments shared by both preprocessing calls; only the input frame and
# its rolling-column list differ.
_preprocess_options = dict(
    random_split=True,
    test_subset=test_subset_XL,
    rolling_features=True,
    rolling_times=times,
    opponent_team_stats=True,
)

# Each call is unpacked as:
#   (x_train, y_train), (x_test, y_test), (x_train_target, x_test_target), x_scaler
(x_train_gk, y_train_gk), (x_test_gk, y_test_gk), (x_train_target_gk, x_test_target_gk), x_scaler_gk = \
    preprocess_seasons_data(data_gk, rolling_columns=rolling_columns_gk, **_preprocess_options)

(x_train_field, y_train_field), (x_test_field, y_test_field), (x_train_target_field, x_test_target_field), x_scaler_field = \
    preprocess_seasons_data(data_field, rolling_columns=rolling_columns_field, **_preprocess_options)

### Dummy regressor

In [64]:
# Mean-strategy dummy regressors for goalkeepers and field players.
# `fit` returns the estimator itself, so construction and fitting are chained.
dummy_regr_gk = DummyRegressor(strategy="mean").fit(x_train_gk, y_train_gk)
dummy_regr_field = DummyRegressor(strategy="mean").fit(x_train_field, y_train_field)

# Echo the last fitted estimator so the cell still displays its repr.
dummy_regr_field

DummyRegressor()

In [65]:
# Dummy-baseline predictions on the goalkeeper and field-player test splits.
y_pred_gk = dummy_regr_gk.predict(X=x_test_gk)
y_pred_field = dummy_regr_field.predict(X=x_test_field)

In [66]:
# Dummy baseline test MSE for goalkeepers. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_gk, y_pred_gk)

4.898789656592843

In [67]:
# Dummy baseline test MSE for field players. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_field, y_pred_field)

6.620080025357256

### MLP GK

In [68]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and therefore the
# reported loss) is repeatable across runs. This also reseeds the global
# `random` / `numpy.random` state for any later cells.
tf.keras.utils.set_random_seed(42)

# Goalkeeper MLP: 128 -> 256 -> 64 ReLU hidden units and a single linear output.
model_mlp_gk = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(x_train_gk.shape[1],)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])


In [69]:
# Adam optimiser with mean-squared-error loss.
model_mlp_gk.compile(loss='mse', optimizer='adam')

In [70]:
# Train the goalkeeper MLP; validation_split=0.3 reserves 30% of the training data
# for validation. The returned object is kept in `history_gk`.
history_gk = model_mlp_gk.fit(
    x=x_train_gk,
    y=y_train_gk,
    epochs=15,
    batch_size=16,
    validation_split=0.3,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [71]:
# Loss (MSE, as compiled) of the goalkeeper MLP on the held-out test split.
score_gk = model_mlp_gk.evaluate(x=x_test_gk, y=y_test_gk, verbose=0)
print('Test loss:', score_gk)

Test loss: 2.7077624797821045


### MLP field

In [72]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation (and therefore the
# reported loss) is repeatable across runs. This also reseeds the global
# `random` / `numpy.random` state for any later cells.
tf.keras.utils.set_random_seed(42)

# Field-player MLP: 128 -> 256 -> 64 ReLU hidden units and a single linear output.
model_mlp_field = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=(x_train_field.shape[1],)),
    tf.keras.layers.Dense(units=256, activation='relu'),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='linear'),
])


In [73]:
# Adam optimiser with mean-squared-error loss.
model_mlp_field.compile(loss='mse', optimizer='adam')

In [74]:
# Train the field-player MLP; validation_split=0.3 reserves 30% of the training data
# for validation. The returned object is kept in `history_field`.
history_field = model_mlp_field.fit(
    x=x_train_field,
    y=y_train_field,
    epochs=15,
    batch_size=16,
    validation_split=0.3,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [75]:
# Loss (MSE, as compiled) of the field-player MLP on the held-out test split.
score_field = model_mlp_field.evaluate(x=x_test_field, y=y_test_field, verbose=0)
print('Test loss:', score_field)

Test loss: 4.823612689971924


### XGB GK

In [76]:
# XGBoost regressor for goalkeepers; no hyperparameters are passed, so the
# library defaults apply.
model_xgb_gk = xgb.XGBRegressor()

In [77]:
# Fit on the goalkeeper training split; the fitted estimator is echoed as cell output.
model_xgb_gk.fit(X=x_train_gk, y=y_train_gk)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [78]:
# Wrap the raw predictions in a Series that reuses y_test_gk's index, so predictions
# and targets stay aligned row by row.
y_pred_xgb_gk = pd.Series(
    data=model_xgb_gk.predict(x_test_gk),
    index=y_test_gk.index,
    name='predicted_total_points_next_gameweek',
)

In [79]:
# XGBoost test MSE for goalkeepers. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_gk, y_pred_xgb_gk)

3.1491762980915836

### XGB field

In [80]:
# XGBoost regressor for field players; no hyperparameters are passed, so the
# library defaults apply.
model_xgb_field = xgb.XGBRegressor()

In [81]:
# Fit on the field-player training split; the fitted estimator is echoed as cell output.
model_xgb_field.fit(X=x_train_field, y=y_train_field)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [82]:
# Wrap the raw predictions in a Series that reuses y_test_field's index, so predictions
# and targets stay aligned row by row.
y_pred_xgb_field = pd.Series(
    data=model_xgb_field.predict(x_test_field),
    index=y_test_field.index,
    name='predicted_total_points_next_gameweek',
)

In [83]:
# XGBoost test MSE for field players. sklearn's signature is (y_true, y_pred);
# MSE is symmetric, so fixing the argument order leaves the value unchanged.
mean_squared_error(y_test_field, y_pred_xgb_field)

4.995369267983375