# FPL data engeeniering for single season

In [2]:
# %load_ext autoreload
# %autoreload 2

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from src.data.data_loader import load_merged_gw

Import data and prepare it for MLP model

## 2020-21 season data engineering for baseline model
Predict next game total_points based only on single previous game

In [2]:
data_2021 = load_merged_gw('2020-21')

In [3]:
# add 'total_points_next_gameweek' column where total_points_next_gameweek = total_points from next 'GW' for each 'element'
data_2021['total_points_next_gameweek'] = data_2021.groupby('element')['total_points'].shift(-1)

In [4]:
# preview 'name', 'GW', 'total_points', 'total_points_next_gameweek' columns of data with 'Adam Lallana' name sorted by 'GW'
data_2021[data_2021['name'] == 'Adam Lallana'].sort_values('GW').head(10)[['name', 'GW', 'total_points', 'total_points_next_gameweek']]

Unnamed: 0,name,GW,total_points,total_points_next_gameweek
8,Adam Lallana,1,1,1.0
434,Adam Lallana,2,1,2.0
973,Adam Lallana,3,2,1.0
1527,Adam Lallana,4,1,2.0
2089,Adam Lallana,5,2,3.0
2806,Adam Lallana,6,3,2.0
3396,Adam Lallana,7,2,3.0
3995,Adam Lallana,8,3,4.0
4595,Adam Lallana,9,4,1.0
5196,Adam Lallana,10,1,0.0


**NOTE:** Model won't have any information about for which player the prediction is made.

In [5]:
# create data_2021_processed dataframe without 'team', 'fixture', 'kickoff_time', 'opponent_team', 'round', 'team_h_score', 'team_a_score'
data_2021_processed = data_2021.drop(['team', 'fixture', 'kickoff_time', 'opponent_team', 'round', 'team_h_score', 'team_a_score'], axis=1)

In [6]:
# one-hot encode 'position' column
data_2021_processed = pd.get_dummies(data_2021_processed, columns=['position'])

In [7]:
# change 'was_home' column to binary
data_2021_processed['was_home'] = data_2021_processed['was_home'].map({True: 1, False: 0})

In [8]:
# drop rows with NaN values
data_2021_processed = data_2021_processed.dropna()

In [9]:
data_2021_processed.head(10)

Unnamed: 0,name,xP,assists,bonus,bps,clean_sheets,creativity,element,goals_conceded,goals_scored,...,transfers_out,value,was_home,yellow_cards,GW,total_points_next_gameweek,position_DEF,position_FWD,position_GK,position_MID
0,Aaron Connolly,0.5,0,0,-3,0,0.3,78,2,0,...,0,55,1,0,1,8.0,0,1,0,0
1,Aaron Cresswell,2.1,0,0,11,0,11.2,435,2,0,...,0,50,1,0,1,1.0,1,0,0,0
2,Aaron Mooy,0.0,0,0,0,0,0.0,60,0,0,...,0,50,1,0,1,0.0,0,0,0,1
3,Aaron Ramsdale,2.5,0,0,12,0,0.0,483,2,0,...,0,50,1,0,1,2.0,0,0,1,0
4,Abdoulaye DoucourÃ©,1.3,0,0,20,1,44.6,512,0,0,...,0,55,0,0,1,1.0,0,0,0,1
5,Aboubakar Kamara,0.4,0,0,-2,0,1.8,190,3,0,...,0,50,1,0,1,1.0,0,0,0,1
6,Adama TraorÃ©,2.2,0,0,6,1,1.8,465,0,0,...,0,65,0,0,1,2.0,0,0,0,1
7,Adam Forshaw,0.0,0,0,0,0,0.0,199,0,0,...,0,50,0,0,1,0.0,0,0,0,1
8,Adam Lallana,1.2,0,0,6,0,27.2,54,1,0,...,0,65,1,0,1,1.0,0,0,0,1
9,Adam Webster,0.6,0,0,14,0,11.8,66,3,0,...,0,45,1,0,1,6.0,1,0,0,0


In [10]:
# extract 'name', 'GW', 'element' and 'total_points_next_gameweek' from data_2021_processed
data_2021_extract_target = data_2021_processed[['name', 'GW', 'element', 'total_points_next_gameweek']]

In [11]:
X_2021 = data_2021_processed.drop(['name', 'GW', 'element', 'total_points_next_gameweek'], axis=1)
y_2021 = data_2021_processed['total_points_next_gameweek']

In [12]:
# scale X data
X_scaler = MinMaxScaler(feature_range=(0, 1))
X_2021_scaled = pd.DataFrame(X_scaler.fit_transform(X_2021), index=X_2021.index, columns = X_2021.columns)

In [13]:
# concatenate data_2021_extract_target and X_2021_scaled
X_2021_scaled = pd.concat([data_2021_extract_target, X_2021_scaled], axis=1)

In [14]:
# split data into train and test sets
X_train_2021, X_test_2021, y_train_2021, y_test_2021 = train_test_split(X_2021_scaled, y_2021, test_size=0.2, random_state=42)

In [15]:
X_train_2021

Unnamed: 0,name,GW,element,total_points_next_gameweek,xP,assists,bonus,bps,clean_sheets,creativity,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,position_DEF,position_FWD,position_GK,position_MID
19010,James RodrÃ­guez,31,508,6.0,0.409483,0.0,0.0,0.214953,1.0,0.104923,...,0.420871,0.013690,0.003569,0.433333,0.0,0.0,0.0,0.0,0.0,1.0
3950,Jonjo Shelvey,8,321,0.0,0.107759,0.0,0.0,0.130841,0.0,0.000000,...,0.413320,0.000023,0.001676,0.166667,0.0,0.0,0.0,0.0,0.0,1.0
19536,Matej Vydra,32,93,2.0,0.267241,0.0,0.0,0.158879,0.0,0.000000,...,0.419214,0.012464,0.005685,0.111111,0.0,0.0,0.0,1.0,0.0,0.0
8239,John Lundstram,15,355,0.0,0.107759,0.0,0.0,0.130841,0.0,0.000000,...,0.406885,0.000055,0.016730,0.133333,1.0,0.0,0.0,0.0,0.0,1.0
16267,Taylor Perry,26,611,0.0,0.064655,0.0,0.0,0.130841,0.0,0.000000,...,0.414020,0.000039,0.000066,0.077778,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,Jan Bednarek,21,376,-7.0,0.228448,0.0,0.0,0.261682,0.0,0.002421,...,0.403304,0.016893,0.049117,0.122222,1.0,0.0,1.0,0.0,0.0,0.0
21575,Rui Pedro dos Santos PatrÃ­cio,35,455,5.0,0.107759,0.0,0.0,0.233645,0.0,0.000000,...,0.407037,0.001244,0.018074,0.166667,1.0,0.0,0.0,0.0,1.0,0.0
5390,Alexis Mac Allister,10,80,0.0,0.107759,0.0,0.0,0.130841,0.0,0.000000,...,0.413934,0.000035,0.000260,0.166667,1.0,0.0,0.0,0.0,0.0,1.0
860,RaÃºl JimÃ©nez,2,460,0.0,0.366379,0.0,0.0,0.373832,0.0,0.015335,...,0.423170,0.078279,0.090398,0.522222,1.0,0.0,0.0,1.0,0.0,0.0


In [16]:
# extract target and drop it from X data
X_train_2021_extract_target = X_train_2021[['name', 'GW', 'element', 'total_points_next_gameweek']]
X_train_2021.drop(['name', 'GW', 'element', 'total_points_next_gameweek'], axis=1, inplace=True)

X_test_2021_extract_target = X_test_2021[['name', 'GW', 'element', 'total_points_next_gameweek']]
X_test_2021.drop(['name', 'GW', 'element', 'total_points_next_gameweek'], axis=1, inplace=True)

In [17]:
X_test_2021.head()

Unnamed: 0,xP,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,position_DEF,position_FWD,position_GK,position_MID
11387,0.237069,0.0,0.666667,0.429907,0.0,0.020178,0.222222,0.25,0.234756,0.289398,...,0.407318,0.001784,0.01819,0.266667,1.0,0.0,0.0,1.0,0.0,0.0
2055,0.323276,0.0,0.0,0.252336,0.0,0.004036,0.777778,0.0,0.04878,0.110315,...,0.407962,0.039257,0.070173,0.3,0.0,1.0,1.0,0.0,0.0,0.0
18887,0.107759,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.414023,0.0,1e-06,0.055556,0.0,0.0,0.0,0.0,0.0,1.0
3353,0.107759,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.413927,5e-06,0.000234,0.122222,1.0,0.0,1.0,0.0,0.0,0.0
11466,0.206897,0.0,0.0,0.17757,0.0,0.000807,0.222222,0.0,0.042683,0.083095,...,0.412423,0.001793,0.006294,0.166667,1.0,1.0,1.0,0.0,0.0,0.0


In [18]:
X_test_2021_extract_target.head()

Unnamed: 0,name,GW,element,total_points_next_gameweek
11387,Chris Wood,20,91,2.0
2055,Virgil van Dijk,4,250,1.0
18887,Mateusz Bogusz,31,533,0.0
3353,Japhet Tanganga,7,402,0.0
11466,James Tarkowski,20,81,1.0


In [19]:
y_test_2021.head()

11387    2.0
2055     1.0
18887    0.0
3353     0.0
11466    1.0
Name: total_points_next_gameweek, dtype: float64

## Reverse processing and merge with footballer data

In [23]:
X_train_reversed = pd.concat([X_train_2021_extract_target, pd.DataFrame(X_scaler.inverse_transform(X_train_2021), index=X_train_2021.index, columns=X_train_2021.columns)], axis=1)
X_test_reversed = pd.concat([X_test_2021_extract_target, pd.DataFrame(X_scaler.inverse_transform(X_test_2021), index=X_test_2021.index, columns=X_test_2021.columns)], axis=1)

In [26]:
# preview X_train_reversed data ordered by index
X_train_reversed.sort_index().head(10)

Unnamed: 0,name,GW,element,total_points_next_gameweek,xP,assists,bonus,bps,clean_sheets,creativity,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,position_DEF,position_FWD,position_GK,position_MID
0,Aaron Connolly,1,78,8.0,0.5,0.0,0.0,-3.0,0.0,0.3,...,0.0,0.0,0.0,55.0,1.0,0.0,0.0,1.0,0.0,0.0
1,Aaron Cresswell,1,435,1.0,2.1,0.0,0.0,11.0,0.0,11.2,...,0.0,0.0,0.0,50.0,1.0,0.0,1.0,0.0,0.0,0.0
2,Aaron Mooy,1,60,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,50.0,1.0,0.0,0.0,0.0,0.0,1.0
3,Aaron Ramsdale,1,483,2.0,2.5,0.0,0.0,12.0,0.0,0.0,...,0.0,0.0,0.0,50.0,1.0,0.0,0.0,0.0,1.0,0.0
4,Abdoulaye DoucourÃ©,1,512,1.0,1.3,0.0,0.0,20.0,1.0,44.6,...,0.0,0.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,1.0
5,Aboubakar Kamara,1,190,1.0,0.4,0.0,0.0,-2.0,0.0,1.8,...,0.0,0.0,0.0,50.0,1.0,0.0,0.0,0.0,0.0,1.0
6,Adama TraorÃ©,1,465,2.0,2.2,0.0,0.0,6.0,1.0,1.8,...,0.0,0.0,0.0,65.0,0.0,0.0,0.0,0.0,0.0,1.0
7,Adam Forshaw,1,199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Adam Lallana,1,54,1.0,1.2,0.0,0.0,6.0,0.0,27.2,...,0.0,0.0,0.0,65.0,1.0,0.0,0.0,0.0,0.0,1.0
9,Adam Webster,1,66,6.0,0.6,0.0,0.0,14.0,0.0,11.8,...,0.0,0.0,0.0,45.0,1.0,0.0,1.0,0.0,0.0,0.0


In [30]:
y_train_2021.sort_index().head(10)

0    8.0
1    1.0
2    0.0
3    2.0
4    1.0
5    1.0
6    2.0
7    0.0
8    1.0
9    6.0
Name: total_points_next_gameweek, dtype: float64