# FPL data engineering

In [2]:
# Development aid (currently disabled): uncomment both lines to have IPython
# reload imported modules before each cell runs, so edits to project modules
# are picked up without restarting the kernel.
# %load_ext autoreload
# %autoreload 2

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from src.data.data_loader import load_raw_data

Import the data and prepare it for the MLP model.

## 2020-21 season data engineering for baseline model
Predict the next game's `total_points` based only on the single previous game.

In [3]:
# Load the raw 2020-21 season data through the project loader. The rest of the
# notebook uses the result as a pandas DataFrame
# (presumably one row per player per fixture — verify against load_raw_data).
data_2021 = load_raw_data('2020-21')

In [4]:
# Add 'total_points_next_gameweek': for every player ('element'), the
# 'total_points' of that player's next row in gameweek order. The last row of
# each player has no successor and gets NaN.
#
# shift(-1) works on row order, so the rows are first ordered by 'GW'. A stable
# sort ('mergesort') keeps rows that share a gameweek in their original
# relative order. Without the sort the target is only correct if the raw file
# already happens to be ordered chronologically.
#
# The shifted Series keeps the original index labels, so the assignment
# re-aligns it with data_2021's current row order (assumes a unique index).
#
# NOTE(review): if a player has two rows in the same 'GW', the "next" row is
# the second row of that same gameweek — confirm this is intended.
data_2021['total_points_next_gameweek'] = (
    data_2021
    .sort_values('GW', kind='mergesort')
    .groupby('element')['total_points']
    .shift(-1)
)

In [5]:
# Sanity check of the new target column: show the first 10 gameweeks of
# 'Adam Lallana' (ordered by 'GW') with the current and next-gameweek points.
(
    data_2021
    .loc[
        data_2021['name'] == 'Adam Lallana',
        ['name', 'GW', 'total_points', 'total_points_next_gameweek'],
    ]
    .sort_values('GW')
    .head(10)
)

Unnamed: 0,name,GW,total_points,total_points_next_gameweek
8,Adam Lallana,1,1,1.0
434,Adam Lallana,2,1,2.0
973,Adam Lallana,3,2,1.0
1527,Adam Lallana,4,1,2.0
2089,Adam Lallana,5,2,3.0
2806,Adam Lallana,6,3,2.0
3396,Adam Lallana,7,2,3.0
3995,Adam Lallana,8,3,4.0
4595,Adam Lallana,9,4,1.0
5196,Adam Lallana,10,1,0.0


**NOTE:** The model won't have any information about which player the prediction is made for.

In [6]:
# Build data_2021_processed from data_2021 with the player-identifying and
# match-context columns removed; data_2021 itself is left untouched.
data_2021_processed = data_2021.drop(
    columns=[
        'name', 'team', 'fixture', 'kickoff_time',
        'opponent_team', 'round', 'team_h_score', 'team_a_score',
    ]
)

In [7]:
# Keep the gameweek, the player id and the target in a separate frame before
# 'GW' and 'element' are removed from the feature table.
data_2021_extract_target = data_2021_processed.loc[
    :, ['GW', 'element', 'total_points_next_gameweek']
]

In [8]:
# Remove the gameweek number and the player id so they are not used as features.
data_2021_processed = data_2021_processed.drop(columns=['GW', 'element'])

In [9]:
# Replace the categorical 'position' column with one indicator column per
# position value ('position_<value>').
data_2021_processed = data_2021_processed.pipe(pd.get_dummies, columns=['position'])

In [10]:
# Encode 'was_home' numerically: True -> 1, False -> 0
# (any other value becomes NaN, as with a plain .map()).
data_2021_processed = data_2021_processed.assign(
    was_home=data_2021_processed['was_home'].map({True: 1, False: 0})
)

In [11]:
# Discard every row that has a missing value in any column. This includes
# rows whose next-gameweek target is NaN because the player has no later row.
data_2021_processed = data_2021_processed.dropna(axis=0, how='any')

In [12]:
# Preview the first 10 rows of the processed table (features plus target).
data_2021_processed.head(10)

Unnamed: 0,xP,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,...,transfers_in,transfers_out,value,was_home,yellow_cards,total_points_next_gameweek,position_DEF,position_FWD,position_GK,position_MID
0,0.5,0,0,-3,0,0.3,2,0,3.4,1.2,...,0,0,55,1,0,8.0,0,1,0,0
1,2.1,0,0,11,0,11.2,2,0,2.2,10.4,...,0,0,50,1,0,1.0,1,0,0,0
2,0.0,0,0,0,0,0.0,0,0,0.0,0.0,...,0,0,50,1,0,0.0,0,0,0,1
3,2.5,0,0,12,0,0.0,2,0,1.8,18.2,...,0,0,50,1,0,2.0,0,0,1,0
4,1.3,0,0,20,1,44.6,0,0,6.9,20.4,...,0,0,55,0,0,1.0,0,0,0,1
5,0.4,0,0,-2,0,1.8,3,0,0.2,0.0,...,0,0,50,1,0,1.0,0,0,0,1
6,2.2,0,0,6,1,1.8,0,0,3.2,12.8,...,0,0,65,0,0,2.0,0,0,0,1
7,0.0,0,0,0,0,0.0,0,0,0.0,0.0,...,0,0,50,0,0,0.0,0,0,0,1
8,1.2,0,0,6,0,27.2,1,0,3.6,6.8,...,0,0,65,1,0,1.0,0,0,0,1
9,0.6,0,0,14,0,11.8,3,0,2.8,14.8,...,0,0,45,1,0,6.0,1,0,0,0


In [13]:
# Separate the feature matrix from the regression target.
X_2021 = data_2021_processed.drop(columns='total_points_next_gameweek')
y_2021 = data_2021_processed.loc[:, 'total_points_next_gameweek']

In [14]:
# Scale every feature column to the [0, 1] range.
#
# fit_transform returns a plain array, so the result is wrapped back into a
# DataFrame. Both the column names AND the original index are restored:
# without `index=` the scaled frame would get a fresh 0..n-1 index, while
# y_2021 keeps the labels left after dropna(), so X and y index labels could
# disagree and any later index-based join or lookup would pair wrong rows.
#
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so the test rows influence the scaling of the training features.
# Consider fitting on the training split only.
X_scaler = MinMaxScaler(feature_range=(0, 1))
X_2021_scaled = pd.DataFrame(
    X_scaler.fit_transform(X_2021),
    columns=X_2021.columns,
    index=X_2021.index,
)

In [15]:
# Hold out 20% of the rows as a test set; rows are shuffled with a fixed seed
# so the split is reproducible.
(
    X_train_2021,
    X_test_2021,
    y_train_2021,
    y_test_2021,
) = train_test_split(
    X_2021_scaled,
    y_2021,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [16]:
# Preview the first rows of the scaled training features.
X_train_2021.head()

Unnamed: 0,xP,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,position_DEF,position_FWD,position_GK,position_MID
19010,0.409483,0.0,0.0,0.214953,1.0,0.104923,0.0,0.0,0.134146,0.06447,...,0.420871,0.01369,0.003569,0.433333,0.0,0.0,0.0,0.0,0.0,1.0
3950,0.107759,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.41332,2.3e-05,0.001676,0.166667,0.0,0.0,0.0,0.0,0.0,1.0
19536,0.267241,0.0,0.0,0.158879,0.0,0.0,0.111111,0.0,0.003049,0.007163,...,0.419214,0.012464,0.005685,0.111111,0.0,0.0,0.0,1.0,0.0,0.0
8239,0.107759,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.406885,5.5e-05,0.01673,0.133333,1.0,0.0,0.0,0.0,0.0,1.0
16267,0.064655,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.41402,3.9e-05,6.6e-05,0.077778,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
# Preview the first rows of the scaled test features.
X_test_2021.head()

Unnamed: 0,xP,assists,bonus,bps,clean_sheets,creativity,goals_conceded,goals_scored,ict_index,influence,...,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,position_DEF,position_FWD,position_GK,position_MID
11387,0.237069,0.0,0.666667,0.429907,0.0,0.020178,0.222222,0.25,0.234756,0.289398,...,0.407318,0.001784,0.01819,0.266667,1.0,0.0,0.0,1.0,0.0,0.0
2055,0.323276,0.0,0.0,0.252336,0.0,0.004036,0.777778,0.0,0.04878,0.110315,...,0.407962,0.039257,0.070173,0.3,0.0,1.0,1.0,0.0,0.0,0.0
18887,0.107759,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.414023,0.0,1e-06,0.055556,0.0,0.0,0.0,0.0,0.0,1.0
3353,0.107759,0.0,0.0,0.130841,0.0,0.0,0.0,0.0,0.0,0.0,...,0.413927,5e-06,0.000234,0.122222,1.0,0.0,1.0,0.0,0.0,0.0
11466,0.206897,0.0,0.0,0.17757,0.0,0.000807,0.222222,0.0,0.042683,0.083095,...,0.412423,0.001793,0.006294,0.166667,1.0,1.0,1.0,0.0,0.0,0.0


In [18]:
# Preview the first training targets (next-gameweek total points).
y_train_2021.head()

19010    6.0
3950     0.0
19536    2.0
8239     0.0
16267    0.0
Name: total_points_next_gameweek, dtype: float64

In [19]:
# Preview the first test targets (next-gameweek total points).
y_test_2021.head()

11387    2.0
2055     1.0
18887    0.0
3353     0.0
11466    1.0
Name: total_points_next_gameweek, dtype: float64