# Merge different seasons data into one dataframe
The datasets for different seasons do not all contain the same features, so some feature engineering is needed before they can be merged.

In [141]:
# %load_ext autoreload
# %autoreload 2

In [143]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from src.data.data_loader import load_merged_gw_data
from src.data.data_loader import load_players_raw_data

In [144]:
# Season labels in chronological order, built from each season's starting year
# (2016 -> '2016-17', ..., 2021 -> '2021-22').
seasons = [f'{start_year}-{(start_year + 1) % 100:02d}' for start_year in range(2016, 2022)]

In [145]:
# Load the merged gameweek data of every season, keyed by season label.
data = {season: load_merged_gw_data(season) for season in seasons}

In [146]:
# Report the (rows, columns) shape of each season's frame.
for season, season_frame in data.items():
    print(f'Season {season} data shape: ', season_frame.shape)

Season 2016-17 data shape:  (23679, 56)
Season 2017-18 data shape:  (22467, 56)
Season 2018-19 data shape:  (21790, 56)
Season 2019-20 data shape:  (22560, 33)
Season 2020-21 data shape:  (24365, 36)
Season 2021-22 data shape:  (18737, 36)


In [147]:
# Check, for each pair of consecutive seasons, whether their column indexes are equal.
season_names = list(data)
for season, next_season in zip(season_names, season_names[1:]):
    same_columns = data[season].columns.equals(data[next_season].columns)
    print(f'Season {season} and {next_season} features are equal: ', same_columns)

Season 2016-17 and 2017-18 features are equal:  True
Season 2017-18 and 2018-19 features are equal:  True
Season 2018-19 and 2019-20 features are equal:  False
Season 2019-20 and 2020-21 features are equal:  False
Season 2020-21 and 2021-22 features are equal:  True


#### The dataset features changed in the 2019-20 and 2020-21 seasons.
I'll normalize every dataset to have the same features as the 2019-20 season dataset, and then I'll add a position column.

## Make every season dataset use same features

In [148]:
# Get the features common to every season.
# The intersection runs over all loaded seasons rather than a hand-picked subset, so the
# column selection that follows stays valid for each frame even if a season with a
# different schema is added. Column order follows the first season's frame.
season_columns = [season_frame.columns for season_frame in data.values()]
common_features = season_columns[0]
for columns in season_columns[1:]:
    common_features = common_features.intersection(columns)

In [149]:
# Restrict each season's frame to the columns listed in common_features;
# every other column is dropped.
data.update({season: season_frame[common_features] for season, season_frame in data.items()})

In [150]:
# Report each season's shape again, now that only the shared columns remain.
for season, season_frame in data.items():
    shape_after_selection = season_frame.shape
    print(f'Season {season} data shape after selecting only common features: ', shape_after_selection)

Season 2016-17 data shape after selecting only common features:  (23679, 33)
Season 2017-18 data shape after selecting only common features:  (22467, 33)
Season 2018-19 data shape after selecting only common features:  (21790, 33)
Season 2019-20 data shape after selecting only common features:  (22560, 33)
Season 2020-21 data shape after selecting only common features:  (24365, 33)
Season 2021-22 data shape after selecting only common features:  (18737, 33)


#### Every season dataset now has the same features.

In [151]:
# Show the labels of the columns kept in every season's frame.
print(common_features)

Index(['name', 'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
       'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW'],
      dtype='object')


## Add position column to every season dataset

In [152]:
# Read the 'players_raw' table of every season, keyed by season label.
players_raw = {season: load_players_raw_data(season) for season in seasons}

In [153]:
# Keep only the 'id' and 'element_type' columns of each season's player table,
# renamed to 'element' and 'position' respectively.
player_position = {
    season: raw_players[['id', 'element_type']].rename(
        columns={'id': 'element', 'element_type': 'position'}
    )
    for season, raw_players in players_raw.items()
}

In [154]:
# Change values from element type to position label: 1: GK, 2: DEF, 3: MID, 4: FWD.
# The replaced column is assigned back to the frame instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place form warns on
# recent pandas versions and leaves the parent frame unchanged under Copy-on-Write.
# Values other than 1-4 are left as they are, so re-running this cell is harmless.
for season, positions in player_position.items():
    positions['position'] = positions['position'].replace(
        {1: 'GK', 2: 'DEF', 3: 'MID', 4: 'FWD'}
    )

In [155]:
# Preview the first rows of the 2018-19 element -> position lookup table.
player_position['2018-19'].head()

Unnamed: 0,element,position
0,1,GK
1,2,GK
2,3,DEF
3,4,DEF
4,5,DEF


In [156]:
# Add the position column to every season dataset by looking up each row's 'element'
# in that season's player_position table.
for season in player_position:
    data[season] = pd.merge(
        # Drop a 'position' column left behind by an earlier run of this cell; otherwise a
        # re-run would produce 'position_x' / 'position_y' instead of a single 'position'.
        data[season].drop(columns='position', errors='ignore'),
        player_position[season],
        on='element',
        how='left',  # keep every gameweek row, even when its element has no match
        # Fail loudly if an element appears more than once in the lookup table, rather than
        # silently duplicating gameweek rows.
        validate='many_to_one',
    )

In [157]:
# Preview the first rows of the 2018-19 frame to check the merged 'position' column.
data['2018-19'].head()

Unnamed: 0,name,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,...,threat,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,position
0,Aaron_Cresswell_402,0,0,0,0,0.0,402,5,0,0,...,0.0,0,0,0,0,55,False,0,1,DEF
1,Aaron_Lennon_83,0,0,6,1,12.3,83,8,0,0,...,17.0,3,0,0,0,50,False,0,1,MID
2,Aaron_Mooy_199,0,0,24,0,18.2,199,4,3,0,...,0.0,2,0,0,0,55,True,0,1,MID
3,Aaron_Ramsey_14,0,0,7,0,10.8,14,1,1,0,...,9.0,1,0,0,0,75,True,0,1,MID
4,Aaron_Wan-Bissaka_145,1,3,38,1,14.0,145,3,0,0,...,0.0,12,0,0,0,40,False,0,1,DEF


In [161]:
# Stack the per-season frames, in the dict's insertion order, into one dataframe;
# ignore_index=True replaces the per-season row labels with a fresh 0..n-1 index.
# NOTE(review): no column records which season a row came from, and 'element' is looked up
# per season above, so it is presumably not unique across seasons. Confirm that downstream
# code does not need a season label.
data_merged = pd.concat([data[season] for season in data], ignore_index=True)