# Split the data by position and select features in order to train position-specific models

In [2]:
import os
from src.features.data_engineering import get_merged_seasons_data

In [3]:
# Load the dataset from the project helper; the cells below split it on its 'position' column.
data = get_merged_seasons_data()

## Separate data into different positions

In [4]:
# One subset per position code, selected by masking on the 'position' column.
data_gk = data.loc[data['position'].eq('GK')]
data_def = data.loc[data['position'].eq('DEF')]
data_mid = data.loc[data['position'].eq('MID')]
data_fwd = data.loc[data['position'].eq('FWD')]

In [5]:
# Report how many rows ended up in each position subset.
print(f'GK len: {len(data_gk)}')
print(f'DEF len: {len(data_def)}')
print(f'MID len: {len(data_mid)}')
print(f'FWD len: {len(data_fwd)}')

GK len: 15267
DEF len: 47611
MID len: 57922
FWD len: 19508


## Feature selection for different positions

In [6]:
# Preview the column names available in the full (unsplit) dataset
data.columns

Index(['name', 'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
       'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW', 'position', 'season'],
      dtype='object')

Keep in mind the so-called 'target features':
- (['name', 'GW', 'element', 'total_points_next_gameweek', 'season'])

Also keep in mind the features dropped during preprocessing:
- (['fixture', 'kickoff_time', 'opponent_team', 'round', 'transfers_balance', 'was_home'])

### Select features to drop

In [7]:
# Summary statistics of the goalkeeper subset, used to pick the columns to drop in the next cell.
data_gk.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,...,15262.0,15262.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0,15267.0
mean,0.000852,0.082203,5.474553,0.084365,0.079708,332.074998,199.043034,0.411803,6.6e-05,0.675909,...,1.239811,1.502359,0.006681,1.091308,529.8106,7336.096679,6806.286,44.905875,0.011397,20.972228
std,0.029169,0.421938,9.313818,0.277943,0.926182,189.028235,108.228868,0.937661,0.008093,1.301005,...,1.21596,1.305039,0.260923,2.273843,25039.94,22115.549325,19904.92,4.848968,0.106151,11.541131
min,0.0,0.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-3.0,-1035061.0,0.0,0.0,39.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,166.0,107.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1057.0,49.0,139.0,40.0,0.0,11.0
50%,0.0,0.0,0.0,0.0,0.0,333.0,203.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-72.0,331.0,872.0,44.0,0.0,21.0
75%,0.0,0.0,11.0,0.0,0.0,506.0,293.0,0.0,0.0,0.9,...,2.0,2.0,0.0,1.0,78.5,4437.5,6439.0,48.0,0.0,30.0
max,1.0,3.0,54.0,1.0,30.0,708.0,380.0,9.0,1.0,10.3,...,9.0,9.0,18.0,17.0,419020.0,459020.0,1039664.0,63.0,1.0,47.0


In [8]:
# Columns removed from the goalkeeper subset in the "Drop features" cell further down.
# Several of them (e.g. assists, goals_scored, threat) are almost always zero for
# goalkeepers in the describe() output above.
drop_features_gk = [
    'assists',
    'creativity',
    'goals_scored',
    'ict_index',
    'own_goals',
    'penalties_missed',
    'red_cards',
    'threat',
    'yellow_cards',
]

In [9]:
# Summary statistics of the defender subset, used to pick the columns to drop in the next cell.
data_def.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,...,47586.0,47586.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0,47611.0
mean,0.02554,0.099998,6.959148,0.111088,3.384966,302.160866,197.046964,0.548382,0.01695,1.426072,...,1.241773,1.505506,2.933482,1.290878,667.1422,9150.101783,8482.334,46.704039,0.063746,20.743022
std,0.166821,0.456991,9.874363,0.314244,8.466178,179.302312,109.321642,1.028448,0.131663,2.26391,...,1.219413,1.304715,8.129947,2.504407,36992.04,31749.682901,27715.43,6.213925,0.244302,11.633362
min,0.0,0.0,-12.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-7.0,-1012731.0,0.0,0.0,37.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,148.0,103.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1400.0,54.0,171.0,43.0,0.0,11.0
50%,0.0,0.0,0.0,0.0,0.0,300.0,201.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-74.0,436.0,1185.0,45.0,0.0,21.0
75%,0.0,0.0,13.0,0.0,1.3,446.0,292.0,1.0,0.0,2.4,...,2.0,2.0,1.0,1.0,167.0,4179.0,6036.5,50.0,0.0,30.0
max,3.0,3.0,75.0,1.0,109.5,730.0,380.0,9.0,2.0,22.8,...,9.0,9.0,152.0,24.0,935082.0,941215.0,1025064.0,85.0,1.0,47.0


In [10]:
# Columns removed from the defender subset in the "Drop features" cell further down.
drop_features_def = [
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]

In [11]:
# Summary statistics of the midfielder subset, used to pick the columns to drop in the next cell.
data_mid.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,...,57900.0,57900.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0,57922.0
mean,0.056507,0.093885,5.719157,0.108784,7.119506,308.469528,198.174735,0.488243,0.05409,2.059801,...,1.247237,1.519326,6.587376,1.398104,934.17,10539.14,9604.768,54.150616,0.059632,20.866994
std,0.253225,0.463168,9.237676,0.311371,13.381443,180.338521,109.328979,0.944814,0.251359,3.466613,...,1.22481,1.317185,14.375928,2.470625,54354.54,46863.53,39509.57,13.225818,0.236806,11.638579
min,0.0,0.0,-14.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-2.0,-1857821.0,0.0,0.0,40.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,155.0,104.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-860.0,27.0,66.0,45.0,0.0,11.0
50%,0.0,0.0,0.0,0.0,0.0,301.0,202.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-46.0,205.0,560.0,50.0,0.0,21.0
75%,0.0,0.0,9.0,0.0,10.7,458.0,293.0,1.0,0.0,3.0,...,2.0,2.0,6.0,2.0,11.0,2345.0,4295.0,57.0,0.0,30.0
max,4.0,3.0,93.0,1.0,170.9,737.0,380.0,9.0,4.0,35.8,...,9.0,9.0,181.0,29.0,1907229.0,1991731.0,1872898.0,136.0,1.0,47.0


In [12]:
# Columns removed from the midfielder subset in the "Drop features" cell further down.
drop_features_mid = [
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]

In [13]:
# Summary statistics of the forward subset, used to pick the columns to drop in the next cell.
data_fwd.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,...,19501.0,19501.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0,19508.0
mean,0.051825,0.161882,4.733084,0.087144,4.573139,320.552286,198.745694,0.432489,0.109596,2.210801,...,1.229937,1.507666,11.464835,1.45366,1849.246,20205.92,18355.48,60.694484,0.039984,20.916752
std,0.242655,0.617526,11.029191,0.282053,9.063182,185.651113,109.087114,0.891628,0.36744,3.959847,...,1.212308,1.290997,21.433914,2.707575,77751.92,66384.75,56869.69,17.119851,0.195926,11.603276
min,0.0,0.0,-19.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-2.0,-1734284.0,0.0,0.0,42.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,153.0,105.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-2237.75,80.0,240.75,49.0,0.0,11.0
50%,0.0,0.0,0.0,0.0,0.0,327.0,203.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-95.0,837.5,1878.5,56.0,0.0,21.0
75%,0.0,0.0,3.0,0.0,3.8,480.0,294.0,1.0,0.0,3.0,...,2.0,2.0,16.0,2.0,128.0,8450.0,11975.75,65.0,0.0,30.0
max,4.0,3.0,128.0,1.0,92.0,735.0,380.0,9.0,4.0,31.5,...,9.0,9.0,199.0,24.0,1983733.0,2104464.0,1782662.0,132.0,1.0,47.0


In [14]:
# Columns removed from the forward subset in the "Drop features" cell further down.
drop_features_fwd = [
    'clean_sheets',
    'goals_conceded',
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]

### Drop features and save data to intermediate folder

In [15]:
# Remove the per-position column lists chosen above from each subset.
# NOTE: each name is rebound to the reduced frame, so re-running this cell without
# first re-creating the subsets raises KeyError (the columns are already gone).
data_gk = data_gk.drop(columns=drop_features_gk)
data_def = data_def.drop(columns=drop_features_def)
data_mid = data_mid.drop(columns=drop_features_mid)
data_fwd = data_fwd.drop(columns=drop_features_fwd)

In [16]:
# Directory for intermediate outputs: <parent of the current working directory>/data/intermediate/.
# Built with os.path.join so the separators are right on every OS; the previous
# hard-coded '\\' separators only produced a valid path on Windows.
# The final '' argument keeps a trailing separator, because the cells below append
# file names to this value by plain string concatenation.
data_intermediate_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'intermediate', '')

In [17]:
# Write each reduced position subset to <intermediate>/separate/<position>_full.csv,
# leaving the DataFrame index out of the file.
data_gk.to_csv(f'{data_intermediate_path}separate/gk_full.csv', index=False)
data_def.to_csv(f'{data_intermediate_path}separate/def_full.csv', index=False)
data_mid.to_csv(f'{data_intermediate_path}separate/mid_full.csv', index=False)
data_fwd.to_csv(f'{data_intermediate_path}separate/fwd_full.csv', index=False)

## Separate data for GK and Field players

In [18]:
# Field players are every row whose position is not 'GK'.
# A boolean mask is used instead of data.drop(data_gk.index): dropping by index label
# would also remove non-GK rows whenever the index contains duplicate labels, and it
# silently depends on data_gk still carrying the original index.
data_field_players = data[data['position'] != 'GK']
print('GK len:', len(data_gk))
print('Field players:', len(data_field_players))

GK len: 15267
Field players: 125041


In [19]:
# Columns removed from the combined field-player frame.
drop_features_field = [
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]
# NOTE: the name is rebound to the reduced frame, so re-running this cell on its own
# raises KeyError because the columns have already been dropped.
data_field_players = data_field_players.drop(columns=drop_features_field)

In [20]:
# Write the reduced field-player frame to <intermediate>/separate/field_full.csv without the index column.
data_field_players.to_csv(data_intermediate_path + 'separate/field_full.csv', index=False)