# Separate data into different positions and do feature selection in order to train different models

In [45]:
import os
from src.features.data_engineering import get_merged_seasons_data

In [46]:
# Load the merged seasons data via the project helper imported above.
# The cells below treat the result as a pandas DataFrame with a 'position' column.
data = get_merged_seasons_data()

## Separate data into different positions

In [47]:
# Slice the merged frame into one subset per position label using boolean masks.
data_gk = data.loc[data['position'].eq('GK')]
data_def = data.loc[data['position'].eq('DEF')]
data_mid = data.loc[data['position'].eq('MID')]
data_fwd = data.loc[data['position'].eq('FWD')]

In [48]:
# Report the row count of every per-position subset, one line each.
for position_label, position_frame in (
    ('GK len:', data_gk),
    ('DEF len:', data_def),
    ('MID len:', data_mid),
    ('FWD len:', data_fwd),
):
    print(position_label, len(position_frame))

GK len: 14504
DEF len: 45353
MID len: 55126
FWD len: 18615


## Feature selection for different positions

In [49]:
# List every column of the merged frame before choosing which ones to drop per position.
data.columns

Index(['name', 'assists', 'bonus', 'bps', 'clean_sheets', 'creativity',
       'element', 'fixture', 'goals_conceded', 'goals_scored', 'ict_index',
       'influence', 'kickoff_time', 'minutes', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'team_a_score', 'team_h_score', 'threat', 'total_points',
       'transfers_balance', 'transfers_in', 'transfers_out', 'value',
       'was_home', 'yellow_cards', 'GW', 'position', 'season'],
      dtype='object')

Keep in mind the so-called 'target features':
- (['name', 'GW', 'element', 'total_points_next_gameweek', 'season'])

Also keep in mind the features that are dropped during preprocessing:
- (['fixture', 'kickoff_time', 'opponent_team', 'round', 'transfers_balance', 'was_home'])

### Select features to drop

In [50]:
# Summary statistics of the goalkeeper rows, inspected before picking columns to drop.
# NOTE(review): the rendered table elides the middle columns ("..." between
# `ict_index` and `team_a_score`), so their statistics are not visible here;
# consider `data_gk.describe().T` to see every column.
data_gk.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,...,14499.0,14499.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0,14504.0
mean,0.000896,0.083149,5.53599,0.085494,0.081833,329.473318,192.27551,0.414782,6.9e-05,0.686025,...,1.234223,1.496862,0.007033,1.104592,498.4468,7382.392581,6883.946,44.969388,0.011514,20.256481
std,0.029926,0.423872,9.355694,0.279625,0.939281,187.487637,106.365156,0.938966,0.008303,1.308831,...,1.205719,1.306483,0.267694,2.286974,25261.36,22281.962485,20173.16,4.82075,0.106688,11.388042
min,0.0,0.0,-7.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-3.0,-1035061.0,0.0,0.0,39.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,165.0,102.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1089.0,50.0,141.0,40.0,0.0,11.0
50%,0.0,0.0,0.0,0.0,0.0,329.0,194.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-76.0,337.0,896.5,44.0,0.0,20.0
75%,0.0,0.0,11.0,0.0,0.0,495.0,281.0,0.0,0.0,1.0,...,2.0,2.0,0.0,1.0,75.0,4468.75,6557.5,48.0,0.0,29.0
max,1.0,3.0,54.0,1.0,30.0,708.0,380.0,9.0,1.0,10.3,...,9.0,9.0,18.0,17.0,419020.0,459020.0,1039664.0,63.0,1.0,47.0


In [51]:
# Columns removed from the goalkeeper subset.
# The visible attacking stats (assists, creativity, goals_scored, threat) have
# near-zero means in the goalkeeper summary above.
drop_features_gk = [
    'assists',
    'creativity',
    'goals_scored',
    'ict_index',
    'own_goals',
    'penalties_missed',
    'red_cards',
    'threat',
    'yellow_cards',
]

In [52]:
# Summary statistics of the defender rows, inspected before picking columns to drop.
# NOTE(review): the rendered table elides the middle columns ("..." between
# `ict_index` and `team_a_score`), so their statistics are not visible here;
# consider `data_def.describe().T` to see every column.
data_def.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,...,45328.0,45328.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0,45353.0
mean,0.025842,0.101118,7.01592,0.112209,3.417029,299.456464,190.515534,0.550636,0.016846,1.441501,...,1.236785,1.500441,2.954909,1.300928,646.2538,9168.141093,8521.231,46.786034,0.064384,20.053337
std,0.16798,0.459563,9.905467,0.315626,8.498592,177.287403,107.499723,1.028436,0.131407,2.269528,...,1.209492,1.305887,8.143801,2.512014,37002.06,31944.028488,27670.15,6.209284,0.245438,11.480167
min,0.0,0.0,-12.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-7.0,-1012731.0,0.0,0.0,37.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,146.0,98.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1451.0,55.0,181.0,43.0,0.0,10.0
50%,0.0,0.0,0.0,0.0,0.0,297.0,192.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-83.0,442.0,1216.0,45.0,0.0,20.0
75%,0.0,0.0,13.0,0.0,1.3,443.0,281.0,1.0,0.0,2.4,...,2.0,2.0,2.0,1.0,159.0,4163.0,6120.0,50.0,0.0,29.0
max,3.0,3.0,75.0,1.0,109.5,713.0,380.0,9.0,2.0,22.8,...,9.0,9.0,152.0,24.0,935082.0,941215.0,1025064.0,85.0,1.0,47.0


In [53]:
# Columns removed from the defender subset.
# NOTE(review): presumably goalkeeper-specific and rare-event columns; their
# statistics are elided from the rendered summary above, so confirm.
drop_features_def = [
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]

In [54]:
# Summary statistics of the midfielder rows, inspected before picking columns to drop.
# NOTE(review): the rendered table elides the middle columns ("..." between
# `ict_index` and `team_a_score`), so their statistics are not visible here;
# consider `data_mid.describe().T` to see every column.
data_mid.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,...,55104.0,55104.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0,55126.0
mean,0.056833,0.094148,5.767968,0.109894,7.213297,305.888728,191.580379,0.491202,0.054185,2.085564,...,1.24187,1.514845,6.663643,1.406451,908.9597,10618.98,9709.814,54.20854,0.060461,20.169884
std,0.253803,0.463828,9.253016,0.31276,13.457418,177.974122,107.540642,0.945799,0.25108,3.482679,...,1.215104,1.318324,14.457536,2.469952,54752.63,47198.98,39840.44,13.250668,0.238342,11.489364
min,0.0,0.0,-14.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-2.0,-1857821.0,0.0,0.0,40.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,155.0,100.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-896.0,28.0,71.0,45.0,0.0,10.0
50%,0.0,0.0,0.0,0.0,0.0,299.0,194.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-50.0,213.0,583.5,50.0,0.0,20.0
75%,0.0,0.0,9.0,0.0,10.9,453.0,282.0,1.0,0.0,3.1,...,2.0,2.0,6.0,2.0,10.0,2383.75,4388.75,58.0,0.0,29.0
max,4.0,3.0,93.0,1.0,170.9,714.0,380.0,9.0,4.0,35.8,...,9.0,9.0,181.0,29.0,1907229.0,1991731.0,1872898.0,136.0,1.0,47.0


In [55]:
# Columns removed from the midfielder subset (same set as used for defenders).
# NOTE(review): presumably goalkeeper-specific and rare-event columns; their
# statistics are elided from the rendered summary above, so confirm.
drop_features_mid = [
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]

In [56]:
# Summary statistics of the forward rows, inspected before picking columns to drop.
# NOTE(review): the rendered table elides the middle columns ("..." between
# `ict_index` and `team_a_score`), so their statistics are not visible here;
# consider `data_fwd.describe().T` to see every column.
data_fwd.describe()

Unnamed: 0,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,ict_index,...,team_a_score,team_h_score,threat,total_points,transfers_balance,transfers_in,transfers_out,value,yellow_cards,GW
count,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,...,18608.0,18608.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0,18615.0
mean,0.052216,0.163309,4.77502,0.088155,4.623717,318.231104,192.584851,0.433253,0.110771,2.237534,...,1.223828,1.504837,11.604781,1.466882,1755.41,20167.22,18410.57,60.730164,0.03986,20.259683
std,0.243454,0.620014,11.043327,0.283527,9.093162,184.143104,107.460852,0.889658,0.368351,3.97137,...,1.201224,1.2917,21.515047,2.713212,78295.93,66859.45,57470.27,17.124864,0.195636,11.463868
min,0.0,0.0,-19.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-2.0,-1734284.0,0.0,0.0,42.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,0.0,152.0,101.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-2258.5,85.0,242.0,49.0,0.0,11.0
50%,0.0,0.0,0.0,0.0,0.0,326.0,195.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,-100.0,821.0,1862.0,56.0,0.0,20.0
75%,0.0,0.0,4.0,0.0,3.9,476.0,283.0,1.0,0.0,3.0,...,2.0,2.0,17.0,2.0,111.5,8211.0,11839.5,65.0,0.0,29.0
max,4.0,3.0,114.0,1.0,92.0,711.0,380.0,9.0,4.0,31.1,...,9.0,9.0,199.0,24.0,1983733.0,2104464.0,1782662.0,132.0,1.0,47.0


In [57]:
# Columns removed from the forward subset.
# NOTE(review): presumably defensive, goalkeeper-specific and rare-event
# columns; the choice is not derivable from the summary above alone, so confirm.
drop_features_fwd = [
    'clean_sheets',
    'goals_conceded',
    'own_goals',
    'penalties_missed',
    'penalties_saved',
    'red_cards',
    'saves',
]

### Drop features and save data to intermediate folder

In [58]:
# Apply the per-position drop lists.
# Each frame is rebuilt from the unfiltered `data` instead of from its own
# previous value, so re-running this cell no longer raises KeyError on columns
# that an earlier run already removed. A misspelled column name still raises,
# because `drop` is left at its default errors='raise'.
data_gk = data[data['position'] == 'GK'].drop(columns=drop_features_gk)
data_def = data[data['position'] == 'DEF'].drop(columns=drop_features_def)
data_mid = data[data['position'] == 'MID'].drop(columns=drop_features_mid)
data_fwd = data[data['position'] == 'FWD'].drop(columns=drop_features_fwd)

In [60]:
# Intermediate data folder: <parent of the working directory>/data/intermediate/.
# Built with os.path.join so the separators are right on every OS (the original
# hard-coded '\\' only worked on Windows). The empty last component keeps the
# trailing separator the original string had, so any code that appends to
# `data_intermediate_path` keeps working.
data_intermediate_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'intermediate', '')

# Create the output folder if it is missing; to_csv does not create directories.
separate_path = os.path.join(data_intermediate_path, 'separate')
os.makedirs(separate_path, exist_ok=True)

# Write one CSV per position, without the DataFrame index.
data_gk.to_csv(os.path.join(separate_path, 'gk.csv'), index=False)
data_def.to_csv(os.path.join(separate_path, 'def.csv'), index=False)
data_mid.to_csv(os.path.join(separate_path, 'mid.csv'), index=False)
data_fwd.to_csv(os.path.join(separate_path, 'fwd.csv'), index=False)