In [137]:
import asyncio
import warnings

import pandas as pd

from src.data.data_loader import get_league_table, load_master_team_list
from src.features.data_engineering import get_merged_seasons_data, update_team_score_feature
from src.features.utils import idx_to_team_name, str_date_months_back

# Add features describing the performance of the player's team and of the next opponent over the last two months

In [138]:
# Example fetch of a league table; the arguments are presumably
# (season start year, window start date, window end date) — confirm against get_league_table.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running in the current thread, which is the case in many Jupyter kernels.
# Confirm this cell works on a fresh kernel; otherwise use top-level `await`.
df_table = asyncio.run(get_league_table(2019, '2019-10-01', '2019-12-31'))

In [139]:
df_table.head()

Unnamed: 0,Position,Team,M,W,D,L,G,GA,PTS,xG,NPxG,xGA,NPxGA,NPxGD,PPDA,OPPDA,DC,ODC,xPTS
0,1,Liverpool,12,11,1,0,29,9,34,25.99,23.7,9.77,9.77,13.93,8.45,18.3,135,52,25.77
1,2,Leicester,13,9,1,3,30,14,28,24.55,21.51,20.19,17.91,3.6,8.92,12.33,98,93,21.61
2,3,Manchester City,13,8,1,4,27,16,25,29.88,28.21,13.08,12.32,15.89,8.27,22.38,187,43,27.66
3,4,Chelsea,13,8,0,5,21,15,24,23.57,22.81,14.16,13.4,9.41,8.18,14.05,122,66,24.05
4,5,Wolverhampton Wanderers,13,6,5,2,20,14,23,18.99,18.23,14.14,13.23,5.0,11.66,10.07,68,80,21.34


In [141]:
# Load the merged multi-season player data and pass it through update_team_score_feature.
# What exactly that step changes is defined in src.features.data_engineering (not visible here).
data = update_team_score_feature(get_merged_seasons_data())

## For now, work with data from a single season only

In [142]:
# Keep only the rows belonging to the 2019-20 season.
data19 = data.loc[data['season'].eq('2019-20')]

### And start with only one player, to make it simpler to test

In [143]:
# Single-player slice used to prototype the feature logic.
data_kdb = data19.loc[data19['name'].eq('Kevin_De Bruyne_215')]

In [145]:
# League table for the two-month window ending 2019-08-10.
# Presumably that is the date of the player's gameweek-1 fixture, as the variable name suggests — confirm.
df_table_kdb_gw1 = asyncio.run(get_league_table(2019, '2019-06-10', '2019-08-10'))

In [146]:
df_table_kdb_gw1.head()

Unnamed: 0,Position,Team,M,W,D,L,G,GA,PTS,xG,NPxG,xGA,NPxGA,NPxGD,PPDA,OPPDA,DC,ODC,xPTS
0,1,Manchester City,1,1,0,0,5,0,3,3.18,2.42,1.2,1.2,1.22,6.94,16.4,9,1,2.63
1,2,Liverpool,1,1,0,0,4,1,3,2.23,2.23,0.84,0.84,1.39,5.94,21.45,11,5,2.39
2,3,Burnley,1,1,0,0,3,0,3,0.91,0.91,1.09,1.09,-0.18,9.28,5.5,0,9,1.14
3,4,Brighton,1,1,0,0,3,0,3,0.86,0.86,0.67,0.67,0.19,11.5,9.79,5,7,1.55
4,5,Tottenham,1,1,0,0,3,1,3,2.57,2.57,0.64,0.64,1.93,5.25,13.42,15,2,2.59


In [147]:
# League-table columns selected in the exploratory lookups below.
column_to_get = [
    'Position', 'PPDA', 'OPPDA',
    'G', 'GA',
    'xG', 'NPxG', 'xGA', 'NPxGA', 'NPxGD',
    'DC', 'ODC',
    'xPTS',
]

In [148]:
# Lookup table with one row per (season, team id) giving the team name;
# it is passed to idx_to_team_name to resolve team ids.
master_team_list = load_master_team_list()

In [149]:
master_team_list.head(5)

Unnamed: 0,season,team,team_name
0,2016-17,1,Arsenal
1,2016-17,2,Bournemouth
2,2016-17,3,Burnley
3,2016-17,4,Chelsea
4,2016-17,5,Crystal Palace


In [150]:
print(idx_to_team_name(master_team_list, 1, '2019-20'))

Arsenal


## Add the next opponent's team stats from the last two months to the player's rows

In [151]:
data_kdb.head(1)

Unnamed: 0,name,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,...,total_points,transfers_balance,transfers_in,transfers_out,value,was_home,yellow_cards,GW,position,season
68002,Kevin_De Bruyne_215,1,1,37,1,37.4,215,8,0,0,...,7,0,0,0,95,False,0,1,MID,2019-20


In [152]:
# Look up one team's stat columns in df_table_kdb_gw1 with a single .loc call.
# squeeze(axis=1) only collapses a single-column frame, so with this many
# columns the result is still a one-row DataFrame.
df_table_kdb_gw1.loc[df_table_kdb_gw1['Team'].eq('Arsenal'), column_to_get].squeeze(axis=1)

Unnamed: 0,Position,PPDA,OPPDA,G,GA,xG,NPxG,xGA,NPxGA,NPxGD,DC,ODC,xPTS
11,12,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0,0,0.0


In [153]:
# Work on an independent copy so the column assignment below does not write into a slice of data19.
data_kdb = data_kdb.copy()
# For each (season, player), the opponent of the chronologically next fixture;
# the last fixture of each group gets NaN. The result aligns back on the index.
data_kdb['opponent_next_gameweek'] = (
    data_kdb
    .sort_values('kickoff_time')
    .groupby(['season', 'element'])['opponent_team']
    .shift(-1)
)

In [154]:
# Rows without a following fixture have no next opponent (NaN): drop them,
# then store the opponent id as an integer again.
data_kdb = (
    data_kdb
    .dropna(subset=['opponent_next_gameweek'])
    .astype({'opponent_next_gameweek': int})
)

In [158]:
def add_team_stats(row):
    """Return the next opponent's league-table stats for one player row.

    Resolves ``row['opponent_next_gameweek']`` to a team name, fetches the
    league table for the two months up to the row's kickoff date, and returns
    that team's entry with the cumulative stats divided by matches played
    (``M``). ``Position``, ``PPDA`` and ``OPPDA`` are returned as they appear
    in the table.

    Parameters
    ----------
    row : pandas.Series
        Must provide ``'opponent_next_gameweek'`` (team id), ``'season'``
        (e.g. ``'2019-20'``) and ``'kickoff_time'`` (string whose date part
        precedes a ``'T'``).

    Returns
    -------
    pandas.DataFrame
        The matching table entry (normally exactly one row) restricted to the
        stat columns, each prefixed with ``'opponent_next_'``. If the opponent
        is not present in the table, a single all-NaN row is returned (and a
        warning is emitted) so that callers stacking one result per input row
        stay aligned.

    Notes
    -----
    Each call fetches a league table through ``asyncio.run``, so applying this
    row by row performs one fetch per row.
    """
    columns_to_get = ['Position', 'PPDA', 'OPPDA', 'G', 'GA', 'xG', 'NPxG', 'xGA', 'NPxGA', 'NPxGD', 'DC', 'ODC', 'xPTS']
    # These three columns are not divided by matches played; everything else is.
    not_per_match = ('Position', 'PPDA', 'OPPDA')
    columns_per_match = [col for col in columns_to_get if col not in not_per_match]

    opponent_team = idx_to_team_name(master_team_list, row['opponent_next_gameweek'], row['season'])

    # '2019-20' -> 2019; passed as int, like the other get_league_table calls in this notebook.
    season_year = int(row['season'].split('-')[0])
    date = row['kickoff_time'].split('T')[0]

    date_back = str_date_months_back(date, 2)
    table = asyncio.run(get_league_table(season_year, date_back, date))

    # .copy() so the normalisation below writes to an independent frame
    # instead of a slice of `table` (avoids SettingWithCopyWarning).
    table_opponent = table.loc[table['Team'] == opponent_team].copy()

    if table_opponent.empty:
        # Most likely a team-name mismatch between the master list and the table.
        warnings.warn(
            f"add_team_stats: team {opponent_team!r} not found in league table "
            f"for {date_back}..{date}; returning NaN stats",
            stacklevel=2,
        )
        return pd.DataFrame(float('nan'), index=[0], columns=columns_to_get).add_prefix('opponent_next_')

    # Convert window totals into per-match averages. A team with M == 0 yields NaN/inf here.
    cols_normalize = table_opponent.filter(items=columns_per_match).columns
    table_opponent[cols_normalize] = table_opponent[cols_normalize].divide(table_opponent['M'], axis=0)

    return table_opponent[columns_to_get].add_prefix('opponent_next_')

In [159]:
# Series holding, for each player row, the frame returned by add_team_stats.
data_stats = data_kdb.apply(add_team_stats, axis=1)

In [160]:
# Stack the per-row result frames into one frame with a fresh 0..n-1 index.
df_stacked = pd.concat(list(data_stats), ignore_index=True)

In [161]:
# Attach the opponent stats column-wise to the player rows. This relies on
# df_stacked having one row per data_kdb row, in the same order, so that it
# can take over data_kdb's index.
data_kdb_concat = pd.concat(
    [data_kdb, df_stacked.set_index(data_kdb.index)],
    axis=1,
)

In [162]:
data_kdb_concat

Unnamed: 0,name,assists,bonus,bps,clean_sheets,creativity,element,fixture,goals_conceded,goals_scored,...,opponent_G,opponent_GA,opponent_xG,opponent_NPxG,opponent_xGA,opponent_NPxGA,opponent_NPxGD,opponent_DC,opponent_ODC,opponent_xPTS
68002,Kevin_De Bruyne_215,1,1,37,1,37.4,215,8,0,0,...,3.0,1.0,2.57,2.57,0.64,0.64,1.93,15.0,2.0,2.59
68821,Kevin_De Bruyne_215,2,3,44,0,127.7,215,16,2,0,...,1.5,1.0,1.485,1.105,1.625,1.625,-0.52,6.5,9.0,1.19
69384,Kevin_De Bruyne_215,1,0,22,0,42.3,215,22,1,0,...,1.333333,1.0,1.42,1.42,1.253333,1.253333,0.166667,6.333333,5.666667,1.483333
69602,Kevin_De Bruyne_215,1,2,41,1,26.7,215,37,0,1,...,1.5,2.5,1.295,1.295,1.815,1.815,-0.5175,6.0,10.25,1.03
70408,Kevin_De Bruyne_215,0,0,6,0,36.8,215,46,0,0,...,0.5,2.0,1.31,1.31,1.5475,1.3575,-0.045,9.0,5.25,1.195
70716,Kevin_De Bruyne_215,2,3,66,1,109.4,215,57,0,1,...,0.833333,1.5,1.261667,1.261667,1.138333,1.138333,0.123333,7.166667,4.0,1.46
71516,Kevin_De Bruyne_215,2,1,30,0,33.6,215,65,1,0,...,1.285714,1.571429,1.265714,1.157143,1.36,1.251429,-0.094286,4.571429,6.571429,1.298571
72097,Kevin_De Bruyne_215,0,0,0,0,0.0,215,75,0,0,...,1.0,1.0,1.08,0.89,1.295,1.2,-0.31125,6.75,6.125,1.23625
72649,Kevin_De Bruyne_215,0,0,8,1,34.8,215,84,0,0,...,1.857143,1.142857,1.691429,1.561429,1.511429,1.402857,0.158571,6.0,10.142857,1.442857
72842,Kevin_De Bruyne_215,1,0,24,1,91.1,215,95,0,0,...,0.857143,2.857143,1.255714,1.147143,1.955714,1.738571,-0.591429,5.428571,7.428571,1.082857


In [None]:
# TODO:
# - take average of xG, G etc stats per game +++
# - create feature 'opponent_next_game' for which new columns will be created +++
# - consider adding the player's own club stats (requires encoding the club name for every player)