In [2]:
import asyncio
import json
import time

import pandas as pd

from src.data.data_loader import (
    get_league_table,
    load_master_team_list,
    load_understat_team_stats,
)
from src.features.data_engineering import get_merged_seasons_data
from src.features.utils import idx_to_team_name, str_date_months_back, str_date_days_forward

# Scrape team stats for each row's next opponent and save them to JSON (one file per season).

In [3]:
# Load the merged multi-season frame and work on a copy of whatever the loader returned.
data = get_merged_seasons_data().copy()

In [4]:
# Within each (season, element) group ordered by kickoff_time, shift(-1) pulls the
# opponent of the following fixture onto the current row; the assignment aligns on index.
data['opponent_next_gameweek'] = (
    data
    .sort_values('kickoff_time')
    .groupby(['season', 'element'])['opponent_team']
    .shift(-1)
)

In [5]:
# The last fixture of every (season, element) group has no following opponent, so the
# shifted column is NaN there: drop those rows, then store the opponent id as an int.
data = (
    data
    .dropna(subset=['opponent_next_gameweek'])
    .astype({'opponent_next_gameweek': int})
)

In [6]:
# Team lookup handed to idx_to_team_name when resolving opponent ids to team names.
master_team_list = load_master_team_list()

In [7]:
def scrape_team_stats(row, master_team_list, table_dict):
    """Fetch the next opponent's league-table stats and cache them in ``table_dict``.

    The cache key is ``'<date>_<team name>'``, where ``date`` is the row's kickoff
    date passed through ``str_date_days_forward(..., 2)``. If the key is already
    cached nothing is fetched, so a caller can re-run the whole scrape after a
    failure and only the missing keys are requested again.

    Args:
        row: Mapping or Series providing ``opponent_next_gameweek``, ``season``
            (e.g. ``'2020-21'``) and ``kickoff_time`` (a string with the date
            before a ``'T'`` separator).
        master_team_list: Lookup passed through to ``idx_to_team_name``.
        table_dict: Dict mutated in place; receives ``key -> DataFrame.to_dict()``
            with every column prefixed ``opponent_next_``.

    Returns:
        ``None`` when the key was already cached, ``True`` after storing a new entry.
    """
    columns_to_get = ['Position', 'PPDA', 'OPPDA', 'G', 'GA', 'xG', 'NPxG', 'xGA', 'NPxGA', 'NPxGD', 'DC', 'ODC', 'xPTS']
    # Position, PPDA and OPPDA are stored as reported; the remaining columns are
    # divided by the table's 'M' column (presumably matches played - TODO confirm).
    per_match_columns = columns_to_get[3:]

    opponent_team = idx_to_team_name(master_team_list, row['opponent_next_gameweek'], row['season'])
    date = str_date_days_forward(row['kickoff_time'].split('T')[0], 2)
    key = date + '_' + opponent_team

    # Already scraped: skip the network round trip.
    if key in table_dict:
        return

    season_year = row['season'].split('-')[0]
    date_back = str_date_months_back(date, 2)

    table = asyncio.run(get_league_table(season_year, date_back, date))

    # Explicit copy: the column assignment below must write to an independent frame,
    # not to a slice of `table` (which triggers SettingWithCopyWarning on pandas < 3).
    table_opponent = table.loc[table['Team'] == opponent_team].copy()

    cols_normalize = table_opponent.filter(items=per_match_columns).columns
    table_opponent[cols_normalize] = table_opponent[cols_normalize].divide(table_opponent['M'], axis=0)

    value = table_opponent[columns_to_get].add_prefix('opponent_next_')

    table_dict[key] = value.to_dict()

    # Lightweight progress indicator: report every 50 cached entries.
    len_dict = len(table_dict)
    if len_dict % 50 == 0:
        print(len_dict)

    return True

In [8]:
# Distinct season labels, in order of first appearance in the data.
seasons = data['season'].drop_duplicates().tolist()

In [9]:
# Show which seasons are present in the data.
seasons

['2016-17', '2017-18', '2018-19', '2019-20', '2020-21', '2021-22']

In [11]:

# Per-season stats as returned by load_understat_team_stats.
# NOTE(review): team_stats is not read by the scraping loop below - confirm it is still needed.
team_stats = {}
for season in seasons:
    team_stats[season] = load_understat_team_stats(season)

# Scrape each season into its own dict and dump it to stats<season>.json.
for s in seasons:
    table_dict = {}
    result = None
    data_season = data[data['season'] == s].copy()
    print('start scraping season: ', s)

    while result is None:
        try:
            result = data_season.apply(lambda row: scrape_team_stats(row, master_team_list, table_dict), axis=1)
        except Exception as exc:
            # Only ordinary errors are retried, so Ctrl-C / kernel interrupt still stops
            # the loop. scrape_team_stats skips keys already in table_dict, so the retry
            # only re-requests what is still missing.
            print('error:', repr(exc))
            time.sleep(5)

    with open(f'stats{s}.json', 'w') as convert_file:
        json.dump(table_dict, convert_file)

start scraping season:  2016-17
50
100
150
200
error
250
300
error
350
400
450
500
550
600
650
700
start scraping season:  2017-18
50
100
150
200
error
250
300
350
400
450
500
550
600
error
650
700
start scraping season:  2018-19
50
100
150
200
250
300
350
400
450
500
550
600
650
700
start scraping season:  2019-20
50
100
150
200
250
300
350
400
450
500
550
600
650
700
start scraping season:  2020-21
50
100
150
200
250
300
350
400
450
500
550
600
650
700


In [17]:
from src.data.data_loader import load_understat_team_stats

def scrape_team_stats_season_loop(data, season, retry_delay=5):
    """Scrape opponent stats for one season and write them to ``stats<season>.json``.

    The cache dict is seeded with ``load_understat_team_stats(season)``, so keys that
    are already present are skipped by ``scrape_team_stats`` and only missing ones are
    fetched. Uses the notebook-level ``master_team_list``.

    Args:
        data: DataFrame with a ``season`` column plus the columns read by
            ``scrape_team_stats``.
        season: Season label to process, e.g. ``'2021-22'``.
        retry_delay: Seconds to wait before retrying after a failed pass.
    """
    table_dict = load_understat_team_stats(season)
    result = None
    data_season = data[data['season'] == season].copy()
    print('start scraping season: ', season)

    while result is None:
        try:
            result = data_season.apply(lambda row: scrape_team_stats(row, master_team_list, table_dict), axis=1)
        except Exception as exc:
            # Retry ordinary failures only; KeyboardInterrupt/SystemExit propagate so
            # the otherwise unbounded retry loop can still be stopped.
            print('error:', repr(exc))
            time.sleep(retry_delay)

    with open(f'stats{season}.json', 'w') as convert_file:
        json.dump(table_dict, convert_file)

In [16]:
# The function takes (data, season); the frame must be passed explicitly.
scrape_team_stats_season_loop(data, '2021-22')

<class 'dict'>


In [None]:
# df_opponent = pd.concat([r for r in opponent_data], ignore_index=True)
# data_processed = pd.concat([data, df_opponent.set_index(data.index)], axis=1)

In [13]:
# Load the saved 2020-21 season stats (stats2020-21.json) into table_dict.
with open('stats2020-21.json') as f:
    table_dict = json.load(f)

In [18]:
# Inspect one saved entry; the key used here has the form '<date>_<team name>'.
pd.DataFrame(table_dict['2020-09-14_West Ham'])

Unnamed: 0,opponent_next_Position,opponent_next_PPDA,opponent_next_OPPDA,opponent_next_G,opponent_next_GA,opponent_next_xG,opponent_next_NPxG,opponent_next_xGA,opponent_next_NPxGA,opponent_next_NPxGD,opponent_next_DC,opponent_next_ODC,opponent_next_xPTS
15,16,6.41,24.18,0.0,2.0,0.86,0.86,1.66,1.66,-0.8,5.0,10.0,0.7
