In [2]:
import pandas as pd
import numpy as np
from statsbombpy import sb
import matplotlib.pyplot as plt
import warnings


In [3]:
warnings.filterwarnings('ignore')

In [4]:
# Fetch the StatsBomb competition listing; its competition_id / season_id
# columns hold the kind of ids that are passed to sb.matches further down.
free_comps = sb.competitions()

In [5]:
# Display the column names available in the competition listing.
free_comps.columns

Index(['competition_id', 'season_id', 'country_name', 'competition_name',
       'competition_gender', 'competition_youth', 'competition_international',
       'season_name', 'match_updated', 'match_updated_360',
       'match_available_360', 'match_available'],
      dtype='object')

In [6]:
# Match lists for competition 37, one frame per season id (4, 42, 90),
# fetched in that order and unpacked into the per-season names.
wsl_2018, wsl_2019, wsl_2020 = (
    sb.matches(competition_id=37, season_id=season_id)
    for season_id in (4, 42, 90)
)

In [7]:
# Preview the first rows of the match list fetched with season_id=4.
wsl_2018.head()

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_status,...,last_updated_360,match_week,competition_stage,stadium,referee,home_managers,away_managers,data_version,shot_fidelity_version,xy_fidelity_version
0,19770,2018-12-02,13:30:00.000,England - FA Women's Super League,2018/2019,Manchester City WFC,Arsenal WFC,2,0,available,...,2021-06-13T16:17:31.694,11,Regular Season,Academy Stadium,Abigail Marriott,Nick Cushing,Joseph Montemurro,1.0.3,,
1,19820,2019-05-11,13:30:00.000,England - FA Women's Super League,2018/2019,Reading WFC,Chelsea FCW,2,3,available,...,2021-06-13T16:17:31.694,22,Regular Season,Adams Park,Abigail Byrne,Kelly Chambers,Emma Hayes,1.1.0,2.0,
2,19772,2018-12-02,15:00:00.000,England - FA Women's Super League,2018/2019,Chelsea FCW,Reading WFC,1,0,available,...,2021-06-13T16:17:31.694,11,Regular Season,Kingsmeadow,Lucy Oliver,Emma Hayes,Kelly Chambers,1.0.3,,
3,19769,2018-12-02,15:00:00.000,England - FA Women's Super League,2018/2019,Brighton & Hove Albion WFC,West Ham United LFC,0,1,available,...,2021-06-13T16:17:31.694,11,Regular Season,Broadfield Stadium,Kirsty Dowle,Hope Patricia Powell,Matt Beard,1.0.3,,
4,19746,2018-10-28,15:00:00.000,England - FA Women's Super League,2018/2019,Everton LFC,West Ham United LFC,1,2,available,...,2021-06-13T16:17:31.694,7,Regular Season,Haig Avenue,Aaron Jackson,Andy Spence,Matt Beard,1.0.3,,


In [8]:
def get_player_data(competition_id=37, season_ids=(4, 42, 90)):
    """Download and combine StatsBomb event data for every match in the given seasons.

    For each season id the match list is fetched with ``sb.matches`` and the
    events of every match with ``sb.events``. Each per-match frame is tagged
    with the ``match_id`` and ``season_id`` it came from, and all frames are
    concatenated into one DataFrame.

    Parameters
    ----------
    competition_id : int, default 37
        StatsBomb competition id passed to ``sb.matches`` (37 is the id this
        notebook uses for the FA Women's Super League).
    season_ids : iterable of int, default (4, 42, 90)
        StatsBomb season ids to load, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        Events of every match that loaded successfully, with ``match_id`` and
        ``season_id`` columns added and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no match could be loaded at all (empty ``season_ids``, empty match
        lists, or every ``sb.events`` call failed).
    """
    all_events = []

    for season in season_ids:
        matches = sb.matches(competition_id=competition_id, season_id=season)

        for match_id in matches['match_id']:
            try:
                events = sb.events(match_id=match_id)
                events['match_id'] = match_id
                events['season_id'] = season
                all_events.append(events)
            except Exception as e:
                # Best-effort download: report the failing match and carry on
                # so one bad match does not abort the whole pull.
                print(f"Failed match {match_id}: {e}")

    if not all_events:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # fail with a message that names the inputs instead.
        raise ValueError(
            f"No events could be loaded for competition_id={competition_id}, "
            f"season_ids={list(season_ids)}"
        )

    print("Combine all events")
    all_events_df = pd.concat(all_events, ignore_index=True)
    print(f"Total events: {len(all_events_df):,}")
    print(f"Unique players: {all_events_df['player_id'].nunique()}")

    return all_events_df
        
    

In [9]:
# Pull the events for every match in the configured seasons; this issues one
# sb.events request per match, so expect this cell to take a while.
events_df = get_player_data()

Combine all events
Total events: 1,095,921
Unique players: 408


In [10]:
# Save the combined events in Parquet format, given the size of the data
# (about 1.1M rows).
# NOTE(review): absolute, machine-specific path - consider a configurable
# data directory so the notebook runs on other machines.

events_df.to_parquet('/Users/amitmishra/wsl_data/data/fawsl_events.parquet')

In [12]:
# Sanity-check the combined frame. The f-strings are double-quoted so the
# single-quoted column keys inside the braces also parse on Python < 3.12
# (reusing the outer quote inside a replacement field needs PEP 701).
print(f"Total events: {len(events_df)}")
print(f"Unique players: {events_df['player_id'].nunique()}")
print(f"Seasons included: {events_df['season_id'].nunique()}")

Total events: 1095921
Unique players: 408
Seasons included: 3


In [13]:
# Split the combined events into one frame per season id.
# The seasons loaded are 4, 42 and 90; filtering on 92 (as before) matched no
# rows and left df_20 empty.
df_18 = events_df[events_df['season_id'] == 4]
df_19 = events_df[events_df['season_id'] == 42]
df_20 = events_df[events_df['season_id'] == 90]

In [14]:
# Write each per-season frame to its own Parquet file.
# NOTE(review): the targets are absolute, machine-specific paths.

season_outputs = (
    (df_18, '/Users/amitmishra/wsl_data/data/df_18.parquet'),
    (df_19, '/Users/amitmishra/wsl_data/data/df_19.parquet'),
    (df_20, '/Users/amitmishra/wsl_data/data/df_20.parquet'),
)
for season_frame, out_path in season_outputs:
    season_frame.to_parquet(out_path)