In [1]:
import os
import ast
import numpy as np 
import pandas as pd

In [2]:
# import the StatsBomb public match event data loader from the socceraction library
from socceraction.data.statsbomb import StatsBombLoader 

# silence the "credentials were not supplied" warning: no credentials are passed
# to the loader below, so only open data is accessed
import warnings
warnings.filterwarnings("ignore", message="credentials were not supplied. open data access only")


# create the StatsBomb loader used by all following cells (no credentials supplied)
stbm_data = StatsBombLoader()

In [3]:
# fetch every competition/season the loader exposes and keep only the women's ones
competitions = stbm_data.competitions()
is_female = competitions['competition_gender'].eq('female')
female_comps = competitions[is_female].reset_index(drop = True)
female_comps

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
0,90,37,FA Women's Super League,England,female,2020/2021
1,42,37,FA Women's Super League,England,female,2019/2020
2,4,37,FA Women's Super League,England,female,2018/2019
3,3,49,NWSL,United States of America,female,2018
4,106,53,UEFA Women's Euro,Europe,female,2022
5,30,72,Women's World Cup,International,female,2019


In [5]:
# names of folders to save files
dir_names = ['FAWSL_2021', 'FAWSL_1920', 'FAWSL_1819', 'NWSL', 'EURO_2022', 'WC_2019']
# (competition_id, season_id) belonging to each folder above, in the same order.
# Matching on ids instead of on the row position in female_comps keeps every
# folder tied to the right competition even if the competition listing gains
# rows or is returned in a different order.
comp_ids = [(37, 90), (37, 42), (37, 4), (49, 3), (53, 106), (72, 30)]
comp_dirs = dict(zip(comp_ids, dir_names))

# for each competition save all games as .csv files
for comp in female_comps.itertuples(index = False):
    competition_id, season_id = int(comp.competition_id), int(comp.season_id)
    dir_name = comp_dirs.get((competition_id, season_id))
    if dir_name is None:
        # competition/season without a configured folder: report it and move on
        print(f'skipping competition_id={competition_id}, season_id={season_id}: no folder configured')
        continue

    # to_csv does not create missing folders, so make sure the target exists
    games_dir = f'data/{dir_name}/games'
    os.makedirs(games_dir, exist_ok = True)

    game_ids = stbm_data.games(competition_id, season_id).loc[:, 'game_id']
    for game_id in game_ids:
        events = stbm_data.events(game_id)
        events.to_csv(f'{games_dir}/{game_id}.csv', index = False)

In [6]:
# read every saved game file and concatenate all events into a single data frame
frames = []
for dir_name in dir_names:
    games_dir = f'data/{dir_name}/games'
    # os.listdir returns entries in arbitrary order: sort them so the row order
    # of all_events is reproducible, and skip anything that is not a game .csv
    for file_name in sorted(os.listdir(games_dir)):
        if file_name.endswith('.csv'):
            frames.append(pd.read_csv(f'{games_dir}/{file_name}'))

# one concat at the end instead of one per file (which re-copies all rows read
# so far on every iteration); ignore_index gives a fresh 0..n-1 index
all_events = pd.concat(frames, ignore_index = True) if frames else pd.DataFrame()

# save as .csv file
all_events.to_csv('data/all_events.csv', index = False)

In [9]:
# preview the first rows of the combined events table
all_events.head()

Unnamed: 0,game_id,event_id,period_id,team_id,player_id,type_id,type_name,index,timestamp,minute,...,team_name,duration,extra,related_events,player_name,position_id,position_name,location,under_pressure,counterpress
0,3764230,3f5dde74-d91b-44ea-9a1f-88e84da555ab,1,749,,35,Starting XI,1,1900-01-01 00:00:00.000,0,...,Tottenham Hotspur Women,0.0,"{'tactics': {'formation': 4231, 'lineup': [{'p...",[],,,,,False,False
1,3764230,e4fefe61-4e08-47e0-be4d-2276388e6eb4,1,972,,35,Starting XI,2,1900-01-01 00:00:00.000,0,...,West Ham United LFC,0.0,"{'tactics': {'formation': 433, 'lineup': [{'pl...",[],,,,,False,False
2,3764230,ff9a99d3-3efd-45c2-8736-a8a93dd02638,1,972,,18,Half Start,3,1900-01-01 00:00:00.000,0,...,West Ham United LFC,0.0,{},['5fb7026c-83aa-4490-96b1-a55825c4dcb8'],,,,,False,False
3,3764230,5fb7026c-83aa-4490-96b1-a55825c4dcb8,1,749,,18,Half Start,4,1900-01-01 00:00:00.000,0,...,Tottenham Hotspur Women,0.0,{},['ff9a99d3-3efd-45c2-8736-a8a93dd02638'],,,,,False,False
4,3764230,8a29c8bc-df2e-4467-8ca3-2ea98574581c,1,749,31550.0,30,Pass,5,1900-01-01 00:00:00.218,0,...,Tottenham Hotspur Women,1.167717,"{'pass': {'recipient': {'id': 15569, 'name': '...",['4fdedcf7-587a-48cb-a46e-bc4804317114'],Angela Addison,21.0,Left Wing,"[61.0, 40.1]",False,False


In [10]:
# list all available columns in order to pick the features required for the xG model
all_events.columns

Index(['game_id', 'event_id', 'period_id', 'team_id', 'player_id', 'type_id',
       'type_name', 'index', 'timestamp', 'minute', 'second', 'possession',
       'possession_team_id', 'possession_team_name', 'play_pattern_id',
       'play_pattern_name', 'team_name', 'duration', 'extra', 'related_events',
       'player_name', 'position_id', 'position_name', 'location',
       'under_pressure', 'counterpress'],
      dtype='object')

In [38]:
# keep only the rows whose type_name is 'Shot', restricted to the columns used later on
shot_columns = ['type_name', 'play_pattern_name', 'position_name',
                'location', 'under_pressure', 'extra']
is_shot = all_events['type_name'] == 'Shot'
shots = all_events.loc[is_shot, shot_columns].reset_index(drop = True)

shots.head()

Unnamed: 0,type_name,play_pattern_name,position_name,location,under_pressure,extra
0,Shot,Regular Play,Right Wing,"[95.9, 58.9]",False,"{'shot': {'statsbomb_xg': 0.013642391, 'end_lo..."
1,Shot,From Free Kick,Center Forward,"[106.1, 54.3]",False,"{'shot': {'statsbomb_xg': 0.04084396, 'end_loc..."
2,Shot,From Free Kick,Left Wing,"[110.0, 28.2]",True,"{'shot': {'statsbomb_xg': 0.13687119, 'end_loc..."
3,Shot,From Throw In,Center Attacking Midfield,"[113.2, 40.4]",False,"{'shot': {'statsbomb_xg': 0.12462413, 'end_loc..."
4,Shot,From Counter,Center Attacking Midfield,"[95.2, 39.8]",False,"{'shot': {'statsbomb_xg': 0.02380701, 'end_loc..."


In [40]:
# parse the stringified location into a Python list, expose its first two
# entries as x_start / y_start and drop the original column
start_xy = shots['location'].map(ast.literal_eval)
shots = (
    shots
    .assign(x_start = start_xy.map(lambda xy: xy[0]),
            y_start = start_xy.map(lambda xy: xy[1]))
    .drop(columns = 'location')
)

In [41]:
# check the result: 'location' is gone, x_start / y_start were appended
shots.head()

Unnamed: 0,type_name,play_pattern_name,position_name,under_pressure,extra,x_start,y_start
0,Shot,Regular Play,Right Wing,False,"{'shot': {'statsbomb_xg': 0.013642391, 'end_lo...",95.9,58.9
1,Shot,From Free Kick,Center Forward,False,"{'shot': {'statsbomb_xg': 0.04084396, 'end_loc...",106.1,54.3
2,Shot,From Free Kick,Left Wing,True,"{'shot': {'statsbomb_xg': 0.13687119, 'end_loc...",110.0,28.2
3,Shot,From Throw In,Center Attacking Midfield,False,"{'shot': {'statsbomb_xg': 0.12462413, 'end_loc...",113.2,40.4
4,Shot,From Counter,Center Attacking Midfield,False,"{'shot': {'statsbomb_xg': 0.02380701, 'end_loc...",95.2,39.8


In [42]:
# convert 'extra' column to dict readable format using ast.literal_eval
shots_extra = shots.loc[:, 'extra'].apply(ast.literal_eval).reset_index(drop = True)

# specify which features to extract from 'extra' column
keys = ['follows_dribble', 'first_time', 'open_goal', 'statsbomb_xg', 'type', 'technique', 'body_part', 'outcome']
# features that are nested dicts, from which only the 'name' entry is kept
named_keys = ['type', 'technique', 'body_part', 'outcome']
# boolean flags that are simply absent from the payload when they are False
flag_keys = ['follows_dribble', 'first_time', 'open_goal']

# Build one plain dict per shot and create the data frame in a single step.
# (Series.iteritems() no longer exists in pandas 2.x, and writing strings/lists
# cell by cell into a frame pre-filled with float NaN relies on implicit dtype
# changes and on column creation by enlargement.)
records = []
for extra in shots_extra:
    # every selected feature defaults to False when the payload does not mention it
    record = dict.fromkeys(keys, False)
    for key, value in extra['shot'].items():
        if key in named_keys:
            record[key] = value['name']
        elif key in keys:
            record[key] = value
        elif key == 'freeze_frame':
            # kept wrapped in a one-element list of dicts, as in the saved file format
            record[key] = [{'freeze_frame': value}]
    records.append(record)

# 'freeze_frame' becomes a column only if at least one shot provides it;
# shots without one get False, like every other unspecified value
has_freeze_frame = any('freeze_frame' in record for record in records)
if has_freeze_frame:
    for record in records:
        record.setdefault('freeze_frame', False)
columns = (keys + ['freeze_frame']) if has_freeze_frame else keys

# save selected features in a dataframe (passing columns fixes their order and
# keeps the frame well-formed even when there are no shots at all)
extra_features = pd.DataFrame(records, columns = columns)

# transform columns with boolean values into integers
# (plain column assignment replaces the columns, so the integer dtype is kept;
# .loc[:, cols] = ... writes into the existing columns instead)
extra_features[flag_keys] = extra_features[flag_keys].astype(int)
shots['under_pressure'] = shots['under_pressure'].astype(int)

In [43]:
# put the remaining shot columns and the features extracted from 'extra' side by side
shot_columns_kept = shots.drop(columns = ['extra', 'type_name'])
refined_shots = pd.concat([shot_columns_kept, extra_features], axis = 'columns')
refined_shots.head()

Unnamed: 0,play_pattern_name,position_name,under_pressure,x_start,y_start,follows_dribble,first_time,open_goal,statsbomb_xg,type,technique,body_part,outcome,freeze_frame
0,Regular Play,Right Wing,0,95.9,58.9,0,0,0,0.013642,Open Play,Normal,Left Foot,Saved,"[{'freeze_frame': [{'location': [119.6, 42.3],..."
1,From Free Kick,Center Forward,0,106.1,54.3,0,0,0,0.040844,Open Play,Normal,Right Foot,Off T,"[{'freeze_frame': [{'location': [118.8, 43.2],..."
2,From Free Kick,Left Wing,1,110.0,28.2,0,0,0,0.136871,Open Play,Normal,Left Foot,Saved,"[{'freeze_frame': [{'location': [111.3, 39.8],..."
3,From Throw In,Center Attacking Midfield,0,113.2,40.4,0,0,0,0.124624,Open Play,Normal,Head,Post,"[{'freeze_frame': [{'location': [105.8, 46.6],..."
4,From Counter,Center Attacking Midfield,0,95.2,39.8,0,0,0,0.023807,Open Play,Normal,Left Foot,Post,"[{'freeze_frame': [{'location': [97.8, 49.4], ..."


In [44]:
# save the refined shot table as a .csv file (the row index is not written)
refined_shots.to_csv('data/shots.csv', index = False)