**Disclaimer**: this notebook's compatibility with StatsBomb event data 4.0.0 was last checked on December 30th, 2023.

In [1]:
!pip install socceraction



In [2]:
!pip install statsbombpy



In [3]:
from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl
import socceraction.atomic.spadl as atomicspadl

## Set up the StatsBomb loader

In [4]:
# Default: read the free, public StatsBomb open-data repository.
free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"
SBL = StatsBombLoader(
    getter="remote",
    root=free_open_data_remote,
)

# Alternative: if you have a local folder with StatsBomb data, uncomment the
# two lines below (and adapt the path) to load from disk instead.
# datafolder = "../data-epl"  # example of a local folder with StatsBomb data
# SBL = StatsBombLoader(root=datafolder, getter="local")

## Select competitions to load and convert

In [5]:
# List every competition the loader can see, then show the distinct names
competitions = SBL.competitions()
set(competitions["competition_name"])



{'1. Bundesliga',
 'African Cup of Nations',
 'Champions League',
 'Copa del Rey',
 "FA Women's Super League",
 'FIFA U20 World Cup',
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'Liga Profesional',
 'Ligue 1',
 'Major League Soccer',
 'NWSL',
 'North American League',
 'Premier League',
 'Serie A',
 'UEFA Euro',
 'UEFA Europa League',
 "UEFA Women's Euro",
 "Women's World Cup"}

In [6]:
# Keep only the FA Women's Super League rows (one row per available season)
selected_competitions = competitions.loc[
    competitions["competition_name"] == "FA Women's Super League"
]
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
23,90,37,FA Women's Super League,England,female,2020/2021
24,42,37,FA Women's Super League,England,female,2019/2020
25,4,37,FA Women's Super League,England,female,2018/2019


In [7]:
import pandas as pd
import tqdm

# Do not truncate columns when a wide DataFrame is displayed
pd.set_option("display.max_columns", None)

In [8]:
import pandas as pd
# Fetch the game list of every selected (competition, season) pair and stack
# the per-season frames into a single DataFrame.
season_games = [
    SBL.games(competition.competition_id, competition.season_id)
    for competition in selected_competitions.itertuples()
]
games = pd.concat(season_games)

# Preview the columns that identify a fixture and its result
games[["home_team_id", "away_team_id", "game_date", "home_score", "away_score"]]



Unnamed: 0,home_team_id,away_team_id,game_date,home_score,away_score
0,2647,968,2021-02-28 15:00:00,0,4
1,968,972,2021-04-28 20:30:00,2,0
2,2647,749,2021-02-06 13:30:00,1,0
3,965,967,2021-03-28 13:30:00,0,5
4,971,2647,2021-03-28 15:30:00,2,0
...,...,...,...,...,...
103,966,968,2019-03-24 13:30:00,1,5
104,968,972,2018-09-23 15:00:00,4,3
105,973,972,2019-04-28 16:00:00,1,2
106,973,967,2019-03-24 16:00:00,1,0


## Load and convert match data

In [9]:
# For every selected game: load its teams, players and events, then convert
# the events to SPADL actions and to atomic-SPADL actions.
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams, players = [], []
actions = {}  # game_id -> SPADL actions of that game
atomic_actions = {}  # game_id -> atomic-SPADL actions of that game
for game in games_verbose:
    # load data
    teams.append(SBL.teams(game.game_id))
    players.append(SBL.players(game.game_id))
    events = SBL.events(game.game_id)

    # convert data
    # Keep the result in a local name and store it per game; rebinding
    # `actions` itself would replace the dict and drop all earlier games.
    # NOTE(review): both fidelity versions are pinned to 1 - confirm this
    # matches the data being loaded.
    game_actions = spadl.statsbomb.convert_to_actions(
        events,
        home_team_id=game.home_team_id,
        xy_fidelity_version=1,
        shot_fidelity_version=1,
    )
    actions[game.game_id] = game_actions
    atomic_actions[game.game_id] = atomicspadl.convert_to_atomic(game_actions)

# A team is loaded once per game it plays, so keep a single row per team_id
teams = pd.concat(teams).drop_duplicates(subset="team_id")
# Per-game player frames are simply stacked (no de-duplication)
players = pd.concat(players)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  eventsdf["under_pressure"] = eventsdf["under_pressure"].fillna(False).astype(bool)
  eventsdf["counterpress"] = eventsdf["counterpress"].fillna(False).astype(bool)
  eventsdf["under_pressure"] = eventsdf["under_pressure"].fillna(False).astype(bool)
  eventsdf["counterpress"] = eventsdf["counterpress"].fillna(False).astype(bool)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  events['extra'].fillna({}, inplace=True)
  eventsdf["under_pressure"] = eventsdf["under_pressure"].fillna(False).astype(bool)
  eventsdf["counterpress"] = eventsdf["counterpress"].fillna(False).astype(bool)
  eventsdf["

## Store converted atomic-SPADL data in a CSV file

In [17]:
# Stack the per-game atomic-SPADL frames and write them to a single CSV file
all_atomic_actions = pd.concat(atomic_actions.values())
all_atomic_actions.to_csv("WSL_actions.csv")