**Disclaimer**: this notebook's compatibility with StatsBomb event data 4.0.0 was last checked on December 30th, 2023.

In [1]:
!pip install socceraction

Collecting socceraction
  Downloading socceraction-1.5.1-py3-none-any.whl.metadata (6.6 kB)
Collecting lxml<5.0.0,>=4.9.3 (from socceraction)
  Using cached lxml-4.9.4.tar.gz (3.6 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting numpy<2.0.0,>=1.26.0 (from socceraction)
  Downloading numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas<3.0.0,>=2.1.1 (from socceraction)
  Downloading pandas-2.2.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting pandera<0.18.0,>=0.17.2 (from socceraction)
  Downloading pandera-0.17.2-py3-none-any.whl.metadata (15 kB)
Collecting scikit-learn<2.0.0,>=1.3.1 (from socceraction)
  Downloading scikit_learn-1.4.1

In [2]:
!pip install statsbombpy

Collecting statsbombpy
  Using cached statsbombpy-1.12.0-py3-none-any.whl.metadata (63 kB)
Collecting requests (from statsbombpy)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting requests-cache (from statsbombpy)
  Using cached requests_cache-1.2.0-py3-none-any.whl.metadata (9.9 kB)
Collecting inflect (from statsbombpy)
  Using cached inflect-7.0.0-py3-none-any.whl.metadata (21 kB)
Collecting charset-normalizer<4,>=2 (from requests->statsbombpy)
  Downloading charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (33 kB)
Collecting idna<4,>=2.5 (from requests->statsbombpy)
  Using cached idna-3.6-py3-none-any.whl.metadata (9.9 kB)
Collecting urllib3<3,>=1.21.1 (from requests->statsbombpy)
  Using cached urllib3-2.2.1-py3-none-any.whl.metadata (6.4 kB)
Collecting certifi>=2017.4.17 (from requests->statsbombpy)
  Using cached certifi-2024.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting attrs>=21.2 (from requests-cache->statsbombpy)
  Using cached 

In [11]:
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.66.2-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.2


In [18]:
from socceraction.data.statsbomb import StatsBombLoader
import socceraction.spadl as spadl
import socceraction.atomic.spadl as atomicspadl
import tqdm

## Set up the StatsBombLoader

In [19]:
# Use this if you only want to use the free public StatsBomb data.
# The loader reads the open-data files directly from the GitHub URL below.
free_open_data_remote = "https://raw.githubusercontent.com/statsbomb/open-data/master/data/"
SBL = StatsBombLoader(root=free_open_data_remote, getter="remote")

# Uncomment the code below if you have a local folder on your computer with StatsBomb data.
# (StatsBombLoader is the name imported at the top of the notebook.)
#datafolder = "../data-epl" # Example of local folder with statsbomb data
#SBL = StatsBombLoader(root=datafolder, getter="local")

## Select competitions to load and convert

In [20]:
# View all available competitions
competitions = SBL.competitions()
set(competitions.competition_name)

{'1. Bundesliga',
 'African Cup of Nations',
 'Champions League',
 'Copa del Rey',
 "FA Women's Super League",
 'FIFA U20 World Cup',
 'FIFA World Cup',
 'Indian Super league',
 'La Liga',
 'Liga Profesional',
 'Ligue 1',
 'Major League Soccer',
 'NWSL',
 'North American League',
 'Premier League',
 'Serie A',
 'UEFA Euro',
 'UEFA Europa League',
 "UEFA Women's Euro",
 "Women's World Cup"}

In [21]:
# Keep only the seasons of the FA Women's Super League
is_wsl = competitions["competition_name"] == "FA Women's Super League"
selected_competitions = competitions[is_wsl]
selected_competitions

Unnamed: 0,season_id,competition_id,competition_name,country_name,competition_gender,season_name
23,90,37,FA Women's Super League,England,female,2020/2021
24,42,37,FA Women's Super League,England,female,2019/2020
25,4,37,FA Women's Super League,England,female,2018/2019


In [22]:
import pandas as pd
# None lifts the limit on the number of columns shown, so wide DataFrames are displayed in full
pd.set_option('display.max_columns', None)
import tqdm

In [23]:
import pandas as pd

# Collect the fixture list of every selected competition season, then stack them into one frame
season_games = []
for season in selected_competitions.itertuples():
    season_games.append(SBL.games(season.competition_id, season.season_id))
games = pd.concat(season_games)

# Preview the teams, kick-off time and final score of each game
games[["home_team_id", "away_team_id", "game_date", "home_score", "away_score"]]

Unnamed: 0,home_team_id,away_team_id,game_date,home_score,away_score
0,2647,968,2021-02-28 15:00:00,0,4
1,968,972,2021-04-28 20:30:00,2,0
2,2647,749,2021-02-06 13:30:00,1,0
3,965,967,2021-03-28 13:30:00,0,5
4,971,2647,2021-03-28 15:30:00,2,0
...,...,...,...,...,...
103,966,968,2019-03-24 13:30:00,1,5
104,968,972,2018-09-23 15:00:00,4,3
105,973,972,2019-04-28 16:00:00,1,2
106,973,967,2019-03-24 16:00:00,1,0


## Load and convert match data

In [16]:
# import warnings
# warnings.filterwarnings("ignore")


In [24]:
# Load every selected game and convert its events to SPADL and atomic-SPADL actions.
games_verbose = tqdm.tqdm(list(games.itertuples()), desc="Loading game data")
teams, players = [], []
# Both dicts are keyed by game_id.
actions = {}
atomic_actions = {}
for game in games_verbose:
    # load data
    teams.append(SBL.teams(game.game_id))
    players.append(SBL.players(game.game_id))
    events = SBL.events(game.game_id)

    # convert data
    # Use a per-game name so the `actions` dict is filled instead of being
    # overwritten by a single game's DataFrame on every iteration.
    game_actions = spadl.statsbomb.convert_to_actions(
        events,
        home_team_id=game.home_team_id,
        xy_fidelity_version=1,
        shot_fidelity_version=1
    )
    actions[game.game_id] = game_actions
    atomic_actions[game.game_id] = atomicspadl.convert_to_atomic(game_actions)

# A team appears once per game it played; keep a single row per team_id.
teams = pd.concat(teams).drop_duplicates(subset="team_id")
players = pd.concat(players)

Loading game data: 100%|██████████| 326/326 [04:47<00:00,  1.13it/s]


## Store converted atomic-SPADL data in a CSV file

In [25]:
# Stack the per-game atomic actions into one frame and write it to disk as CSV
all_atomic_actions = pd.concat(atomic_actions.values())
all_atomic_actions.to_csv("WSL_actions.csv")