In [1]:
import pandas as pd
import numpy as np
import json
import seaborn as sns

# accessing data library
from statsbombpy import sb

# opening the libraries as DF's
from mplsoccer import Sbopen
parser = Sbopen()

# plotting
import os
import pathlib
import warnings
import statsmodels.api as sm
import statsmodels.formula.api as smf
from mplsoccer import Pitch, VerticalPitch
import matplotlib.pyplot as plt

pd.options.mode.chained_assignment = None
warnings.filterwarnings('ignore')

# What is a Possession Chain?


| Variable | Variable Type | Variable Description | Values | Value Description |
|----------|---------------|----------------------|--------|-------------------|
| possession | integer | Indicates the current unique possession in the game. A single possession denotes a period of play in which the ball is in play and a single team is in control of the ball. | e.g., 1 - # of unique possessions | New possession are triggered after a team demonstrate they've established control of the ball. A new possession can begin even if the same team has possession of the ball for example, a blocked pass goes out for a throw in for the same team, this would be a new possession for the same attacking team.

# Working with the WSL, the competition_id = 37

## The relevant season_id will be 90 (20/21), 42 (19/20), and 4 (18/19)


In [2]:
competitions = sb.competitions()

competitions[competitions['competition_name']=="FA Women's Super League"]

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
22,37,90,England,FA Women's Super League,female,False,False,2020/2021,2023-02-25T14:52:09.326729,2021-06-13T16:17:31.694,,2023-02-25T14:52:09.326729
23,37,42,England,FA Women's Super League,female,False,False,2019/2020,2023-07-25T01:08:03.214566,2021-06-13T16:17:31.694,,2023-07-25T01:08:03.214566
24,37,4,England,FA Women's Super League,female,False,False,2018/2019,2023-07-24T19:50:20.881595,2021-06-13T16:17:31.694,,2023-07-24T19:50:20.881595


# There are 130 games for each season
## I'll need to iterate over each game in each season, and add a column to describe each for the event data.

# Below are the list of match_id 's for each of the seasons

In [None]:
wsl_18_19 = sb.matches(competition_id=37,season_id=4)
match_list_18_19 = wsl_18_19.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()


wsl_19_20 = sb.matches(competition_id=37,season_id=42)
match_list_19_20 = wsl_19_20.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()

wsl_20_21 = sb.matches(competition_id=37,season_id=90)
match_list_20_21 = wsl_20_21.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()

#match_list_18_19
#match_list_19_20
#match_list_20_21


# Below is how I access specific matches

In [4]:
match_events = sb.events(match_id=3764234)

In [14]:
match_events.shape

(3758, 88)

In [26]:
match_events.columns

Index(['ball_receipt_outcome', 'ball_recovery_recovery_failure',
       'block_offensive', 'carry_end_location', 'clearance_aerial_won',
       'clearance_body_part', 'clearance_head', 'clearance_left_foot',
       'clearance_right_foot', 'counterpress', 'dribble_nutmeg',
       'dribble_outcome', 'dribble_overrun', 'duel_outcome', 'duel_type',
       'duration', 'foul_committed_advantage', 'foul_committed_offensive',
       'foul_committed_type', 'foul_won_advantage', 'foul_won_defensive',
       'goalkeeper_body_part', 'goalkeeper_end_location', 'goalkeeper_outcome',
       'goalkeeper_position', 'goalkeeper_technique', 'goalkeeper_type', 'id',
       'index', 'interception_outcome', 'location', 'match_id', 'minute',
       'miscontrol_aerial_won', 'off_camera', 'out', 'pass_aerial_won',
       'pass_angle', 'pass_assisted_shot_id', 'pass_body_part', 'pass_cross',
       'pass_cut_back', 'pass_deflected', 'pass_end_location', 'pass_height',
       'pass_inswinging', 'pass_length', 'p

In [36]:
match_events[match_events[
    'possession']==16].loc[match_events[
    'type']=='Pass'][['timestamp','possession_team','team','player','position',
                      'pass_recipient', 'pass_outcome','pass_miscommunication',
                      'location','pass_end_location']]

Unnamed: 0,timestamp,possession_team,team,player,position,pass_recipient,pass_outcome,pass_miscommunication,location,pass_end_location
100,00:07:24.806,Manchester City WFC,Manchester City WFC,Ellie Roebuck,Goalkeeper,Gemma Bonner,,,"[6.0, 44.0]","[11.2, 24.3]"
101,00:07:31.081,Manchester City WFC,Manchester City WFC,Gemma Bonner,Left Center Back,Ellie Roebuck,,,"[14.6, 23.9]","[6.9, 41.0]"
102,00:07:34.938,Manchester City WFC,Manchester City WFC,Ellie Roebuck,Goalkeeper,Stephanie Houghton,,,"[9.5, 42.9]","[7.5, 55.8]"
103,00:07:36.369,Manchester City WFC,Manchester City WFC,Stephanie Houghton,Right Center Back,Samantha June Mewis,,,"[7.5, 55.1]","[24.4, 56.0]"
104,00:07:37.642,Manchester City WFC,Manchester City WFC,Samantha June Mewis,Right Center Midfield,Esme Beth Morgan,,,"[23.4, 55.1]","[27.9, 73.7]"
105,00:07:44.183,Manchester City WFC,Manchester City WFC,Esme Beth Morgan,Right Back,Stephanie Houghton,,,"[49.5, 68.4]","[32.8, 61.5]"
106,00:07:47.348,Manchester City WFC,Manchester City WFC,Stephanie Houghton,Right Center Back,Janine Elizabeth Beckie,,,"[33.2, 59.6]","[55.9, 75.2]"
107,00:07:49.244,Manchester City WFC,Manchester City WFC,Janine Elizabeth Beckie,Right Wing,Samantha June Mewis,Incomplete,,"[56.1, 75.2]","[55.7, 67.5]"
108,00:07:51.734,Manchester City WFC,Aston Villa,Marisa Ewers,Right Center Midfield,Diana Micaela Abreu de Sousa e Silva,,,"[68.3, 4.2]","[70.4, 6.8]"
109,00:07:53.726,Manchester City WFC,Manchester City WFC,Esme Beth Morgan,Right Back,Stephanie Houghton,,,"[46.9, 72.7]","[34.9, 59.4]"


+ # I want to be able to concatenate all games across a season to then analyse each team's performance for the whole season.

+ # Firstly, I only want to consider the possesion chains beginning with a GK as well as the chain following:

```python
## ONLY GK CHAINS ##
gk_chains = (df[(df['play_pattern']=='From Goal Kick')]['possession'].unique())
gk_chain_list = gk_chains.tolist()

## THE CHAIN FOLLOWING A GK CHAIN ##
aftr_gk_chain_list = (gk_chains+1).tolist()
aftr_gk_chain_list
````

+ ## Once I have event data for a single game, I want to filter to the relevant chains, add columns for 'season', 'match', 'home_team', 'away_team', and 'defending_team' team:

```python

## CREATING THE DEFENDING TEAM COLUMN ##

home_team = df_match['team'].iloc[0]
away_team = df_match['team'].iloc[1]

def out_of_possession_team(row):
    if row['possession_team'] == home_team:
        return away_team
    elif row['possession_team'] == away_team:
        return home_team
    else:
        return None

df_match['defending_team'] = df_match.apply(out_of_possession_team, axis=1)

```

+ ## I then want to move onto the next game's event data and repeat until end.

+ ## I'll also need to split the start & end locations for each event (or 'type):
```python
df[['X_start', 'Y_start']] = df['location'].apply(pd.Series)
df[['X_end_pass', 'Y_end_pass']] = df['pass_end_location'].apply(pd.Series)
df[['X_end_carry', 'Y_end_carry']] = df['carry_end_location'].apply(pd.Series)
df[['X_end_shot', 'Y_end_shot', 'Z_end_shot']] = df['shot_end_location'].apply(pd.Series)
df[['X_end_gk', 'Y_end_gk']] = df['goalkeeper_end_location'].apply(pd.Series)
````
Is it possible to fit the end locations into single x, y, z columns, oppose to one for each event type?

+ ## Once completed, I want to narrow the columns to only the relevant...
... Currently, this would be:
        

```python

df[['match','season','home_team','away_team',
    'timestamp', 'duration','possession','possession_team','defending_team',
    'play_pattern','type','team','player','position',
    'pass_length','pass_angle','pass_height','pass_outcome','pass_body_part','pass_shot_assist',
    'shot_statsbomb_xg','shot_outcome',
    'X_start', 'Y_start',
    'X_end_pass','Y_end_pass','X_end_carry','Y_end_carry','X_end_shot','Y_end_shot','Z_end_shot']]
         
````


In [6]:
match_events['play_pattern'].unique()

array(['Regular Play', 'From Kick Off', 'From Goal Kick', 'From Throw In',
       'From Free Kick', 'From Corner', 'From Keeper', 'From Counter',
       'Other'], dtype=object)

In [5]:
## VIEWING THE STARTING 'TYPE' OF EVERY POSSESSION CHAIN TO UNDERSTAND WHICH TYPES ARE NEEDED...

#match_events.sort_values(by='timestamp').groupby('possession').first()['type'].unique()

#match_events.groupby('possession').last()['type'].unique()
match_events.groupby('possession').first()['type'].unique()


## VIEWING THE STARTING 'PLAY_PATTERN' OF EVERY POSSESSION CHAIN...

#match_events.sort_values(by='timestamp').groupby('possession').first()['play_pattern'].unique()
#match_events.groupby('possession').first()['play_pattern'].unique()

array(['Starting XI', 'Pass', 'Carry', 'Half Start', 'Goal Keeper',
       'Referee Ball-Drop'], dtype=object)

# Creating a means of concatenating the match event data


## Creating the match lists for each season:

In [None]:
wsl_18_19 = sb.matches(competition_id=37,season_id=4)
match_list_18_19 = wsl_18_19.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()


wsl_19_20 = sb.matches(competition_id=37,season_id=42)
match_list_19_20 = wsl_19_20.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()

wsl_20_21 = sb.matches(competition_id=37,season_id=90)
match_list_20_21 = wsl_20_21.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()

#match_list_18_19
#match_list_19_20
#match_list_20_21

In [None]:
# match_events = sb.events(match_id=3764234

# Creating the function to output match event data:

In [None]:
match = 19770
season = '18/19'

def parse_match_data(match): ## SETTING-UP HOME & AWAY TEAMS AS VARIABLES
    df_match = sb.events(match_id = match)
    home_team = df_match['team'].iloc[0]
    away_team = df_match['team'].iloc[1]

    def out_of_possession_team(row): ## CREATING THE DEFENDING TEAM COLUMN
        if row['possession_team'] == home_team:
            return away_team
        elif row['possession_team'] == away_team:
            return home_team
        else:
            return None

    df_match['defending_team'] = df_match.apply(out_of_possession_team, axis=1)
    
    ## SPLITTING THE START LOACTION INTO X,Y COLUMNS
    df_match[['x_start', 'y_start']] = df_match['location'].apply(pd.Series)
    
    ## SPLITTING THE END LOACTIONS INTO X,Y COLUMNS
    df_match[['X_end_pass', 'Y_end_pass']] = df_match['pass_end_location'].apply(pd.Series)
    df_match[['X_end_carry', 'Y_end_carry']] = df_match['carry_end_location'].apply(pd.Series)
    df_match[['X_end_shot', 'Y_end_shot', 'z_end_shot']] = df_match['shot_end_location'].apply(pd.Series)
    df_match[['X_end_gk', 'Y_end_gk']] = df_match['goalkeeper_end_location'].apply(pd.Series)
    
    ## Combine 'x_end' columns into one column 'x_end'
    X_end_columns = [col for col in df_match.columns if 'X_end' in col]
    df_match['x_end'] = df_match[X_end_columns].apply(lambda row: row.dropna().iloc[0] if not row.dropna().empty else np.nan, axis=1)

    ## Combine 'y_end' columns into one column 'y_end'
    Y_end_columns = [col for col in df_match.columns if 'Y_end' in col]
    df_match['y_end'] = df_match[Y_end_columns].apply(lambda row: row.dropna().iloc[0] if not row.dropna().empty else np.nan, axis=1)
    
    ## Drop the original 'x_end' and 'y_end' columns
    df_match = df_match.drop(columns=X_end_columns + Y_end_columns)
    
    df_match['home_team'] = home_team
    df_match['away_team'] = away_team
    
    df_match['match'] = match
    df_match['season'] = '2018/19'
    
    df_match = df_match[['match','season','home_team','away_team',
    'timestamp', 'duration','possession','possession_team','defending_team',
    'play_pattern','type','team','player','position',
    'pass_recipient','pass_outcome','pass_length','pass_height','pass_angle','pass_body_part','pass_shot_assist',
    'shot_statsbomb_xg','shot_outcome',
    'x_start', 'y_start','x_end','y_end','z_end_shot']]
        
    return df_match

# Data Check...

In [None]:
parse_match_data(19777).columns

In [None]:
## Check actual data...

sb.events(match_id = 19777)[['type']]

In [None]:
## Check parsed data...

parse_match_data(19777)[['type']]

In [None]:
parse_match_data(19777).groupby('possession').first()['type'].unique()

In [None]:
parse_match_data(19777)[parse_match_data(19777)['type']=='Referee Ball-Drop']

In [None]:
parse_match_data(19777)[parse_match_data(19777)['possession']==169].sort_values(by='timestamp')

# Concatenating a season's matches

In [7]:
wsl_18_19 = sb.matches(competition_id=37,season_id=4)
match_list_18_19 = wsl_18_19.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()


wsl_19_20 = sb.matches(competition_id=37,season_id=42)
match_list_19_20 = wsl_19_20.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()

wsl_20_21 = sb.matches(competition_id=37,season_id=90)
match_list_20_21 = wsl_20_21.sort_values(['match_date','kick_off','match_id'])['match_id'].tolist()

#match_list_18_19
#match_list_19_20
#match_list_20_21

In [37]:
def parse_match_data(match): ## SETTING-UP HOME & AWAY TEAMS AS VARIABLES
    df_match = sb.events(match_id = match)
    home_team = df_match['team'].iloc[0]
    away_team = df_match['team'].iloc[1]

    def out_of_possession_team(row): ## CREATING THE DEFENDING TEAM COLUMN
        if row['possession_team'] == home_team:
            return away_team
        elif row['possession_team'] == away_team:
            return home_team
        else:
            return None

    df_match['defending_team'] = df_match.apply(out_of_possession_team, axis=1)
    
    ## SPLITTING THE START LOACTION INTO X,Y COLUMNS
    df_match[['x_start', 'y_start']] = df_match['location'].apply(pd.Series)
    
    ## SPLITTING THE END LOACTIONS INTO X,Y COLUMNS
    df_match[['X_end_pass', 'Y_end_pass']] = df_match['pass_end_location'].apply(pd.Series)
    df_match[['X_end_carry', 'Y_end_carry']] = df_match['carry_end_location'].apply(pd.Series)
    df_match[['X_end_shot', 'Y_end_shot', 'z_end_shot']] = df_match['shot_end_location'].apply(pd.Series)
    df_match[['X_end_gk', 'Y_end_gk']] = df_match['goalkeeper_end_location'].apply(pd.Series)
    
    ## Combine 'x_end' columns into one column 'x_end'
    X_end_columns = [col for col in df_match.columns if 'X_end' in col]
    df_match['x_end'] = df_match[X_end_columns].apply(lambda row: row.dropna().iloc[0] if not row.dropna().empty else np.nan, axis=1)

    ## Combine 'y_end' columns into one column 'y_end'
    Y_end_columns = [col for col in df_match.columns if 'Y_end' in col]
    df_match['y_end'] = df_match[Y_end_columns].apply(lambda row: row.dropna().iloc[0] if not row.dropna().empty else np.nan, axis=1)
    
    ## Drop the original 'x_end' and 'y_end' columns
    df_match = df_match.drop(columns=X_end_columns + Y_end_columns)
    
    df_match['home_team'] = home_team
    df_match['away_team'] = away_team
    
    df_match['match'] = match
    
    df_match = df_match[['match','home_team','away_team',
    'timestamp', 'duration','possession','possession_team','defending_team',
    'play_pattern','type','team','player','position',
    'pass_recipient','pass_outcome','pass_length','pass_height','pass_angle','pass_body_part','pass_shot_assist',
    'shot_statsbomb_xg','shot_outcome',
    'x_start', 'y_start','x_end','y_end','z_end_shot']]
        
    return df_match

# 2018-19 DF

In [38]:
matches_ssn1819 = pd.DataFrame()

In [39]:
for match in match_list_18_19:
    # Call the parse_match_data function for each match_id
    match_data = parse_match_data(match)
    
    # Concatenate the current match_data to the overall dataframe
    matches_ssn1819 = pd.concat([matches_ssn1819, match_data], ignore_index=True)
    
    matches_ssn1819['season'] = '2018/19'

In [40]:
matches_ssn1819.shape

(360362, 28)

In [None]:
matches_ssn1819.columns

In [41]:
matches_ssn1819 = matches_ssn1819[['season','match', 'home_team', 'away_team', 'timestamp', 'duration',
       'possession', 'possession_team', 'defending_team', 'play_pattern',
       'type', 'team', 'player', 'position', 'pass_length', 'pass_angle',
       'pass_recipient','pass_outcome','pass_length','pass_height','pass_angle','pass_body_part','pass_shot_assist',
       'shot_statsbomb_xg', 'shot_outcome', 'x_start', 'y_start', 'x_end',
       'y_end', 'z_end_shot']]

In [42]:
matches_ssn1819

Unnamed: 0,season,match,home_team,away_team,timestamp,duration,possession,possession_team,defending_team,play_pattern,...,pass_angle,pass_body_part,pass_shot_assist,shot_statsbomb_xg,shot_outcome,x_start,y_start,x_end,y_end,z_end_shot
0,2018/19,7298,Manchester City WFC,Chelsea FCW,00:00:00.000,0.00,1,Manchester City WFC,Chelsea FCW,Regular Play,...,,,,,,,,,,
1,2018/19,7298,Manchester City WFC,Chelsea FCW,00:00:00.000,0.00,1,Manchester City WFC,Chelsea FCW,Regular Play,...,,,,,,,,,,
2,2018/19,7298,Manchester City WFC,Chelsea FCW,00:00:00.000,7.96,1,Manchester City WFC,Chelsea FCW,Regular Play,...,,,,,,,,,,
3,2018/19,7298,Manchester City WFC,Chelsea FCW,00:00:00.000,8.16,1,Manchester City WFC,Chelsea FCW,Regular Play,...,,,,,,,,,,
4,2018/19,7298,Manchester City WFC,Chelsea FCW,00:00:00.000,9.52,100,Manchester City WFC,Chelsea FCW,Regular Play,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
360357,2018/19,19822,Yeovil Town LFC,Birmingham City WFC,00:41:05.024,0.00,191,Yeovil Town LFC,Birmingham City WFC,Regular Play,...,,,,,,,,,,
360358,2018/19,19822,Yeovil Town LFC,Birmingham City WFC,00:41:06.758,0.00,191,Yeovil Town LFC,Birmingham City WFC,Regular Play,...,,,,,,,,,,
360359,2018/19,19822,Yeovil Town LFC,Birmingham City WFC,00:18:51.295,0.00,133,Birmingham City WFC,Yeovil Town LFC,Regular Play,...,,,,,,116.0,43.0,,,
360360,2018/19,19822,Yeovil Town LFC,Birmingham City WFC,00:18:51.295,0.00,133,Birmingham City WFC,Yeovil Town LFC,Regular Play,...,,,,,,5.0,38.0,,,


In [None]:
matches_ssn1819['shot_outcome'].describe()

# 2019-20 DF

In [43]:
matches_ssn1920 = pd.DataFrame()

In [44]:
for match in match_list_19_20:
    match_data = parse_match_data(match)
    matches_ssn1920 = pd.concat([matches_ssn1920, match_data], ignore_index=True)
    matches_ssn1920['season'] = '2019/20'

In [45]:
matches_ssn1920 = matches_ssn1920[['season','match', 'home_team', 'away_team', 'timestamp', 'duration',
       'possession', 'possession_team', 'defending_team', 'play_pattern',
       'type', 'team', 'player', 'position', 'pass_length', 'pass_angle',
       'pass_recipient','pass_outcome','pass_length','pass_height','pass_angle','pass_body_part','pass_shot_assist',
       'shot_statsbomb_xg', 'shot_outcome', 'x_start', 'y_start', 'x_end',
       'y_end', 'z_end_shot']]

In [47]:
matches_ssn1920.columns

Index(['season', 'match', 'home_team', 'away_team', 'timestamp', 'duration',
       'possession', 'possession_team', 'defending_team', 'play_pattern',
       'type', 'team', 'player', 'position', 'pass_length', 'pass_angle',
       'pass_recipient', 'pass_outcome', 'pass_length', 'pass_height',
       'pass_angle', 'pass_body_part', 'pass_shot_assist', 'shot_statsbomb_xg',
       'shot_outcome', 'x_start', 'y_start', 'x_end', 'y_end', 'z_end_shot'],
      dtype='object')

# 2020-21 DF

In [48]:
matches_ssn2021 = pd.DataFrame()

In [49]:
for match in match_list_20_21:
    match_data = parse_match_data(match)
    matches_ssn2021 = pd.concat([matches_ssn2021, match_data], ignore_index=True)
    matches_ssn2021['season'] = '2020/21'

In [50]:
matches_ssn2021 = matches_ssn2021[['season','match', 'home_team', 'away_team', 'timestamp', 'duration',
       'possession', 'possession_team', 'defending_team', 'play_pattern',
       'type', 'team', 'player', 'position', 'pass_length', 'pass_angle',
       'pass_recipient','pass_outcome','pass_length','pass_height','pass_angle','pass_body_part','pass_shot_assist',
       'shot_statsbomb_xg', 'shot_outcome', 'x_start', 'y_start', 'x_end',
       'y_end', 'z_end_shot']]

In [51]:
matches_ssn2021.shape

(443304, 30)

In [52]:
matches_ssn2021.to_csv('../data/matches_ssn2021.csv', index=False)

matches_ssn1920.to_csv('../data/matches_ssn1920.csv', index=False)

matches_ssn1819.to_csv('../data/matches_ssn1819.csv', index=False)