In [None]:

# Install the third-party packages used by this notebook via shell escapes.
# NOTE(review): versions are not pinned, so a fresh run may pull different
# releases than the ones this notebook was developed against.
!pip install statsbombpy
!pip install mplsoccer
!pip install socceraction




In [None]:
import pandas as pd
import json
from google.colab import drive
from statsbombpy import sb
from mplsoccer import Pitch
import os
import math
from tqdm.notebook import tqdm

# Mount Google Drive at /content/drive; the 360 input path and the HDF5 output
# path used later in this notebook both live under that mount point.
# `drive` is already imported in the import list above, so this second import
# is redundant but harmless.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
def _has_value(value):
    """Return False for ``None`` and float NaN, True for anything else.

    Rows taken from a DataFrame report absent attributes as NaN, and NaN is
    truthy in Python, so a plain ``if value:`` cannot tell "missing" from "set".
    """
    if value is None:
        return False
    return not (isinstance(value, float) and math.isnan(value))


def _is_set(value):
    """Return True only when ``value`` is present (not None/NaN) and truthy."""
    return _has_value(value) and bool(value)


def determine_action_type(event):
    """Map one StatsBomb event to an ``(action_type, result)`` pair.

    Args:
        event: Mapping-like object (dict or pandas Series) that supports
            ``event['type']`` and ``event.get(key)``. Missing attributes may
            be absent, ``None`` or NaN.

    Returns:
        A tuple ``(action_type, result)``. ``result`` is ``'success'`` or
        ``'fail'``, except for fouls / bad behaviour where it may also be
        ``'yellow_card'`` or ``'red_card'``. Events that match no rule yield
        ``('non_action', 'success')``.

    Raises:
        KeyError: if ``event`` has no ``'type'`` entry.
    """
    event_type = event['type']
    result = 'success'

    if event_type == 'Pass':
        # NaN-aware flag test: a bare truthiness check would classify every
        # pass whose `pass_cross` is NaN as a cross.
        if _is_set(event.get('pass_cross')):
            return 'cross', result
        set_piece_actions = {
            'Free Kick': 'freekick',
            'Throw-in': 'throw_in',
            'Corner': 'corner',
            'Goal Kick': 'goal_kick',
        }
        # NOTE(review): `pass_outcome` is not consulted, so every pass is
        # reported as 'success' (unchanged from the previous behaviour).
        return set_piece_actions.get(event.get('pass_type'), 'pass'), result

    elif event_type == 'Shot':
        shot_result = 'success' if event.get('shot_outcome') == 'Goal' else 'fail'
        # StatsBomb stores the set-piece kind in `shot_type`; the boolean
        # `penalty` / `freekick` keys are still honoured for callers that
        # supply them.
        shot_type = event.get('shot_type')
        if shot_type == 'Penalty' or _is_set(event.get('penalty')):
            return 'shot_penalty', shot_result
        elif shot_type == 'Free Kick' or _is_set(event.get('freekick')):
            return 'shot_freekick', shot_result
        else:
            return 'shot', shot_result

    elif event_type == 'Dribble':
        return 'take_on', 'success' if event.get('dribble_outcome') == 'Complete' else 'fail'

    elif event_type == 'Interception':
        return 'interception', 'success' if event.get('interception_outcome') == 'Won' else 'fail'

    # StatsBomb has no 'Tackle' event type: tackles are 'Duel' events whose
    # `duel_type` is 'Tackle'. The bare 'Tackle' type is kept for compatibility.
    elif event_type == 'Tackle' or (event_type == 'Duel' and event.get('duel_type') == 'Tackle'):
        return 'tackle', 'success' if event.get('duel_outcome') == 'Won' else 'fail'

    elif event_type == 'Clearance':
        # Success means a body part was recorded; NaN counts as "not recorded".
        return 'clearance', 'success' if _is_set(event.get('clearance_body_part')) else 'fail'

    elif event_type == 'Foul Committed':
        card = event.get('foul_committed_card')
        if card == 'Yellow Card':
            return 'foul', 'yellow_card'
        elif card == 'Red Card':
            return 'foul', 'red_card'
        else:
            return 'foul', 'fail'

    elif event_type == 'Bad Behaviour':
        card = event.get('bad_behaviour_card')
        if card == 'Yellow Card':
            return 'foul', 'yellow_card'
        elif card == 'Red Card':
            return 'foul', 'red_card'
        else:
            return 'foul', 'fail'

    elif event_type == 'Miscontrol':
        return 'bad_touch', 'fail'

    elif event_type == 'Carry':
        return 'dribble', result

    elif event_type == 'Goal Keeper':
        gk_type = event.get('goalkeeper_type')
        gk_outcome = event.get('goalkeeper_outcome')

        if gk_type in ['Shot Saved', 'Shot Saved Off T', 'Shot Saved To Post', 'Penalty Saved', 'Penalty Saved To Post']:
            return 'keeper_save', 'success'
        elif gk_type == 'Punch':
            return 'keeper_punch', 'fail' if gk_outcome == 'Fail' else 'success'
        elif gk_type in ['Claim', 'Pick Up', 'Collected']:
            return 'keeper_claim', 'success' if gk_outcome == 'Success' else 'fail'
        elif gk_type == 'Keeper Sweeper' and gk_outcome in ['Success', 'Claim']:
            return 'keeper_claim', 'success'
        # Any other goalkeeper event falls through to 'non_action'.

    return 'non_action', result

def _xy_or_none(location):
    """Return ``(x, y)`` from a coordinate sequence, else ``(None, None)``.

    Anything that is not a list/tuple with at least two items (e.g. NaN for a
    missing location) is treated as "no location" instead of being indexed.
    """
    if isinstance(location, (list, tuple)) and len(location) >= 2:
        return location[0], location[1]
    return None, None


def _polar_or_none(x, y):
    """Return ``(radius, angle_in_degrees)`` of ``(x, y)`` about the origin, or Nones."""
    if x is None or y is None:
        return None, None
    return math.sqrt(x**2 + y**2), math.degrees(math.atan2(y, x))


def _clean_bodypart(raw_bodypart):
    """Collapse a raw body-part label to Left Foot / Right Foot / Head / Other / None."""
    # NaN is how a DataFrame row reports "no body part"; it must map to None,
    # not to 'Other'.
    if raw_bodypart is None or (isinstance(raw_bodypart, float) and math.isnan(raw_bodypart)):
        return None
    if raw_bodypart in ['Left Foot', 'Right Foot', 'Head']:
        return raw_bodypart
    return 'Other'


def convert_events_to_spadl(events):
    """Convert a StatsBomb events DataFrame into a SPADL-like actions DataFrame.

    Args:
        events: DataFrame with one event per row. The columns ``type``, ``id``,
            ``match_id``, ``period``, ``minute`` and ``second`` are read with
            ``event[...]`` and must exist; all other attributes are optional.

    Returns:
        DataFrame with one row per input event (including events classified as
        ``'non_action'``) holding ids, start/end coordinates, their deltas,
        polar coordinates measured from the coordinate origin (0, 0), the
        normalised body part, the action type/result and, for shots, the xG.
    """
    # Event type -> column that holds the action's end location.
    end_location_columns = {
        'Pass': 'pass_end_location',
        'Carry': 'carry_end_location',
        'Block': 'block_end_location',
        'Shot': 'shot_end_location',
    }
    # Event type -> column that holds the body part used.
    bodypart_columns = {
        'Shot': 'shot_body_part',
        'Pass': 'pass_body_part',
        'Clearance': 'clearance_body_part',
        'Goal Keeper': 'goalkeeper_body_part',
    }

    spadl_records = []

    for _, event in events.iterrows():
        action_type, result = determine_action_type(event)
        event_type = event['type']

        start_x, start_y = _xy_or_none(event.get('location'))

        end_column = end_location_columns.get(event_type)
        end_x, end_y = _xy_or_none(event.get(end_column)) if end_column else (None, None)

        dx = end_x - start_x if end_x is not None and start_x is not None else None
        dy = end_y - start_y if end_y is not None and start_y is not None else None

        start_r, start_angle = _polar_or_none(start_x, start_y)
        end_r, end_angle = _polar_or_none(end_x, end_y)

        bodypart_column = bodypart_columns.get(event_type)
        bodypart = _clean_bodypart(event.get(bodypart_column) if bodypart_column else None)

        # Match clock in seconds, built from the event's minute/second fields.
        seconds = event['minute']*60 + event['second']

        spadl_records.append({
            'event_id': event['id'],
            'game_id': event['match_id'],
            'period_id': event['period'],
            'seconds': seconds,
            'player_id': event.get('player_id'),
            'player_name': event.get('player'),
            'team_id': event.get('team_id'),
            'team_name': event.get('team'),
            'start_x': start_x,
            'start_y': start_y,
            'end_x': end_x,
            'end_y': end_y,
            'dx': dx,
            'dy': dy,
            'start_r': start_r,
            'start_angle': start_angle,
            'end_r': end_r,
            'end_angle': end_angle,
            'bodypart': bodypart,
            'action_type': action_type,
            'result': result,
            'shot_xg': event.get('shot_statsbomb_xg') if action_type.startswith('shot') else None,
        })

    return pd.DataFrame(spadl_records)


In [None]:
def process_competition_and_save(
    competition_id,
    season_id,
    save_path,
    three_sixty_dir="/content/drive/My Drive/Data/statsbombpy/open-data/data/three-sixty",
):
    """Convert every match of a competition season to actions and save them to HDF5.

    For each match the events are loaded with ``sb.events``, converted with
    ``convert_events_to_spadl``, cleaned, and left-joined with the match's 360
    JSON file when that file exists. Each match is stored under the key
    ``actions/<match_id>`` in the HDF5 file at ``save_path``.

    Args:
        competition_id: Competition id passed to ``sb.matches``.
        season_id: Season id passed to ``sb.matches``.
        save_path: Destination HDF5 file; it is opened with mode ``'w'``, so an
            existing file is overwritten.
        three_sixty_dir: Directory containing ``<match_id>.json`` 360 files.
            Defaults to the Drive location this notebook was written for.

    Returns:
        None. A match that raises during processing is reported with ``print``
        and left out of the saved file; the remaining matches are still saved.
    """
    matches = sb.matches(competition_id=competition_id, season_id=season_id)
    all_actions = {}

    for _, match in tqdm(matches.iterrows(), total=len(matches), desc="Processing matches"):
        match_id = match["match_id"]

        try:
            # Load and convert the event data.
            events_df = sb.events(match_id=match_id)
            spadl_df = convert_events_to_spadl(events_df)

            # Keep located, unique, real actions in chronological order.
            spadl_df = spadl_df.dropna(subset=['start_x', 'start_y'])
            spadl_df = spadl_df.drop_duplicates(subset=['event_id'])
            spadl_df = spadl_df[spadl_df['action_type'] != 'non_action']
            spadl_df = spadl_df.sort_values(by=['game_id','period_id', 'seconds']).reset_index(drop=True)

            spadl_df["is_home_team"] = spadl_df["team_name"] == match["home_team"]
            spadl_df["is_away_team"] = spadl_df["team_name"] == match["away_team"]

            # Attach 360 data when a file for this match is available.
            file_path = os.path.join(three_sixty_dir, f"{match_id}.json")
            if os.path.exists(file_path):
                with open(file_path, "r", encoding="utf-8") as f:
                    match_360 = pd.read_json(f)
                match_360 = match_360.drop_duplicates(subset=['event_uuid'])
                # Both join keys are unique at this point, so the left merge
                # cannot duplicate actions. Do NOT de-duplicate on `event_uuid`
                # afterwards: actions without a 360 frame all carry NaN there,
                # and drop_duplicates treats NaN keys as equal, which would
                # discard every such action but the first.
                merged_df = pd.merge(spadl_df, match_360, left_on="event_id", right_on="event_uuid", how="left")
            else:
                merged_df = spadl_df

            # Persist the row position as an explicit column.
            merged_df['index'] = merged_df.index
            all_actions[match_id] = merged_df

        except Exception as e:
            # Best effort: report the failing match and continue with the rest.
            print(f"process fail: match {match_id} | {e}")

    with pd.HDFStore(save_path, mode='w') as store:
        for match_id, df in all_actions.items():
            store[f"actions/{match_id}"] = df

    print(f"Save Completed to: {save_path}")


In [None]:
# Driver call, intentionally disabled: it is wrapped in a string literal, so
# running this cell only echoes the text instead of starting the (long) export.
# Remove the surrounding triple quotes to run the pipeline for competition 55,
# season 43 and write the HDF5 file named in `save_path`.
'''
process_competition_and_save(
    competition_id=55,
    season_id=43,
    save_path='/content/drive/My Drive/Data/Processed/euro2020_spadl_merged.h5'
)
'''

"\nprocess_competition_and_save(\n    competition_id=55,\n    season_id=43,\n    save_path='/content/drive/My Drive/Data/Processed/euro2020_spadl_merged.h5'\n)\n"