In [9]:
from statsbombpy import sb
import pandas as pd
from pandas import json_normalize
import numpy as np
from shapely.geometry import MultiPoint
from multiprocessing import Pool
pd.set_option('display.max_colwidth', None)

In [10]:
position_scores = {
    'Goalkeeper': 1,
    'Right Center Back': 2,
    'Left Center Back': 2,
    'Center Back': 2,
    'Right Back': 3,
    'Left Back': 3,
    'Right Wing Back': 4,
    'Left Wing Back': 4,
    'Center Defensive Midfield': 5,
    'Right Defensive Midfield': 5,
    'Left Defensive Midfield': 5,
    'Center Midfield': 6,
    'Left Center Midfield': 6,
    'Right Center Midfield': 6,
    'Left Midfield': 6,
    'Right Midfield': 6,
    'Center Attacking Midfield': 7,
    'Left Attacking Midfield': 7,
    'Right Attacking Midfield': 7,
    'Left Wing': 7,
    'Right Wing': 7,
    'Left Center Forward': 8,
    'Right Center Forward': 8,
    'Center Forward': 8,
    
}


In [11]:
all_comps = sb.competitions()
comps = all_comps[all_comps.season_name=="2015/2016"]
comps

Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
1,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2024-05-19T11:11:14.192381,,,2024-05-19T11:11:14.192381
6,16,27,Europe,Champions League,male,False,False,2015/2016,2024-02-12T12:51:14.869531,2021-06-13T16:17:31.694,,2024-02-12T12:51:14.869531
42,11,27,Spain,La Liga,male,False,False,2015/2016,2024-05-16T14:06:52.149840,2021-06-13T16:17:31.694,,2024-05-16T14:06:52.149840
59,7,27,France,Ligue 1,male,False,False,2015/2016,2024-01-07T06:23:44.277971,,,2024-01-07T06:23:44.277971
63,2,27,England,Premier League,male,False,False,2015/2016,2024-01-25T11:45:53.333860,2021-06-13T16:17:31.694,,2024-01-25T11:45:53.333860
65,12,27,Italy,Serie A,male,False,False,2015/2016,2024-03-10T16:14:00.166197,,,2024-03-10T16:14:00.166197


In [12]:
def calculate_polygon_centroid(coords):
    """
        calculate the centre point (centroid) of a group of points

        Inputs:
        ==========================
        coords: coordinates of an event in the form [x,y] 
        Output:
        ==========================
        points.centroid.x: the x label of the centroid
        points.centroid.y: the y label of the centroid
        """
    points = MultiPoint([coord for coord in coords if pd.notnull(coord).all()])
    return points.centroid.x, points.centroid.y

def centroid_function(df,before_events):
     """
        take a dataframe and calculate the centroid of events grouped for each team

        Inputs:
        ==========================
        df: a dataframe on which to calculate the centroids 

        Output:
        ==========================
        team: the team matching this grouping
        centroid_x: the x label of the centroid
        centroid_y: the y label of the centroid
        """
     if len(df)==0:
        teams = pd.Series([before_events.team.unique()[0],before_events.team.unique()[1]])
        centroid_x = pd.Series(["",""])
        centroid_y = pd.Series(["",""])
        centroids = pd.DataFrame([teams,centroid_x,centroid_y]).transpose()
        centroids.columns = ['team', 'centroid_x', 'centroid_y']
        return centroids

     else:    
        centroids = df.groupby('team')['location'].apply(lambda loc: calculate_polygon_centroid(loc.tolist()))#calculate_polygon_centroid(loc.tolist())

        centroids_df = centroids.apply(pd.Series).reset_index()
        centroids_df.columns = ['team', 'centroid_x', 'centroid_y']

    #tools.display_dataframe_to_user(name="Team Polygon Centroid", dataframe=centroids_df)

        return centroids_df

In [13]:
def extract_last_position(position_list):
    """
        take a list of the positions a player has held during a game and extract the last one.

        Inputs:
        ==========================
        position_list: a a list of positions a player has played during a game

        Output:
        ==========================
        last_position: the last position this player played 
        """
    if len(position_list) > 0:
        last_position = position_list[-1]['position']
    else:
        last_position = None
    return last_position


def assign_our_their_stats(df, stats_df, team_col='team'):
    """
        attach a dataframe of statistics to another merged by team, if the teams match those stats appear as "our" otherwise they appear as "their"

        Inputs:
        ==========================
        df: the original summary dataframe
        stats_df: the dataframe of grouped statistics which we need to attach to df
        team_col: The name of the column denoting the team, defaults to team'

        Output:
        ==========================
        df: the original dataframe input
        our_"stat": a column of statistics matching the df.team column 
        their_"stat": a column of statistics of the team not matching the df.team column 
        """
    # Merge the stats data with the events DataFrame
    df = df.merge(stats_df, on=team_col, how='left', suffixes=('', '_stats'))
    
    # Get unique teams
    teams = df[team_col].unique()
    
    # Identify statistic columns (excluding the team column)
    stat_columns = [col for col in stats_df.columns if col != team_col]
    reversed_stats = {team: {} for team in teams}
    # Loop over each team and create 'our' and 'their' columns
    for team in teams:
        for stat in stat_columns:
            reversed_stats[team][f'our_{stat}'] = df[df[team_col] == team][stat].values[0]
            reversed_stats[team][f'their_{stat}'] = df[df[team_col] != team][stat].values[0]
    
    # Assign the reversed stats back to the events DataFrame
    for team in teams:
        for stat in stat_columns:
            df[f'our_{stat}'] = df.apply(lambda row: reversed_stats[row[team_col]][f'our_{stat}'], axis=1)
            df[f'their_{stat}'] = df.apply(lambda row: reversed_stats[row[team_col]][f'their_{stat}'], axis=1)
    
    
    # Drop the intermediate statistic columns if not needed
    df.drop(columns=stat_columns, inplace=True)
    
    return df

In [14]:

#for season in comps.competition_id:
def substition_func(comp_list):
    """
        Find every substitution in the datasets and get statistics around this

        Inputs:
        ==========================
        comp_list: a list of available competitions to loop through

        Output:
        ==========================
        minute: minute of substitution as int
        home_score: home team score at substitution time	
        away_score: away team score at substitution time		
        Time: time of score as string in format MM:SS	
        team: team making substitution
        starter_name: name of the player being taken off	
        starter_pos: position of player being taken off
        sub_name: player coming on	
        sub_pos: position of player coming on	
        match_id: unique match identifier	
        season: unique club season identifier	
        starter_pos_score: number representing how attacking/defensive the starter's position is	
        sub_pos_score: number representing how attacking/defensive the sub's position is	
        substitution_type: is the substitute's position more; defensive  (-1), attacking (1) of neutral (0)
        """
    results = pd.DataFrame()
    matches = sb.matches(competition_id=comp_list, season_id=27) 
    for match in matches.match_id:
        events = sb.events(match_id = match)    
        lineup = sb.lineups(match_id = match)
        home = matches[matches.match_id==match].home_team.values[0]
        home_goal = 0
        away_goal = 0
        home_list = []
        away_list = []
        for index, row in events.iterrows():
            if row.shot_outcome =="Goal":
                if row.team ==home:
                    home_goal+=1
                else:
                    away_goal+=1
            home_list.append(home_goal)
            away_list.append(away_goal)
        events["home_score"]=home_list
        events["away_score"]=away_list
        events["loser"] = np.where(((events.team==home) &(events.home_score<events.away_score))|((events.team!=home) &(events.home_score>events.away_score)),1,0)
        events["Time"] = events.minute.astype("str")+":"+events.second.astype("str")
        for team in events.team.unique():
            df_event = events[(events.team==team) & (events.substitution_outcome=="Tactical")&(events.loser==1)]
            df_lineup = lineup[team]
            df_lineup['last_position'] = df_lineup['positions'].apply(extract_last_position)
            merge_one = df_event[["minute","home_score","away_score","player",'substitution_replacement',"Time","team"]].merge(df_lineup[["player_name","last_position"]], left_on = "player", right_on = "player_name").rename(columns={"last_position":"starter_pos"})
            merge_two = merge_one[["minute","home_score","away_score","Time","team","player_name","starter_pos",'substitution_replacement']].merge(df_lineup[["player_name","last_position"]], right_on = "player_name", left_on = "substitution_replacement").rename(columns={"last_position":"sub_pos","player_name_x":"starter_name","substitution_replacement":"sub_name"}).drop("player_name_y",axis=1)
            merge_two["match_id"] = match
            merge_two["season"] = comp_list
            results = pd.concat([results,merge_two],axis=0)
    results['starter_pos_score'] = results['starter_pos'].map(position_scores)
    results['sub_pos_score'] = results['sub_pos'].map(position_scores)
    sub_type = []
    for index, row in results.iterrows():
        if row['starter_pos_score'] > row['sub_pos_score']:
            sub_type.append(-1)
        elif row['starter_pos_score'] < row['sub_pos_score']:
            sub_type.append(1)
        else:
            sub_type.append(0)
    results["substitution_type"] = sub_type
    return results

In [15]:
def xg_share_func(row):
    """
        for every relevant substitution calculate statistics for x time before the substitution

        Inputs:
        ==========================
        row:one row of the dataframe produced by the substitution_func function

        Output:
        ==========================
        'team': the substituting team
        'home_score': obv 
        'away_score': obv
       'before_xg_share': subbing team's share of xg(our_xg/(our_xg+their_xg)) in the period before the sub
       'our_corner_count': count of subbing team's corners before sub
       'their_corner_count': count of non-subbing team's corners before sub
       'our_foul_count' : count of subbing team's fouls before sub
       'their_foul_count', : count of non-subbing team's fouls before sub
       'our_foul_centroid_x': x val of centroid of subbing teams fouls
       'their_foul_centroid_x': x val of centroid of non-subbing teams fouls
       'our_foul_centroid_y': y val of centroid of subbing teams fouls
       'their_foul_centroid_y': y val of centroid of non-subbing teams fouls
       'our_dispo_count': count of subbing team's being dispossessed before sub
       'their_dispo_count': count of non-subbing team's being dispossessed before sub 
       'our_dispo_centroid_x': x val of centroid of subbing teams dispossessions
       'their_dispo_centroid_x': obv from above
       'our_dispo_centroid_y': obv from above
       'their_dispo_centroid_y': obv from above
       'our_pass_count': count of subbing team passes 
       'their_pass_count': obv from above
       'our_pass_centroid_x': obv from above
       'their_pass_centroid_x': obv from above
       'our_pass_centroid_y': obv from above
       'their_pass_centroid_y': obv from above 
       'our_press_count': count of subbing team presses
       'their_press_count': obv from above
       'our_press_centroid_x': obv from above
       'their_press_centroid_x': obv from above
       'our_press_centroid_y': obv from above
       'their_press_centroid_y': obv from above
       'our_shot_count': count of subbing team shots
       'their_shot_count': obv from above
       'our_pass_length_sum': total length of subbing team's passes in period
         'their_pass_length_sum': total length of non-subbing team's passes in period
       'our_pass_length_mean': average length of subbing team's passes in period
         'their_pass_length_mean': average length of non-subbing team's passes in period
         'our_poss_count': count of subbing team's possessions
       'their_poss_count': obv from above
         'our_possession_percentage': subbing team's possession time over total time of possession
       'their_possession_percentage': non-subbing team's possession time over total time of possession
       'our_total_seconds': subbing team's total time in possession
       'their_total_seconds': non-subbing team's total time in possession
       'Time': time of sub as string in format MM:SS
       'after_xg_share': subbing team's share of xg(our_xg/(our_xg+their_xg)) in the period after the sub
        'xg_change': after_xg_share/before_xg_share
        """
    xg_share_df = pd.DataFrame()
    #for index, row in df.iterrows():
    events =  sb.events(match_id = row.match_id)
    matches = sb.matches(competition_id=row.season, season_id=27) 
    home = matches[matches.match_id==row.match_id].home_team.values[0]
    home_goal = 0
    away_goal = 0
    home_list = []
    away_list = []
    for index, row2 in events.iterrows():
        if row2.shot_outcome =="Goal":
            if row2.team ==home:
                home_goal+=1
            else:
                away_goal+=1
        home_list.append(home_goal)
        away_list.append(away_goal)
    events["home_score"]=home_list
    events["away_score"]=away_list
    events["loser"] = np.where(((events.team==home) &(events.home_score<events.away_score))|((events.team!=home) &(events.home_score>events.away_score)),1,0)
    events["Time"] = events.minute.astype("str")+":"+events.second.astype("str")
    sub_time = row["minute"]
    before_events = events[(events.minute<sub_time) &(events.minute>=(sub_time-15))]
    before = before_events[["home_score","away_score","team","shot_statsbomb_xg"]].groupby(["team"],as_index=False).agg({"shot_statsbomb_xg":"sum","home_score":"max","away_score":"max"})#/events[["shot_statsbomb_xg"]][(events.minute<sub_time) &(events.minute>=(sub_time-10))].sum()
    before["xg_share"] = before["shot_statsbomb_xg"]/before["shot_statsbomb_xg"].sum()
    after = events[["team","shot_statsbomb_xg"]][(events.minute>=sub_time) &(events.minute<=(sub_time+15))].groupby("team",as_index=False).sum()
    after["xg_share"] = after["shot_statsbomb_xg"]/after["shot_statsbomb_xg"].sum()
    #add corners count
    corner_com = before_events[["team","possession"]][(before_events.play_pattern == "From Corner" )&(before_events.type == "Pass" )].groupby("team",as_index = False).nunique().rename(columns={"possession":"corner_count"})
    before = assign_our_their_stats(before, corner_com)
    #add foul count and centroid
    foul_com = before_events[["team","type"]][before_events.type == "Foul Committed"].groupby("team",as_index = False).count().rename(columns={"type":"foul_count"})
    foulo = before_events[before_events.type == "Foul Committed"]
    foul_com_centroid = centroid_function(foulo,before_events)
    before = assign_our_their_stats(before, foul_com)
    before = assign_our_their_stats(before, foul_com_centroid)
    before = before.rename(columns={"our_centroid_x":"our_foul_centroid_x",	"their_centroid_x":"their_foul_centroid_x",	"our_centroid_y":"our_foul_centroid_y",	"their_centroid_y":"their_foul_centroid_y"}) 
    #add dispossessions
    dispos_count = before_events[["team","type"]][before_events.type == "Dispossessed"].groupby("team",as_index = False).count().rename(columns={"type":"dispo_count"})
    dispos = before_events[before_events.type == "Dispossessed"]
    dispos_centroid = centroid_function(dispos,before_events)
    before = assign_our_their_stats(before, dispos_count)
    before = assign_our_their_stats(before, dispos_centroid)
    before = before.rename(columns={"our_centroid_x":"our_dispo_centroid_x",	"their_centroid_x":"their_dispo_centroid_x",	"our_centroid_y":"our_dispo_centroid_y",	"their_centroid_y":"their_dispo_centroid_y"}) 
    #add passes
    pass_count = before_events[["team","type"]][before_events.type == "Pass"].groupby("team",as_index = False).count().rename(columns={"type":"pass_count"})
    passos = before_events[before_events.type == "Pass"]
    pass_centroid = centroid_function(passos,before_events)
    before = assign_our_their_stats(before, pass_count)
    before = assign_our_their_stats(before, pass_centroid)
    before = before.rename(columns={"our_centroid_x":"our_pass_centroid_x",	"their_centroid_x":"their_pass_centroid_x",	"our_centroid_y":"our_pass_centroid_y",	"their_centroid_y":"their_pass_centroid_y"}) 
    #add presses
    press_count = before_events[["team","type"]][before_events.type == "Pressure"].groupby("team",as_index = False).count().rename(columns={"type":"press_count"})
    presso = before_events[before_events.type == "Pressure"]
    press_centroid = centroid_function(presso,before_events)
    before = assign_our_their_stats(before, press_count)
    before = assign_our_their_stats(before, press_centroid)
    before = before.rename(columns={"our_centroid_x":"our_press_centroid_x",	"their_centroid_x":"their_press_centroid_x",	"our_centroid_y":"our_press_centroid_y",	"their_centroid_y":"their_press_centroid_y"}) 
    #add shots
    shot_count = before_events[["team","type"]][before_events.type == "Shot"].groupby("team",as_index = False).count().rename(columns={"type":"shot_count"})
    before = assign_our_their_stats(before, shot_count)
    #add_pass_len
    pass_len_sum = before_events[["team","pass_length"]].groupby("team",as_index = False).sum().rename(columns={"pass_length":"pass_length_sum"})
    pass_len_mean = before_events[["team","pass_length"]].groupby("team",as_index = False).mean().rename(columns={"pass_length":"pass_length_mean"})
    before = assign_our_their_stats(before, pass_len_sum)
    before = assign_our_their_stats(before, pass_len_mean)
    #add possessions
    poss_count = before_events[["possession_team","possession"]].groupby("possession_team",as_index = False).nunique().rename(columns={"possession_team":"team","possession":"poss_count"})
    before = assign_our_their_stats(before, poss_count)
    #poss_time
    poss_time = before_events[["possession_team","possession","timestamp"]].groupby(["possession_team","possession"],as_index = False).agg(time_min=('timestamp', 'min'), time_max=('timestamp', 'max')).rename(columns={"possession_team":"team"})
    poss_time['time_min'] = pd.to_datetime(poss_time['time_min'], format='%H:%M:%S.%f')
    poss_time['time_max'] = pd.to_datetime(poss_time['time_max'], format='%H:%M:%S.%f')
    poss_time["poss_time"] = poss_time["time_max"] - poss_time["time_min"]
    poss_time['poss_age_seconds'] = poss_time['poss_time'].apply(lambda x: f"{int(x.total_seconds() // 60)}:{int(x.total_seconds() % 60):02d}.{x.microseconds // 1000:03d}")
    grouped_df = poss_time.groupby('team')['poss_time'].sum().reset_index()
    grouped_df['total_seconds'] = grouped_df['poss_time'].dt.total_seconds()
    # Calculate the total possession time
    total_possession_time = grouped_df['total_seconds'].sum()
    # Calculate the percentage of total possession time for each team
    grouped_df['possession_percentage'] = (grouped_df['total_seconds'] / total_possession_time) * 100
    # Convert the summed time differences to minutes and seconds
    grouped_df = grouped_df[["team",'possession_percentage','total_seconds']]
    before = assign_our_their_stats(before, grouped_df)
    result = before.rename(columns={"xg_share":"before_xg_share"})
    result["Time"]= row.Time
    result = result.merge(after[["team","xg_share"]], on = "team").rename(columns={"xg_share":"after_xg_share"})
    result["xg_change"]=result["after_xg_share"]/result["before_xg_share"]
    result = result.drop("shot_statsbomb_xg")
    xg_share_df = pd.concat([xg_share_df,result], axis = 0)
    #xg_share_df 
    return xg_share_df


In [16]:
comp_list = comps.competition_id[:3]
asnwer2 = [*map(substition_func, comp_list)]

results = pd.concat(asnwer2, ignore_index=True)
results




Unnamed: 0,minute,home_score,away_score,Time,team,starter_name,starter_pos,sub_name,sub_pos,match_id,season,starter_pos_score,sub_pos_score,substitution_type
0,59,1,3,59:29,Hoffenheim,Pirmin Schwegler,Center Defensive Midfield,Kevin Kuranyi,Right Center Forward,3890561,9,5,8,1
1,67,1,3,67:15,Hoffenheim,Tobias Strobl,Left Defensive Midfield,Eugen Polanski,Left Defensive Midfield,3890561,9,5,5,0
2,73,1,3,73:25,Hoffenheim,Kevin Volland,Left Midfield,Nadiem Amiri,Left Midfield,3890561,9,6,6,0
3,57,1,0,57:41,Eintracht Frankfurt,Haris Seferović,Center Forward,Luc Castaignos,Center Forward,3890505,9,8,8,0
4,63,1,0,63:29,Eintracht Frankfurt,Änis Ben-Hatira,Left Wing,Sonny Kittel,Left Wing,3890505,9,7,7,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1408,67,4,1,67:13,Levante UD,Roger Martí Salvador,Left Center Forward,Deyverson Brum Silva Acosta,Left Center Forward,266490,11,8,8,0
1409,79,4,1,79:20,Levante UD,Víctor Camarasa Ferrando,Right Center Midfield,Rubén García Santos,Right Center Midfield,266490,11,6,6,0
1410,64,6,1,64:21,Celta Vigo,Pedro Pablo Hernández,Left Center Midfield,Marcelo Alfonso Díaz Rojas,Left Defensive Midfield,266254,11,6,5,-1
1411,76,6,1,76:56,Celta Vigo,John Guidetti,Center Forward,Dejan Dražić,Left Midfield,266254,11,8,6,-1


In [17]:
#asnwer3 = [*map(xg_share_func, results)]
xg_thing = results.apply(xg_share_func, axis=1)
#xg_thing = pd.concat(asnwer3, ignore_index=True)
dataframes = list(xg_thing)

# Concatenate the list of DataFrames into one DataFrame
result_df = pd.concat(dataframes, ignore_index=True)




In [21]:
result_df.columns

Index(['team', 'shot_statsbomb_xg', 'home_score', 'away_score',
       'before_xg_share', 'our_corner_count', 'their_corner_count',
       'our_foul_count', 'their_foul_count', 'our_foul_centroid_x',
       'their_foul_centroid_x', 'our_foul_centroid_y', 'their_foul_centroid_y',
       'our_dispo_count', 'their_dispo_count', 'our_dispo_centroid_x',
       'their_dispo_centroid_x', 'our_dispo_centroid_y',
       'their_dispo_centroid_y', 'our_pass_count', 'their_pass_count',
       'our_pass_centroid_x', 'their_pass_centroid_x', 'our_pass_centroid_y',
       'their_pass_centroid_y', 'our_press_count', 'their_press_count',
       'our_press_centroid_x', 'their_press_centroid_x',
       'our_press_centroid_y', 'their_press_centroid_y', 'our_shot_count',
       'their_shot_count', 'our_pass_length_sum', 'their_pass_length_sum',
       'our_pass_length_mean', 'their_pass_length_mean', 'our_poss_count',
       'their_poss_count', 'our_possession_percentage',
       'their_possession_percenta

In [19]:
all_needed = results.merge(result_df, on = ["team","Time","home_score","away_score"])
all_needed.to_csv("outputv1.csv")

NameError: name 'xg_share_df' is not defined

# ADD Features
including:
DONE avg & centroid of lost possession, and for lost dribble and interception/duel
DONE avg foul given and conceded pos and centroid
same for passes
avg pass length
poss count, opp poss count
shot count and xg given and conceded
time in possession or % for period
corners for and against

