In [79]:
from statsbombpy import sb
import pandas as pd
from pandas import json_normalize
import numpy as np
from shapely.geometry import MultiPoint
pd.set_option('display.max_colwidth', None)

In [80]:
position_scores = {
    'Goalkeeper': 1,
    'Right Center Back': 2,
    'Left Center Back': 2,
    'Center Back': 2,
    'Right Back': 3,
    'Left Back': 3,
    'Right Wing Back': 4,
    'Left Wing Back': 4,
    'Center Defensive Midfield': 5,
    'Right Defensive Midfield': 5,
    'Left Defensive Midfield': 5,
    'Center Midfield': 6,
    'Left Center Midfield': 6,
    'Right Center Midfield': 6,
    'Left Midfield': 6,
    'Right Midfield': 6,
    'Center Attacking Midfield': 7,
    'Left Attacking Midfield': 7,
    'Right Attacking Midfield': 7,
    'Left Wing': 7,
    'Right Wing': 7,
    'Left Center Forward': 8,
    'Right Center Forward': 8,
    'Center Forward': 8,
    
}


In [81]:
all_comps = sb.competitions()
comps = all_comps[all_comps.season_name=="2015/2016"]
comps



Unnamed: 0,competition_id,season_id,country_name,competition_name,competition_gender,competition_youth,competition_international,season_name,match_updated,match_updated_360,match_available_360,match_available
1,9,27,Germany,1. Bundesliga,male,False,False,2015/2016,2024-05-19T11:11:14.192381,,,2024-05-19T11:11:14.192381
6,16,27,Europe,Champions League,male,False,False,2015/2016,2024-02-12T12:51:14.869531,2021-06-13T16:17:31.694,,2024-02-12T12:51:14.869531
42,11,27,Spain,La Liga,male,False,False,2015/2016,2024-05-16T14:06:52.149840,2021-06-13T16:17:31.694,,2024-05-16T14:06:52.149840
59,7,27,France,Ligue 1,male,False,False,2015/2016,2024-01-07T06:23:44.277971,,,2024-01-07T06:23:44.277971
63,2,27,England,Premier League,male,False,False,2015/2016,2024-01-25T11:45:53.333860,2021-06-13T16:17:31.694,,2024-01-25T11:45:53.333860
65,12,27,Italy,Serie A,male,False,False,2015/2016,2024-03-10T16:14:00.166197,,,2024-03-10T16:14:00.166197


In [83]:
def calculate_polygon_centroid(coords):
    points = MultiPoint([coord for coord in coords if pd.notnull(coord).all()])
    return points.centroid.x, points.centroid.y

def centroid_function(df):
    if len(dispos)==0:
        teams = pd.Series([before_events.team.unique()[0],before_events.team.unique()[1]])
        centroid_x = pd.Series(["",""])
        centroid_y = pd.Series(["",""])
        centroids = pd.DataFrame([teams,centroid_x,centroid_y]).transpose()
        centroids.columns = ['team', 'centroid_x', 'centroid_y']
        return centroids

    else:    
        centroids = dispos.groupby('team')['location'].apply(lambda loc: calculate_polygon_centroid(loc.tolist()))#calculate_polygon_centroid(loc.tolist())

        centroids_df = centroids.apply(pd.Series).reset_index()
        centroids_df.columns = ['team', 'centroid_x', 'centroid_y']

    #tools.display_dataframe_to_user(name="Team Polygon Centroid", dataframe=centroids_df)

        return centroids_df

In [84]:
def extract_last_position(position_list):
    if len(position_list) > 0:
        last_position = position_list[-1]['position']
    else:
        last_position = None
    return last_position


def assign_our_their_stats(df, stats_df, team_col='team'):
    # Merge the stats data with the events DataFrame
    df = df.merge(stats_df, on=team_col, how='left', suffixes=('', '_stats'))
    
    # Get unique teams
    teams = df[team_col].unique()
    
    # Identify statistic columns (excluding the team column)
    stat_columns = [col for col in stats_df.columns if col != team_col]
    reversed_stats = {team: {} for team in teams}
    # Loop over each team and create 'our' and 'their' columns
    for team in teams:
        for stat in stat_columns:
            reversed_stats[team][f'our_{stat}'] = df[df[team_col] == team][stat].values[0]
            reversed_stats[team][f'their_{stat}'] = df[df[team_col] != team][stat].values[0]
    
    # Assign the reversed stats back to the events DataFrame
    for team in teams:
        for stat in stat_columns:
            df[f'our_{stat}'] = df.apply(lambda row: reversed_stats[row[team_col]][f'our_{stat}'], axis=1)
            df[f'their_{stat}'] = df.apply(lambda row: reversed_stats[row[team_col]][f'their_{stat}'], axis=1)
    
    
    # Drop the intermediate statistic columns if not needed
    df.drop(columns=stat_columns, inplace=True)
    
    return df

In [85]:
results = pd.DataFrame()
xg_share_df = pd.DataFrame()
for season in comps.competition_id:
    matches = sb.matches(competition_id=season, season_id=27) 
    for match in matches.match_id:
        events = sb.events(match_id = match)    
        lineup = sb.lineups(match_id = match)
        home = matches[matches.match_id==match].home_team.values[0]
        home_goal = 0
        away_goal = 0
        home_list = []
        away_list = []
        for index, row in events.iterrows():
            if row.shot_outcome =="Goal":
                if row.team ==home:
                    home_goal+=1
                else:
                    away_goal+=1
            home_list.append(home_goal)
            away_list.append(away_goal)
        events["home_score"]=home_list
        events["away_score"]=away_list
        events["loser"] = np.where(((events.team==home) &(events.home_score<events.away_score))|((events.team!=home) &(events.home_score>events.away_score)),1,0)
        events["Time"] = events.minute.astype("str")+":"+events.second.astype("str")
        for team in events.team.unique():
            df_event = events[(events.team==team) & (events.substitution_outcome=="Tactical")&(events.loser==1)]
            df_lineup = lineup[team]
            df_lineup['last_position'] = df_lineup['positions'].apply(extract_last_position)
            merge_one = df_event[["minute","home_score","away_score","player",'substitution_replacement',"Time","team"]].merge(df_lineup[["player_name","last_position"]], left_on = "player", right_on = "player_name").rename(columns={"last_position":"starter_pos"})
            merge_two = merge_one[["minute","home_score","away_score","Time","team","player_name","starter_pos",'substitution_replacement']].merge(df_lineup[["player_name","last_position"]], right_on = "player_name", left_on = "substitution_replacement").rename(columns={"last_position":"sub_pos","player_name_x":"starter_name","substitution_replacement":"sub_name"}).drop("player_name_y",axis=1)
            merge_two["match_id"] = match
            merge_two["season"] = season
            results = pd.concat([results,merge_two],axis=0)
    results['starter_pos_score'] = results['starter_pos'].map(position_scores)
    results['sub_pos_score'] = results['sub_pos'].map(position_scores)
    sub_type = []
    for index, row in results.iterrows():
        if row['starter_pos_score'] > row['sub_pos_score']:
            sub_type.append(-1)
        elif row['starter_pos_score'] < row['sub_pos_score']:
            sub_type.append(1)
        else:
            sub_type.append(0)
    results["substitution_type"] = sub_type



In [86]:
xg_share_df = pd.DataFrame()
for index, row in results.iterrows():
    events =  sb.events(match_id = row.match_id)
    matches = sb.matches(competition_id=row.season, season_id=27) 
    home = matches[matches.match_id==row.match_id].home_team.values[0]
    home_goal = 0
    away_goal = 0
    home_list = []
    away_list = []
    for index, row2 in events.iterrows():
        if row2.shot_outcome =="Goal":
            if row2.team ==home:
                home_goal+=1
            else:
                away_goal+=1
        home_list.append(home_goal)
        away_list.append(away_goal)
    events["home_score"]=home_list
    events["away_score"]=away_list
    events["loser"] = np.where(((events.team==home) &(events.home_score<events.away_score))|((events.team!=home) &(events.home_score>events.away_score)),1,0)
    events["Time"] = events.minute.astype("str")+":"+events.second.astype("str")
    sub_time = row["minute"]
    before_events = events[(events.minute<sub_time) &(events.minute>=(sub_time-15))]
    before = before_events[["home_score","away_score","team","shot_statsbomb_xg"]].groupby(["team"],as_index=False).agg({"shot_statsbomb_xg":"sum","home_score":"max","away_score":"max"})#/events[["shot_statsbomb_xg"]][(events.minute<sub_time) &(events.minute>=(sub_time-10))].sum()
    before["xg_share"] = before["shot_statsbomb_xg"]/before["shot_statsbomb_xg"].sum()
    after = events[["team","shot_statsbomb_xg"]][(events.minute>=sub_time) &(events.minute<=(sub_time+15))].groupby("team",as_index=False).sum()
    after["xg_share"] = after["shot_statsbomb_xg"]/after["shot_statsbomb_xg"].sum()
    #add corners count
    corner_com = before_events[["team","possession"]][(before_events.play_pattern == "From Corner" )&(before_events.type == "Pass" )].groupby("team",as_index = False).nunique().rename(columns={"possession":"corner_count"})
    before = assign_our_their_stats(before, corner_com)
    #add foul count and centroid
    foul_com = before_events[["team","type"]][before_events.type == "Foul Committed"].groupby("team",as_index = False).count().rename(columns={"type":"foul_count"})
    foulo = before_events[before_events.type == "Foul Committed"]
    foul_com_centroid = centroid_function(foulo)
    before = assign_our_their_stats(before, foul_com)
    before = assign_our_their_stats(before, foul_com_centroid)
    before = before.rename(columns={"our_centroid_x":"our_foul_centroid_x",	"their_centroid_x":"their_foul_centroid_x",	"our_centroid_y":"our_foul_centroid_y",	"their_centroid_y":"their_foul_centroid_y"}) 
    #add dispossessions
    dispos_count = before_events[["team","type"]][before_events.type == "Dispossessed"].groupby("team",as_index = False).count().rename(columns={"type":"dispo_count"})
    dispos = before_events[before_events.type == "Dispossessed"]
    dispos_centroid = centroid_function(dispos)
    before = assign_our_their_stats(before, dispos_count)
    before = assign_our_their_stats(before, dispos_centroid)
    before = before.rename(columns={"our_centroid_x":"our_dispo_centroid_x",	"their_centroid_x":"their_dispo_centroid_x",	"our_centroid_y":"our_dispo_centroid_y",	"their_centroid_y":"their_dispo_centroid_y"}) 
    #add passes
    pass_count = before_events[["team","type"]][before_events.type == "Pass"].groupby("team",as_index = False).count().rename(columns={"type":"pass_count"})
    passos = before_events[before_events.type == "Pass"]
    pass_centroid = centroid_function(passos)
    before = assign_our_their_stats(before, pass_count)
    before = assign_our_their_stats(before, pass_centroid)
    before = before.rename(columns={"our_centroid_x":"our_pass_centroid_x",	"their_centroid_x":"their_pass_centroid_x",	"our_centroid_y":"our_pass_centroid_y",	"their_centroid_y":"their_pass_centroid_y"}) 
    #add presses
    press_count = before_events[["team","type"]][before_events.type == "Pressure"].groupby("team",as_index = False).count().rename(columns={"type":"press_count"})
    presso = before_events[before_events.type == "Pressure"]
    press_centroid = centroid_function(presso)
    before = assign_our_their_stats(before, press_count)
    before = assign_our_their_stats(before, press_centroid)
    before = before.rename(columns={"our_centroid_x":"our_press_centroid_x",	"their_centroid_x":"their_press_centroid_x",	"our_centroid_y":"our_press_centroid_y",	"their_centroid_y":"their_press_centroid_y"}) 
    #add shots
    shot_count = before_events[["team","type"]][before_events.type == "Shot"].groupby("team",as_index = False).count().rename(columns={"type":"shot_count"})
    before = assign_our_their_stats(before, shot_count)
    #add_pass_len
    pass_len_sum = before_events[["team","pass_length"]].groupby("team",as_index = False).sum().rename(columns={"pass_length":"pass_length_sum"})
    pass_len_mean = before_events[["team","pass_length"]].groupby("team",as_index = False).mean().rename(columns={"pass_length":"pass_length_mean"})
    before = assign_our_their_stats(before, pass_len_sum)
    before = assign_our_their_stats(before, pass_len_mean)
    #add possessions
    poss_count = before_events[["possession_team","possession"]].groupby("possession_team",as_index = False).nunique().rename(columns={"possession_team":"team","possession":"poss_count"})
    before = assign_our_their_stats(before, poss_count)
    #poss_time
    poss_time = before_events[["possession_team","possession","timestamp"]].groupby(["possession_team","possession"],as_index = False).agg(time_min=('timestamp', 'min'), time_max=('timestamp', 'max')).rename(columns={"possession_team":"team"})
    poss_time['time_min'] = pd.to_datetime(poss_time['time_min'], format='%H:%M:%S.%f')
    poss_time['time_max'] = pd.to_datetime(poss_time['time_max'], format='%H:%M:%S.%f')
    poss_time["poss_time"] = poss_time["time_max"] - poss_time["time_min"]
    poss_time['poss_age_seconds'] = poss_time['poss_time'].apply(lambda x: f"{int(x.total_seconds() // 60)}:{int(x.total_seconds() % 60):02d}.{x.microseconds // 1000:03d}")
    grouped_df = poss_time.groupby('team')['poss_time'].sum().reset_index()
    grouped_df['total_seconds'] = grouped_df['poss_time'].dt.total_seconds()
    # Calculate the total possession time
    total_possession_time = grouped_df['total_seconds'].sum()
    # Calculate the percentage of total possession time for each team
    grouped_df['possession_percentage'] = (grouped_df['total_seconds'] / total_possession_time) * 100
    # Convert the summed time differences to minutes and seconds
    grouped_df = grouped_df[["team",'possession_percentage','total_seconds']]
    before = assign_our_their_stats(before, grouped_df)
    result = before.rename(columns={"xg_share":"before_xg_share"})
    result["Time"]= row.Time
    result = result.merge(after[["team","xg_share"]], on = "team").rename(columns={"xg_share":"after_xg_share"})
    result["xg_change"]=result["after_xg_share"]/result["before_xg_share"]
    xg_share_df = pd.concat([xg_share_df,result], axis = 0)
xg_share_df




Unnamed: 0,team,shot_statsbomb_xg,home_score,away_score,before_xg_share,our_corner_count,their_corner_count,our_foul_count,their_foul_count,our_foul_centroid_x,...,their_pass_length_mean,our_poss_count,their_poss_count,our_possession_percentage,their_possession_percentage,our_total_seconds,their_total_seconds,Time,after_xg_share,xg_change
0,Hoffenheim,0.241028,1,3,0.300355,2.0,,3.0,1.0,,...,21.171799,17,17,8.769649,91.230351,306.218,3185.575,59:29,0.772377,2.571549
1,Schalke 04,0.561449,1,3,0.699645,,2.0,1.0,3.0,,...,18.315172,17,17,91.230351,8.769649,3185.575,306.218,59:29,0.227623,0.325341
0,Hoffenheim,0.000000,1,3,0.000000,,,3.0,3.0,60.68,...,19.476777,13,15,31.886188,68.113812,214.808,458.863,67:15,1.000000,inf
1,Schalke 04,0.682988,1,3,1.000000,,,3.0,3.0,36.15,...,20.774281,15,13,68.113812,31.886188,458.863,214.808,67:15,0.000000,0.000000
0,Hoffenheim,0.412408,1,3,0.772377,1.0,,1.0,4.0,57.35,...,17.762784,17,13,36.358919,63.641081,233.309,408.374,73:25,0.864171,1.118846
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,Fiorentina,0.061688,2,0,0.570425,,1.0,2.0,2.0,57.0,...,22.787962,17,24,59.546426,40.453574,339.707,230.784,77:42,0.000000,0.000000
0,Bologna,0.322147,2,1,0.768804,1.0,2.0,1.0,2.0,,...,19.988471,17,13,45.886948,54.113052,285.398,336.561,45:0,0.000000,0.000000
1,Lazio,0.096877,2,1,0.231196,2.0,1.0,2.0,1.0,,...,17.319432,13,17,54.113052,45.886948,336.561,285.398,45:0,1.000000,4.325336
0,Bologna,0.029323,2,1,0.091898,,2.0,2.0,3.0,,...,21.054506,15,16,56.111349,43.888651,322.390,252.164,75:2,0.483812,5.264677


In [88]:
all_needed = results.merge(xg_share_df, on = ["team","Time","home_score","away_score"])
all_needed.to_csv("outputv1.csv")

# ADD Features
including:
DONE avg & centroid of lost possession, and for lost dribble and interception/duel
DONE avg foul given and conceded pos and centroid
same for passes
avg pass length
poss count, opp poss count
shot count and xg given and conceded
time in possession or % for period
corners for and against

