In [1]:
import datetime
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import json
from mplsoccer import Pitch, Sbopen
from typing import Literal

### Custom Functions

In [11]:
def get_passing_data(event_data, team_name, sub_idx):
    '''
    Creating the pass data frame from the event data

    INPUT
        event_data: pd.DataFrame with event data
        team_name: str
        sub_idx: int, index that corresponds to the substitution event

    OUTPUT
        pass_data: pd.DataFrame with x,y positions of passer and receiver
        pass_network_df: pd.DataFrame to be used as an edgelist for Graph creation
    '''
    pass_data = (event_data.type_name == "Pass") & (event_data.team_name == team_name) & (event_data.index < sub_idx) & (event_data.outcome_name.isnull()) & (event_data.sub_type_name != "Throw-in")
    pass_data = event_data.loc[pass_data, ["x", "y", "end_x", "end_y", "player_name", "pass_recipient_name"]]
    # Keep player surname only and jersey number
    pass_data["player_name"] = pass_data["player_name"].apply(lambda x: str(x).split()[-1])
    pass_data["pass_recipient_name"] = pass_data["pass_recipient_name"].apply(lambda x: str(x).split()[-1])
    # Passing network data frame
    pass_net_edgelist = (
        pass_data.groupby(["player_name", "pass_recipient_name"])
        .size()
        .reset_index(name = "Intensity")
        .rename(columns = {"player_name": "Passer", "pass_recipient_name": "Receiver"})
    )

    # Create a DiGraph object
    G = nx.from_pandas_edgelist(
        pass_net_edgelist,
        source = "Passer",
        target = "Receiver",
        edge_attr = "Intensity",
        create_using = nx.MultiDiGraph
    )

    # Set average position as nodal attribute
    avg_pos = {}
    for player in G.nodes():
        # Get x, y, endx, endy positions of player only
        player_x = pass_data[pass_data["player_name"] == player]["x"].to_numpy()
        player_y = pass_data[pass_data["player_name"] == player]["y"].to_numpy()
        player_endx = pass_data[pass_data["pass_recipient_name"] == player]["end_x"].to_numpy()
        player_endy = pass_data[pass_data["pass_recipient_name"] == player]["end_y"].to_numpy()
        
        avg_Xpos = float(np.round(np.mean(np.concatenate([player_x, player_endx])), 2))
        avg_Ypos = float(np.round(np.mean(np.concatenate([player_y, player_endy])), 2))

        avg_pos.update({player: [avg_Xpos, avg_Ypos]})

    # Set node attribute: average X and average Y position
    nx.set_node_attributes(G, avg_pos, "Avg Pos")

    # Set edge attribute: Distance defined as the inverse of edge weights, i.e. passes
    distances = {}
    for key, values in nx.get_edge_attributes(G, "Intensity").items():
        distances.update({key: round(1 / values, 4) * 10000})

    nx.set_edge_attributes(G, distances, name = "Distance")
    
    return pass_data, pass_net_edgelist, G

In [5]:
def node_strength(G, direction: Literal["in", "out", None], weight):
    '''
    Calculates the strength (weighted degree) of the nodes in a graph for
    a directed and undirected network.

    INPUT:
        G: networkx graph object
        direction:  "in", "out" or None; specifies the directionality of the network
        weight: str, denotes the variable that acts as an edge weight
        
    OUTPUT:
        -
    '''
    # Get the adjacency matrix and create a dictionary to store strength values
    adj_matrix = nx.to_numpy_array(G, weight = weight)
    strength_values = {node: None for node in G.nodes()}

    # Calculation of strength according to direction. Adjacency matrix is read 
    # from Left to Right
    for idx, node in enumerate(G.nodes()):
        if direction == "in":
            strength_values[node] = int(sum(adj_matrix[:, idx]))
            name = "In-Strength"
        elif direction == "out":
            strength_values[node] = int(sum(adj_matrix[idx]))
            name = "Out-Strength"
        else:
            strength_values[node] = int(sum(adj_matrix[:, idx]) + sum(adj_matrix[idx]))
            name = "Strength"
            
    # Store strength values as node attribute
    nx.set_node_attributes(G, strength_values, name = name)

In [39]:
def distance_centralities(G, centrality_name, distance_attribute):
    '''
    Calculate distance-related centralities like Betweenness, Harmonic Closeness

    INPUT:
        G: networkx Graph object
        centrality_name: str
        distance_attribute: str, specify the name of the edge attribute used as distance

    OUTPUT:
        -
    '''
    if centrality_name == "Betweenness":
        centrality_dict = nx.betweenness_centrality(G, normalized = True, weight = distance_attribute)
    elif centrality_name == "In-Harmonic":
        centrality_dict = nx.harmonic_centrality(G, distance = distance_attribute)
    elif centrality_name == "Out-Harmonic":
        Grev = G.reverse(copy = True)
        centrality_dict = nx.harmonic_centrality(Grev, distance = distance_attribute)

    # Up to four (4) decimals
    for key, value in centrality_dict.items():
        centrality_dict[key] = round(value, 4)
        
    # Store strength values as node attribute
    nx.set_node_attributes(G, centrality_dict, name = centrality_name)

### Data Preparation

In [4]:
# Read config json file and define parameters
with open("config.json", "r") as file:
    params = json.load(file)

# Load data and get match data
parser, match_data = get_match_data(**params)

# Get match info
match_info_dict, lineups = get_match_info(match_data, 3942819)

In [5]:
# Match Event Data
event_df, related, freeze, tactics = parser.event(3942819)

In [13]:
# Get the indices of substitution events (np.array)
subs_hteam = event_df[(event_df["type_name"] == "Substitution") & (event_df["team_name"] == match_info_dict["Home Team"])]["index"].values
subs_ateam = event_df[(event_df["type_name"] == "Substitution") & (event_df["team_name"] == match_info_dict["Away Team"])]["index"].values

In [14]:
x = phase_of_play(event_df, match_info_dict["Home Team"])
phase_idx = [_ for _ in range(1, len(x)+1)]

In [22]:
np.round(1.2343, 2)

1.23

In [8]:
x1 = np.array([1, 2, 3])
x2 = np.array([3, 4, 5])
y1 = np.array([3, 4, 5])
y2 = np.array([1, 2, 3])

deltax = x2 - x1
deltay = y2 - y1

In [9]:
np.hypot(deltax, deltay)

array([2.82842712, 2.82842712, 2.82842712])

In [20]:
def cosine_pass_vector(pass_data):
    '''
    A function that calculates the cosine of the angle of a pass

    INPUT:
        pass_data: pd.DataFrame with passing data
        x_start: str, column name for the x-position of passer
        x_end: str, column name for the x-position of recipient
        y_start: str, column name for the y-position of passer
        y_end: str, column name for the position of recipient

    OUTPUT:
        pass_data: pd.DataFrame with a "cosine" column
    '''
    # Get x, y coordinates
    x_start, y_start = np.array(pass_data["x"].to_list()), np.array(pass_data["y"].to_list())
    x_end, y_end = np.array(pass_data["end_x"].to_list()), np.array(pass_data["end_y"].to_list())

    delta_x = x_end - x_start
    delta_y = y_end - y_start

    # Euclidean norm
    norm = np.hypot(delta_x, delta_y)

    # Create the cosine column in the pass data frame
    pass_data["cosine"] = np.round(np.divide(delta_x, norm), 4)

    return pass_data 

pass_data = cosine_pass_vector(pass_xy_data)

In [14]:
# Create pass data frames and Graph object
pass_xy_data, pass_net_edgelist, pass_graph = get_passing_data(event_df, match_info_dict["Home Team"], subs_hteam[0])

### Network Analysis

In [40]:
# Calculate node centralities
node_strength(pass_graph, "in", "Intensity")
node_strength(pass_graph, "out", "Intensity")
node_strength(pass_graph, None, "Intensity")
distance_centralities(pass_graph, "Betweenness", "Distance")
distance_centralities(pass_graph, "In-Harmonic", "Distance")
distance_centralities(pass_graph, "Out-Harmonic", "Distance")

In [45]:
# TEST 1: Loading data and creating the Graph object
print("TEST 1: Network Creation")
print(pass_graph.number_of_nodes())
print(pass_net_edgelist.shape[0] == pass_graph.number_of_edges())

# TEST 2: Check node and edge attributes
print("TEST 2: Initial Node and Edge Attributes")
print(pass_graph.nodes["Dijk"]["Avg Pos"])
pass_graph.get_edge_data("Dijk", "Depay")

# TEST 3:
print(pass_graph.nodes["Simons"])

TEST 1: Network Creation
11
True
TEST 2: Initial Node and Edge Attributes
[34.28, 37.4]
{'Avg Pos': [64.29, 69.5], 'In-Strength': 7, 'Out-Strength': 7, 'Strength': 14, 'Betweenness': 0.0, 'In-Harmonic': 0.0013, 'Out-Harmonic': 0.0011}


### Visualization & Statistics

In [58]:
np.unique(match_data["competition_stage_name"].values)

array(['Final', 'Group Stage', 'Quarter-finals', 'Round of 16',
       'Semi-finals'], dtype=object)

In [44]:
import datetime

py_dt = datetime.datetime.utcfromtimestamp(x.astype('datetime64[s]').astype(int))

# Format to friendly string
friendly_str = py_dt.strftime("%B %d, %Y")  # e.g., "July 10, 2024"

print(type(friendly_str))

<class 'str'>


In [8]:
match_data[match_data["competition_stage_name"] == "Semi-finals"]

Unnamed: 0,match_id,match_date,kick_off,home_score,away_score,match_status,match_status_360,last_updated,last_updated_360,match_week,...,competition_stage_id,competition_stage_name,stadium_id,stadium_name,stadium_country_id,stadium_country_name,referee_id,referee_name,referee_country_id,referee_country_name
0,3942819,2024-07-10,2024-07-10 22:00:00,1,2,available,available,2024-07-15 07:54:23.583297,2024-07-15 07:57:36.684453,6,...,15,Semi-finals,373,Signal-Iduna-Park,85,Germany,241,Felix Zwayer,85,Germany
2,3942752,2024-07-09,2024-07-09 22:00:00,2,1,available,available,2024-07-10 08:51:07.032303,2024-07-10 13:01:26.597509,6,...,15,Semi-finals,4867,Allianz Arena,85,Germany,943,Slavko Vinčić,208,Slovenia


In [7]:
subs_hteam

array([1101, 1701, 3395, 3396])

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
3480    NaN
3481    NaN
3482    NaN
3483    NaN
3484    NaN
Name: substitution_replacement_name, Length: 3485, dtype: object

In [16]:
pass_data = (event_df.type_name == "Pass") & (event_df.team_name == match_info_dict["Home Team"]) & (event_df.index < subs_hteam[0]) & (event_df.outcome_name.isnull()) & (event_df.sub_type_name != "Throw-in")
#pass_data = event_df.loc[pass_data, ["x", "y", "end_x", "end_y", "player_name", "pass_recipient_name"]]

In [29]:
event_df[event_df["type_name"] == "Substitution"]["sub_type_name"]

1100    NaN
1699    NaN
1700    NaN
3014    NaN
3015    NaN
3392    NaN
3393    NaN
3394    NaN
3395    NaN
Name: sub_type_name, dtype: object

In [15]:
phase_idx

[1, 2, 3]

In [21]:
pass_data.head(1)

Unnamed: 0,x,y,end_x,end_y,player_name,pass_recipient_name,cosine
22,50.1,16.3,37.1,24.1,Aké,Dijk,-0.8575


In [19]:
(37.1 - 50.1) /np.sqrt((37.1 - 50.1)**2 + (24.1 - 16.3)**2)

-0.8574929257125442

In [22]:
lineups.head(5)

Unnamed: 0,player_id,player_name,player_nickname,jersey_number,match_id,team_id,team_name,country_id,country_name
0,2988,Memphis Depay,Memphis Depay,10,3942819,941,Netherlands,160,Netherlands
1,3306,Nathan Aké,Nathan Aké,5,3942819,941,Netherlands,160,Netherlands
2,3311,Daley Blind,Daley Blind,17,3942819,941,Netherlands,160,Netherlands
3,3567,Georginio Wijnaldum,Georginio Wijnaldum,8,3942819,941,Netherlands,160,Netherlands
4,3669,Virgil van Dijk,Virgil van Dijk,4,3942819,941,Netherlands,160,Netherlands


In [29]:
def get_passing_data(event_data, lineup, team_name, phase_idx, subs_id):
    '''
    Creating the pass data frame from the event data for the selected phase

    INPUT
        event_data: pd.DataFrame with event data
        lineup: pd.DataFrame with lineups to obtain jersey numbers
        team_name: str
        phase_idx: int, index that corresponds to the phase selected by user
        subs_id: array, array of indices corresponding to substitution events

    OUTPUT
        pass_data: pd.DataFrame with x,y positions of passer and receiver
    '''
    if len(subs_id) == 0: # case of zero substitutions
        start_idx, end_idx = 0, len(event_data)
    else:
        start_idx = 0 if phase_idx == 0 else subs_id[phase_idx - 1]
        end_idx = subs_id[phase_idx] if phase_idx < len(subs_id) else len(event_data)

    mask = (
        (event_data.type_name == "Pass") &
        (event_data.team_name == team_name) &
        (event_data.outcome_name.isnull()) &
        (event_data.sub_type_name != "Throw-in")
    )

    pass_data = event_data.loc[start_idx: end_idx, :].loc[mask, ["x", "y", "end_x", "end_y", "player_id", "player_name", "pass_recipient_id", "pass_recipient_name"]]

    # Keep player surname only
    pass_data["player_name"] = pass_data["player_name"].apply(lambda x: str(x).split()[-1])
    pass_data["pass_recipient_name"] = pass_data["pass_recipient_name"].apply(lambda x: str(x).split()[-1])
    
    # Assign jersey number to passer according to lineups data frame
    pass_data = pass_data.merge(
        lineup[["player_id", "jersey_number"]],
        on = "player_id",
        how = "left"
    ).rename(columns = {"jersey_number": "player_name_jersey"})

    # Assign jersey number to recipient according to lineups data frame
    pass_data = pass_data.merge(
        lineup[["player_id", "jersey_number"]],
        left_on = "pass_recipient_id",
        right_on = "player_id",
        how = "left",
        suffixes = ("", "_recipient")
    ).rename(columns = {"jersey_number": "pass_recipient_jersey"})

    pass_data = pass_data.drop(columns = ["player_id_recipient"])

    return pass_data

In [30]:
passes = get_passing_data(event_df, lineups, match_info_dict["Home Team"], 0, np.array([1101, 1701, 3396]))

In [31]:
passes.shape

(92, 10)

In [2]:
pitch = Pitch(pitch_type = "statsbomb")

In [7]:
vars(pitch)

{'pitch_type': 'statsbomb',
 'half': False,
 'pitch_color': 'white',
 'line_color': '#b0b0b0',
 'linewidth': 2,
 'linestyle': None,
 'spot_scale': 0.002,
 'spot_type': 'circle',
 'line_zorder': 0.9,
 'stripe': False,
 'stripe_color': '#c2d59d',
 'stripe_zorder': 0.6,
 'pad_left': 4,
 'pad_right': 4,
 'pad_bottom': 4,
 'pad_top': 4,
 'positional': False,
 'positional_zorder': 0.8,
 'positional_linewidth': 2,
 'positional_linestyle': None,
 'positional_color': '#eadddd',
 'positional_alpha': 1,
 'shade_middle': False,
 'shade_color': '#f2f2f2',
 'shade_alpha': 1,
 'shade_zorder': 0.7,
 'pitch_length': None,
 'pitch_width': None,
 'goal_type': 'line',
 'goal_alpha': 1,
 'goal_linestyle': None,
 'line_alpha': 1,
 'axis': False,
 'label': False,
 'tick': False,
 'corner_arcs': False,
 'diameter1': 20.0,
 'diameter2': 20.0,
 'diameter_spot1': 0.48,
 'diameter_spot2': 0.48,
 'diameter_corner1': 2.186,
 'diameter_corner2': 2.186,
 'arc1_theta1': -53.05,
 'arc1_theta2': 53.05,
 'arc2_theta1': 1

In [10]:
type(params)

dict