In [1]:
# Import libraries
import datetime
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import json
from mplsoccer import Pitch, Sbopen
from typing import Literal

In [2]:
# Add the project's src directory to the module search path
import sys
sys.path.append("../src")

In [3]:
# Import custom functions
import load_data as ld

## Load data

In [4]:
# Read config json file and define parameters
# JSON files are UTF-8 encoded, so decode explicitly instead of relying on the
# platform's default text encoding
with open("../config.json", "r", encoding="utf-8") as file:
    params = json.load(file)

# Load data and get match data
parser, match_data = ld.get_match_data(params["competition"], params["season"])

# Get match info
# NOTE(review): the match id is hard-coded here and again in the event-loading
# cell; consider moving it to config.json so both stay in sync
match_info_dict, lineups = ld.get_match_info(parser, match_data, 3942819)

In [5]:
# TEST 1: Match summary dictionary followed by the first two lineup rows
print(match_info_dict, lineups.head(2), sep="\n")

{'Home Team': 'Netherlands', 'Away Team': 'England', 'Final Score': '1-2', 'Match Date': 'July 10, 2024', 'Stage': 'Semi-finals', 'Stadium': 'Signal-Iduna-Park'}
   player_id    player_name player_nickname  jersey_number  match_id  team_id  \
0       2988  Memphis Depay   Memphis Depay             10   3942819      941   
1       3306     Nathan Aké      Nathan Aké              5   3942819      941   

     team_name  country_id country_name  
0  Netherlands         160  Netherlands  
1  Netherlands         160  Netherlands  


In [6]:
# Get match event data & phases of play
# parser.event returns four objects; only event_df is used by the cells shown
# in this notebook
event_df, related, freeze, tactics = parser.event(3942819)
# presumably the event indices of the home team's substitutions (the same
# values are passed as subs_id further down) — confirm against ld.phase_of_play
phase_bounds = ld.phase_of_play(event_df, match_info_dict["Home Team"])

In [7]:
# TEST 2: Show phases of play bounds
print(phase_bounds)

[1101, 1701, 3396]


In [8]:
# Obtain pass data for a specific phase of play (phase index 1) and add the
# cosine of each pass vector as a new column
passes = ld.get_passing_data(event_df, lineups, match_info_dict["Home Team"], 1, phase_bounds)
passes = ld.cosine_pass_vector(passes)

In [9]:
# TEST 3: Show passes for the Home Team after the first substitution
passes.head(2)

Unnamed: 0,x,y,end_x,end_y,pass_length,player_id,player_name,pass_recipient_id,pass_recipient_name,player_name_jersey,pass_recipient_jersey,cosine
0,47.8,66.3,65.4,55.1,19.075708,15582.0,Malen,20750.0,Gakpo,18,11,0.843661
1,4.6,42.0,30.9,24.2,29.03893,37274.0,Verbruggen,21582.0,Reijnders,1,14,0.828154


In [10]:
# Divide pitch and get filtered data for a specific region
# first_bound / second_bound are the x-coordinates where the defensive and the
# midfield third end (40.0 and 80.0 in the recorded output below)
first_bound, second_bound, mid_region_passes = ld.region_pass_filter(passes, params, "statsbomb", "mid")

# TEST 4: Show the two region bounds (with their types) and the first passes
# originating from midfield
print(f"End of defensive third: {first_bound} | Var. Type: {type(first_bound)}")
print(f"End of midfield third: {second_bound} | Var. Type: {type(second_bound)}")
print(mid_region_passes.head(2))

End of defensive third: 40.0 | Var. Type: <class 'numpy.float64'>
End of midfield third: 80.0 | Var. Type: <class 'numpy.float64'>
       x     y  end_x  end_y  pass_length  player_id player_name  \
0   47.8  66.3   65.4   55.1    19.075708    15582.0       Malen   
10  43.5  34.3   38.8   44.1     9.938397    15582.0       Malen   

    pass_recipient_id pass_recipient_name  player_name_jersey  \
0             20750.0               Gakpo                  18   
10            39167.0              Simons                  18   

    pass_recipient_jersey    cosine  
0                      11  0.843661  
10                      7 -0.432432  


In [17]:
# Keep only the forward passes among those that originated in midfield
fwd_mid_passes = ld.direction_pass_filter(mid_region_passes, "fwd")
# From those, keep the progressive passes from midfield to the attacking third
prog_m2a_passes = ld.progressive_passes(fwd_mid_passes, "m2a", first_bound, second_bound)

# TEST 5: Compare the number of passes left after each filtering stage
print(
    f"Overall passes: {len(passes)}",
    f"Passes originating from midfield: {len(mid_region_passes)}",
    f"Forward passes originating from midfield: {len(fwd_mid_passes)} | "
    f"Progressive passes to attacking third from midfield: {len(prog_m2a_passes)}",
    sep="\n",
)

Overall passes: 74
Passes originating from midfield: 38
Forward passes originating from midfield: 9 | Progressive passes to attacking third from midfield: 3


### Network Analysis

In [None]:
# Calculate node centralities
# NOTE(review): node_strength, distance_centralities and pass_graph are not
# defined or imported in any cell shown above, so this cell raises NameError
# after Restart & Run All — import them or add the graph-building cell first
# Node strength for incoming, outgoing and (with None) unspecified direction,
# using the "Intensity" attribute
node_strength(pass_graph, "in", "Intensity")
node_strength(pass_graph, "out", "Intensity")
node_strength(pass_graph, None, "Intensity")
# Centralities computed with the "Distance" attribute
distance_centralities(pass_graph, "Betweenness", "Distance")
distance_centralities(pass_graph, "In-Harmonic", "Distance")
distance_centralities(pass_graph, "Out-Harmonic", "Distance")

In [None]:
# TEST 1: Loading data and creating the Graph object
# The edge list is expected to have exactly one row per graph edge
print("TEST 1: Network Creation")
print(pass_graph.number_of_nodes())
print(pass_net_edgelist.shape[0] == pass_graph.number_of_edges())

# TEST 2: Check node and edge attributes
print("TEST 2: Initial Node and Edge Attributes")
print(pass_graph.nodes["Dijk"]["Avg Pos"])
# Printed explicitly: a bare expression is only displayed when it is the last
# statement of a cell, so the edge data was previously discarded
print(pass_graph.get_edge_data("Dijk", "Depay"))

# TEST 3: Show every attribute stored on a single node
print(pass_graph.nodes["Simons"])

### Visualization & Statistics

In [None]:
# Boolean mask over event_df: home-team passes with no recorded outcome,
# excluding throw-ins, that happen before the first phase bound.
# phase_bounds replaces the former `subs_hteam`, which is not defined in any
# cell of this notebook and raised NameError on a fresh kernel.
# NOTE: despite its name, pass_data holds the mask itself, not the filtered rows.
pass_data = (
    (event_df.type_name == "Pass")
    & (event_df.team_name == match_info_dict["Home Team"])
    & (event_df.index < phase_bounds[0])
    & (event_df.outcome_name.isnull())
    & (event_df.sub_type_name != "Throw-in")
)
#pass_data = event_df.loc[pass_data, ["x", "y", "end_x", "end_y", "player_name", "pass_recipient_name"]]

In [None]:
# NOTE(review): phase_idx is only a parameter name of get_passing_data; no
# cell shown in this notebook assigns it at top level, so this raises
# NameError on a fresh kernel — define it or remove this scratch cell
phase_idx

In [None]:
pass_data.head(1)

In [None]:
# Manual check: x-displacement divided by the Euclidean length of the vector
# (dx, dy), i.e. the cosine of its angle with the x-axis.
# NOTE(review): the coordinates look hand-copied from a single pass — confirm
# which one, or replace with a lookup into the passes frame
(37.1 - 50.1) /np.sqrt((37.1 - 50.1)**2 + (24.1 - 16.3)**2)

In [None]:
lineups.head(5)

In [None]:
def get_passing_data(event_data, lineup, team_name, phase_idx, subs_id):
    '''
    Creating the pass data frame from the event data for the selected phase

    Phases are delimited by the substitution events in subs_id: phase 0 runs
    from the first event to the first substitution, phase k from substitution
    k-1 to substitution k, and the last phase from the final substitution to
    the end of the event data. Only passes of team_name with no recorded
    outcome are kept, and throw-ins are excluded.

    NOTE(review): an `ld.get_passing_data` with the same name is called earlier
    in this notebook; consider keeping a single implementation.

    INPUT
        event_data: pd.DataFrame with event data; it is sliced by index label,
            so its index is assumed to use the same labels as subs_id
        lineup: pd.DataFrame with lineups to obtain jersey numbers
            (columns "player_id" and "jersey_number" are used)
        team_name: str
        phase_idx: int, index that corresponds to the phase selected by user;
            must satisfy 0 <= phase_idx <= len(subs_id) when subs_id is not empty
        subs_id: array, array of indices corresponding to substitution events

    OUTPUT
        pass_data: pd.DataFrame with x,y positions of passer and receiver,
            their ids, surnames and jersey numbers, one row per pass

    RAISES
        IndexError: if subs_id is not empty and phase_idx is outside
            0..len(subs_id) (a negative value used to wrap around silently
            and select the wrong phase)
    '''
    def _surname(full_name):
        # Last whitespace-separated token of the name; a blank name stays blank
        # instead of raising IndexError on an empty split
        tokens = str(full_name).split()
        return tokens[-1] if tokens else ""

    n_subs = len(subs_id)
    if n_subs == 0: # case of zero substitutions
        start_idx, end_idx = 0, len(event_data)
    else:
        if not 0 <= phase_idx <= n_subs:
            raise IndexError(
                f"phase_idx must be between 0 and {n_subs}, got {phase_idx}"
            )
        start_idx = 0 if phase_idx == 0 else subs_id[phase_idx - 1]
        end_idx = subs_id[phase_idx] if phase_idx < n_subs else len(event_data)

    # .loc slices by label and includes both end points
    phase_events = event_data.loc[start_idx: end_idx, :]

    mask = (
        (phase_events.type_name == "Pass") &
        (phase_events.team_name == team_name) &
        (phase_events.outcome_name.isnull()) &
        (phase_events.sub_type_name != "Throw-in")
    )

    columns = ["x", "y", "end_x", "end_y", "player_id", "player_name", "pass_recipient_id", "pass_recipient_name"]
    # Explicit copy so the column assignments below never write into a view of event_data
    pass_data = phase_events.loc[mask, columns].copy()

    # Keep player surname only
    pass_data["player_name"] = pass_data["player_name"].apply(_surname)
    pass_data["pass_recipient_name"] = pass_data["pass_recipient_name"].apply(_surname)

    # One jersey number per player: a repeated player_id in the lineup would
    # otherwise duplicate pass rows in the merges below
    jerseys = lineup[["player_id", "jersey_number"]].drop_duplicates(subset = "player_id")

    # Assign jersey number to passer according to lineups data frame
    pass_data = pass_data.merge(
        jerseys,
        on = "player_id",
        how = "left"
    ).rename(columns = {"jersey_number": "player_name_jersey"})

    # Assign jersey number to recipient according to lineups data frame
    pass_data = pass_data.merge(
        jerseys,
        left_on = "pass_recipient_id",
        right_on = "player_id",
        how = "left",
        suffixes = ("", "_recipient")
    ).rename(columns = {"jersey_number": "pass_recipient_jersey"})

    # The recipient merge brings in a second id column that duplicates pass_recipient_id
    pass_data = pass_data.drop(columns = ["player_id_recipient"])

    return pass_data

In [None]:
# Run the notebook-local get_passing_data for the first phase (index 0),
# reusing the phase bounds computed earlier instead of re-typing them by hand.
# NOTE: this overwrites the `passes` frame built earlier with ld.get_passing_data
passes = get_passing_data(event_df, lineups, match_info_dict["Home Team"], 0, phase_bounds)

In [None]:
passes.shape

In [None]:
pitch = Pitch(pitch_type = "statsbomb")

In [None]:
vars(pitch)

In [None]:
type(params)