In [None]:
# matplotlib.pyplot: base plotting library for figure and axes management
# numpy: numerical operations — used here for array concatenation and mean computation
# mplsoccer.Pitch: renders the pitch in standard horizontal orientation,
#   the conventional layout for pass networks as it mirrors the tactical board view
# mplsoccer.Sbopen: StatsBomb-integrated data parser built into mplsoccer;
#   handles JSON loading, coordinate normalization, and splits events into typed DataFrames
# pandas: DataFrame operations for filtering, groupby, and aggregation
import matplotlib.pyplot as plt
import numpy as np
from mplsoccer import Pitch, Sbopen
import pandas as pd

In [None]:
# Load every event of one match through mplsoccer's StatsBomb parser.
# Sbopen.event() returns four DataFrames, unpacked here as:
#   df       — the main event stream (one row per discrete action with x, y coordinates)
#   related  — links between connected events (e.g., a pass and its reception)
#   freeze   — shot freeze frames: visible player positions at the moment of each shot
#   tactics  — lineup and formation data
# MATCH_ID is a named constant so the match can be swapped in one place;
# 3923881 is the same match used in notebook 3.
MATCH_ID = 3923881
parser = Sbopen()
df, related, freeze, tactics = parser.event(MATCH_ID)

In [None]:
# Team whose pass network is built; every team filter in this cell uses it.
TEAM_NAME = "Côte d'Ivoire"

# Locate the team's first substitution so the network covers the starting eleven only.
# Mixing in substitutes distorts average positions and edge weights because they
# played different minutes.
# `sub` is taken from the "index" COLUMN (the event's order within the match), and the
# pass filter below compares against that same column. Comparing it against `df.index`
# (the row labels) only works while row labels happen to line up with event order.
is_team_sub = (df["type_name"] == "Substitution") & (df["team_name"] == TEAM_NAME)
if is_team_sub.any():
    sub = df.loc[is_team_sub, "index"].min()
else:
    # No substitution recorded for this team: keep the whole match rather than
    # failing on an empty selection.
    sub = df["index"].max() + 1

# Boolean mask isolating the pass events used for the network:
#   type_name == 'Pass'      — pass events only
#   team_name == TEAM_NAME   — the target team
#   index < sub              — events before the first lineup change
#   outcome_name.isnull()    — in StatsBomb's model, incomplete passes have a non-null
#                              outcome (e.g. "Incomplete", "Out"); a null outcome means
#                              the pass was completed.
mask_cdm = (
    (df["type_name"] == "Pass")
    & (df["team_name"] == TEAM_NAME)
    & (df["index"] < sub)
    & df["outcome_name"].isnull()
)

# Keep only the columns needed downstream:
#   x, y                — pass origin
#   end_x, end_y        — pass destination
#   player_name         — passer
#   pass_recipient_name — receiver
# Rows without a passer or recipient cannot form an edge and would otherwise be
# stringified into a bogus player called "nan", so they are dropped.
# .copy() makes df_pass an independent frame so the column assignments below never
# write into (or warn about) a slice of df.
pass_columns = ["x", "y", "end_x", "end_y", "player_name", "pass_recipient_name"]
df_pass = (
    df.loc[mask_cdm, pass_columns]
    .dropna(subset=["player_name", "pass_recipient_name"])
    .copy()
)


def _short_labels(full_names):
    """Map each full name to a short display label that stays unique.

    The label is the last whitespace-delimited token of the name. When two
    different players share that token, both get a first-initial prefix
    ("S. Fofana"); if that still collides, the full name is kept. Without this,
    same-surname players would be merged into a single node of the network.

    Parameters
    ----------
    full_names : iterable of str
        Full player names; duplicates are allowed.

    Returns
    -------
    dict
        {full name: display label}, one entry per distinct full name.
    """
    names = list(dict.fromkeys(full_names))  # distinct names, first-seen order
    tokens = {name: str(name).split() or [str(name)] for name in names}
    last = {name: tokens[name][-1] for name in names}
    initialed = {name: f"{tokens[name][0][:1]}. {last[name]}" for name in names}

    def _tally(values):
        counts = {}
        for value in values:
            counts[value] = counts.get(value, 0) + 1
        return counts

    last_counts = _tally(last.values())
    initialed_counts = _tally(initialed.values())

    labels = {}
    for name in names:
        if last_counts[last[name]] == 1:
            labels[name] = last[name]
        elif initialed_counts[initialed[name]] == 1:
            labels[name] = initialed[name]
        else:
            labels[name] = str(name)
    return labels


# Shorten names to reduce label clutter on the final visualization. Passers and
# recipients share one mapping so the same player always gets the same label.
name_to_label = _short_labels(
    pd.concat([df_pass["player_name"], df_pass["pass_recipient_name"]])
)
df_pass["player_name"] = df_pass["player_name"].map(name_to_label)
df_pass["pass_recipient_name"] = df_pass["pass_recipient_name"].map(name_to_label)

In [None]:
# Build the node table: one row per player with an average position and a size.
#
# Position = centroid of every pass touch the player was involved in:
#   - (x, y)         where the player played a completed pass
#   - (end_x, end_y) where the player received a completed pass
# Using both avoids the bias of pass origins alone (pulls toward defensive zones)
# or receptions alone (pulls toward advanced zones).
#
# Nodes are created for everyone who appears as passer OR recipient. A player who
# only received passes still needs a position, otherwise any edge that touches them
# has no endpoint to draw to.

# Marker area given to the most active passer; everyone else scales proportionally.
MAX_MARKER_SIZE = 1500

# Stack both kinds of touches into one long (player_name, x, y) table.
touches = pd.concat(
    [
        df_pass[["player_name", "x", "y"]],
        df_pass[["pass_recipient_name", "end_x", "end_y"]].rename(
            columns={"pass_recipient_name": "player_name", "end_x": "x", "end_y": "y"}
        ),
    ],
    ignore_index=True,
)

# sort=False keeps players in order of first appearance (passers first).
centroids = touches.groupby("player_name", sort=False)[["x", "y"]].mean()

# Completed passes made by each player — used for node size encoding.
passes_made = df_pass["player_name"].value_counts()

scatter_df = centroids.reset_index()
# Players who only received passes have made none, hence the fill with 0.
scatter_df["no"] = scatter_df["player_name"].map(passes_made).fillna(0)

# Normalize node size relative to the most active passer.
scatter_df["marker_size"] = scatter_df["no"] / scatter_df["no"].max() * MAX_MARKER_SIZE

In [None]:
# Pairs must exchange MORE than this many completed passes to be drawn as an edge.
# One or two passes between players may reflect set pieces or incidental adjacency
# rather than a genuine link; tune per match.
PASS_COUNT_THRESHOLD = 2

# Canonical, direction-free edge key: the two player labels sorted alphabetically and
# joined with "_", so A→B and B→A passes are counted under the same key. The network
# shows connection strength, not directionality.
# A list comprehension is used instead of DataFrame.apply(axis=1) because apply
# returns a DataFrame (not a Series) on an empty frame, which makes the column
# assignment fail when no passes survive the filter.
df_pass["pair_key"] = [
    "_".join(sorted(pair))
    for pair in zip(df_pass["player_name"], df_pass["pass_recipient_name"])
]

# Number of passes per pair. "x" is only the column being counted: count() ignores
# NaN, and any non-null column would give the same totals.
lines_df = (
    df_pass.groupby("pair_key")["x"]
    .count()
    .rename("pass_count")
    .reset_index()
)

# Keep only the pairs above the noise threshold.
lines_df = lines_df[lines_df["pass_count"] > PASS_COUNT_THRESHOLD]

In [None]:
# Line width given to the busiest pair; every other edge scales proportionally.
MAX_LINE_WIDTH = 10

# pitch.grid() creates a figure layout with separate axes for:
#   ax["pitch"]    — the main pitch area
#   ax["title"]    — the title zone above the pitch
#   ax["endnote"]  — the footnote zone below the pitch
# axis=False hides ticks and axis labels; grid_height=0.9 gives the pitch 90% of
# the figure height.
pitch = Pitch(line_color='black')
fig, ax = pitch.grid(
    grid_height=0.9, title_height=0.06, axis=False,
    endnote_height=0.04, title_space=0, endnote_space=0
)

# Player nodes: marker size encodes pass volume, grey edge adds contrast.
pitch.scatter(
    scatter_df.x, scatter_df.y,
    s=scatter_df.marker_size,
    color='red', edgecolors='grey', linewidth=1, alpha=1,
    ax=ax["pitch"]
)

# Label every node and, in the same pass, record its position for the edge loop.
# zorder=5 keeps labels above the connection lines (zorder=2).
node_xy = {}
for node in scatter_df.itertuples(index=False):
    pitch.annotate(
        node.player_name, xy=(node.x, node.y),
        c='black', va='center', ha='center',
        weight='bold', size=16, ax=ax["pitch"], zorder=5
    )
    # setdefault keeps the first row if a label were ever duplicated.
    node_xy.setdefault(node.player_name, (node.x, node.y))

# Pass connection edges between player nodes, with width proportional to
# pass_count / max_pass_count. The maximum is loop-invariant, so it is computed once.
max_pass_count = lines_df["pass_count"].max()

for edge in lines_df.itertuples(index=False):
    # pair_key is "<label>_<label>" with the two labels in alphabetical order.
    player1, player2 = edge.pair_key.split("_")[:2]

    # An edge can only be drawn between two existing nodes; report and skip it
    # instead of aborting the whole figure with an IndexError.
    if player1 not in node_xy or player2 not in node_xy:
        print(f"Skipping edge {edge.pair_key}: no node position for one of its players")
        continue

    player1_x, player1_y = node_xy[player1]
    player2_x, player2_y = node_xy[player2]

    line_width = edge.pass_count / max_pass_count * MAX_LINE_WIDTH
    pitch.lines(
        player1_x, player1_y, player2_x, player2_y,
        alpha=1, lw=line_width, zorder=2, color='red',
        ax=ax["pitch"]
    )

# Title spelled the same way as the team name in the data.
fig.suptitle("Pass network - Côte d'Ivoire", fontsize=30)
plt.show()

## Summary: Pass Network Analysis

### What This Notebook Does

This notebook constructs a pass network graph for Côte d'Ivoire in a single match, using StatsBomb event data processed through the `mplsoccer` `Sbopen` parser. A pass network represents a team's positional structure and inter-player passing relationships as a graph: players are nodes positioned at their average touch location, and edges between them are weighted by the number of completed passes exchanged.

### Key Concepts

- **`Sbopen` parser**: mplsoccer's built-in StatsBomb parser returns four typed DataFrames (`df`, `related`, `freeze`, `tactics`) from a single call. The `df` DataFrame uses normalized column names (`type_name`, `outcome_name`, `team_name`) rather than the raw StatsBomb JSON keys, making filtering more readable.
- **Successful passes only** (`outcome_name.isnull()`): In the StatsBomb model, a pass with a non-null `outcome_name` (e.g. "Incomplete", "Out") is a failed pass. Only passes where `outcome_name` is null are completed. Including failed passes would add noise and misrepresent the team's actual ball circulation.
- **Pre-substitution filter**: Restricting events to those before the team's first substitution (`index < sub`) ensures the network exclusively represents the starting eleven. Including later events would mix in players who were on the pitch for different lengths of time, which distorts both the average positions and the edge weights.
- **Average touch position as node location**: The centroid of all pass initiations and receptions for a player is a standard approximation of their positional role. Using both directions (as passer and as recipient) is more stable than either alone and reflects the player's full area of involvement.
- **Canonical pair key via sorting**: `"_".join(sorted([p1, p2]))` collapses directed edges into undirected ones. This aggregates A→B and B→A together, which is appropriate for measuring the total communication volume between any pair of players.
- **Proportional edge width**: `line_width = pass_count / max_pass_count * 10` encodes relationship strength directly in the visual weight of each edge, making dominant passing lanes immediately identifiable without a legend.

### Data Available

| DataFrame | Content |
|---|---|
| `scatter_df` | Player name, average x/y position, pass count, normalized marker size |
| `lines_df` | Pair key and pass count for all pairs with more than 2 completed passes |
| `df_pass` | Full filtered pass records with origin/destination coordinates and player names |

### Ideas to Extract More Value

- **Directed edges**: Split `pair_key` into separate A→B and B→A edges to identify asymmetric relationships. A high-volume directed edge from a midfielder to a forward with no return edge suggests a target man role and one-way supply dependency.
- **Betweenness centrality**: Apply graph-theoretic centrality measures using `networkx.betweenness_centrality()` on the pass network graph to identify structural connectors. A player with high betweenness lies on the largest share of shortest paths between teammates, so removing or pressing that player disrupts ball circulation the most.
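- **Formation recovery from node positions**: Apply k-means clustering to the `scatter_df` x/y coordinates of the outfield players, where `k` is the number of lines to recover (e.g. `k=3` for defence, midfield and attack). The size of each cluster then reads off the team's actual spatial shape (4-3-3, 4-4-2, etc.), which can be compared against the listed formation in the `tactics` DataFrame.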
- **Multi-match network comparison**: Build pass networks for the same team across multiple matches and measure the Euclidean distance between each player's average position vectors. Low variance signals tactical consistency; high variance indicates positional flexibility or tactical shifts between matches.
- **Progressive pass filtering**: Restrict `df_pass` to passes that advance the ball at least 10 yards toward the opposition goal (or into the final third) before building the network. This produces a "progression network" that reveals which players drive the team's forward play.
- **xT (Expected Threat) edge weighting**: Assign an Expected Threat value to each pass based on its start and end zone and weight edges by cumulative xT rather than raw count. This reframes the network from "who passes most" to "which connections generate the most attacking threat."