# **Network Creation from Football Pass Data**

## **Objective**
Create passing networks from football event data to analyze team performance and player interaction patterns.

## **Steps**
1. Load the processed event data and keep only the pass events
2. Create passing networks for each team in each match
3. Calculate comprehensive network metrics
4. Export network data for analysis

## **Output**
- Network summary with metrics for each team/match
- Network edges (pass connections between players)
- Player positions (each player's mean pass-event location per match)

## **1. Environment Setup**

In [1]:
import warnings
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Any

import numpy as np
import pandas as pd
import networkx as nx
from networkx.exception import AmbiguousSolution
from networkx.algorithms import community as nx_comm

warnings.filterwarnings('ignore')

## **2. Data Loading**

In [2]:
# Data directories, given relative to the current working directory.
DATA_PATH = Path("../data")
PROCESSED_DATA_PATH = DATA_PATH / "processed"

# Read the full processed event table into memory and report its size and
# column layout so later cells can be checked against the schema.
print("Loading processed event data...")
events_df = pd.read_parquet(PROCESSED_DATA_PATH / "events_processed.parquet")
print(f"✓ Data loaded: {len(events_df):,} records")
print(f"\nAvailable columns: {events_df.columns.tolist()}")

Loading processed event data...
✓ Data loaded: 7,209,091 records

Available columns: ['match_id', 'period', 'index', 'timestamp', 'type', 'team', 'team_id', 'player', 'player_id', 'pass_outcome', 'pass_recipient', 'pass_recipient_id', 'location_x', 'location_y', 'home_or_away', 'home_abbrev_name', 'away_abbrev_name', 'home_goals', 'away_goals', 'score_momentum', 'game_state', 'scoresheet', 'score_final', 'final_result']


## **3. Event Filtering**

In [3]:
# Event types (values of the `type` column) that are kept for the networks.
PASS_EVENTS = ['Pass']

# Keep only the rows whose type is listed above; .copy() gives an independent
# frame instead of a view on events_df.
pass_events = events_df[events_df['type'].isin(PASS_EVENTS)].copy()

# NOTE(review): the message says "On-ball events" although the filter keeps
# only the types in PASS_EVENTS - consider rewording.
print(f"✓ On-ball events filtered: {len(pass_events):,} records")
print(f"  Percentage of total: {len(pass_events)/len(events_df)*100:.1f}%")
print("\nEvent type distribution:")
print(pass_events['type'].value_counts())

✓ On-ball events filtered: 2,016,542 records
  Percentage of total: 28.0%

Event type distribution:
type
Pass    2016542
Name: count, dtype: int64


## **4. Network Construction Functions**

In [4]:
def calculate_player_positions(pass_events: pd.DataFrame) -> pd.DataFrame:
    """Average the event coordinates of every player in every match.

    Rows are grouped by match, team and player. ``location_x`` and
    ``location_y`` are averaged; the descriptive match columns take the first
    non-missing value found in each group.

    Args:
        pass_events: Event rows holding the grouping, coordinate and
            descriptive columns referenced below.

    Returns:
        One row per (match_id, team, player_id, player), with the grouping
        keys restored as ordinary columns.
    """
    group_keys = ['match_id', 'team', 'player_id', 'player']
    carried_columns = [
        'team_id',
        'home_or_away',
        'home_abbrev_name',
        'away_abbrev_name',
        'score_final',
        'final_result',
    ]

    # Coordinates are averaged, everything else is simply carried along.
    aggregations = {'location_x': 'mean', 'location_y': 'mean'}
    aggregations.update({column: 'first' for column in carried_columns})

    per_player = pass_events.groupby(group_keys).agg(aggregations)
    return per_player.reset_index()


def create_network_edges(pass_events: pd.DataFrame) -> pd.DataFrame:
    """Turn pass events into weighted, directed passer -> recipient edges.

    A pass is counted only when both ``player_id`` and ``pass_recipient_id``
    are present and differ. For every (match, team) the weight of an edge is
    the share of the passer's counted passes that went to that recipient, so
    the outgoing weights of each passer sum to 1.

    Args:
        pass_events: Pass rows with ``match_id``, ``team``, ``player_id`` and
            ``pass_recipient_id`` columns.

    Returns:
        DataFrame with columns ``match_id``, ``team``, ``source_id``,
        ``target_id`` and ``weight``. Rows are ordered by (match_id, team)
        and, inside a team, by the first appearance of each player pair.
        The columns are present even when no pass qualifies, so callers can
        always filter on them.
    """
    columns = ['match_id', 'team', 'source_id', 'target_id', 'weight']

    has_both_players = (
        pass_events['player_id'].notna()
        & pass_events['pass_recipient_id'].notna()
    )
    is_not_self_pass = pass_events['player_id'] != pass_events['pass_recipient_id']
    valid_passes = pass_events[has_both_players & is_not_self_pass]
    if valid_passes.empty:
        # Keep the schema so downstream filters on match_id/team still work.
        return pd.DataFrame(columns=columns)

    # Count passes per ordered player pair in one grouped pass instead of
    # iterating row by row; sort=False keeps first-appearance order.
    pair_counts = (
        valid_passes
        .groupby(
            ['match_id', 'team', 'player_id', 'pass_recipient_id'],
            sort=False,
            observed=True,
        )
        .size()
        .reset_index(name='count')
    )
    if pair_counts.empty:
        # Every remaining row had a missing match_id or team.
        return pd.DataFrame(columns=columns)

    passes_by_source = pair_counts.groupby(
        ['match_id', 'team', 'player_id'], sort=False, observed=True
    )['count'].transform('sum')
    pair_counts['weight'] = pair_counts['count'] / passes_by_source

    edges = pair_counts.rename(
        columns={'player_id': 'source_id', 'pass_recipient_id': 'target_id'}
    )
    # Order teams as a sorted groupby would, leaving the pair order inside
    # each team untouched.
    edges = edges.sort_values(['match_id', 'team'], kind='mergesort')
    return edges[columns].reset_index(drop=True)

## **5. Network Metrics Calculation**

In [14]:
def _split_by_team(frame: pd.DataFrame) -> dict:
    """Index *frame* by (match_id, team) in a single pass over its rows."""
    team_keys = ['match_id', 'team']
    if frame.empty or not set(team_keys).issubset(frame.columns):
        return {}
    return {key: part for key, part in frame.groupby(team_keys)}


def _build_pass_graph(team_edges, players):
    """Return the directed pass graph of one team, or None without edges."""
    if team_edges is None:
        return None
    pass_edges = team_edges[team_edges['source_id'] != team_edges['target_id']]
    if pass_edges.empty:
        return None
    graph = nx.DiGraph()
    if players is not None:
        # Register every player with a recorded position first, so players
        # without any edge still count as (isolated) nodes.
        graph.add_nodes_from(players['player_id'])
    graph.add_weighted_edges_from(
        zip(pass_edges['source_id'], pass_edges['target_id'], pass_edges['weight'])
    )
    return graph


def calculate_network_metrics(pass_events: pd.DataFrame, edges_df: pd.DataFrame, player_positions: pd.DataFrame) -> pd.DataFrame:
    """Summarise the pass network of every team in every match.

    For each (match_id, team) found in ``pass_events`` one summary row is
    produced. It starts with match metadata and zero-valued metric defaults;
    when the team has at least one non-self edge in ``edges_df`` a directed
    graph is built and the metric functions overwrite the defaults they
    compute.

    The edge and position tables are split by team once up front, and the
    final score is looked up once per match, instead of re-filtering the
    full tables on every iteration.

    Args:
        pass_events: Pass rows; supplies the (match, team) groups, the match
            metadata and the goal/score columns.
        edges_df: Edge table with ``match_id``, ``team``, ``source_id``,
            ``target_id`` and ``weight`` columns.
        player_positions: Table with ``match_id``, ``team`` and ``player_id``
            columns; its players become the graph nodes.

    Returns:
        DataFrame with one row per (match_id, team), ordered by match and
        then team.
    """
    metric_funcs = [
        calculate_basic_metrics,
        calculate_centrality_metrics,
        calculate_clustering_metrics,
        calculate_spectral_metrics,
        calculate_advanced_metrics
    ]
    edges_by_team = _split_by_team(edges_df)
    players_by_team = _split_by_team(player_positions)

    network_summaries = []
    for match_id, match_events in pass_events.groupby('match_id'):
        # The score belongs to the match, so compute it once for both teams.
        score_final = get_match_final_score(match_events, match_id)
        for team, group in match_events.groupby('team'):
            if group.empty:
                continue
            first = group.iloc[0]
            metrics = {
                'match_id': match_id,
                'team': team,
                'team_id': first.get('team_id'),
                'home_or_away': first.get('home_or_away'),
                'home_abbrev_name': first.get('home_abbrev_name'),
                'away_abbrev_name': first.get('away_abbrev_name'),
                # `group` already holds exactly this team's rows of the match.
                'goal_in_match': check_goal_in_match(group, match_id, team),
                'score_final': score_final,
                'final_result': first.get('final_result'),
                'edge_count': 0, 'network_density': 0,
                'avg_in_degree': 0, 'std_in_degree': 0,
                'avg_out_degree': 0, 'std_out_degree': 0,
                'avg_betweenness': 0, 'std_betweenness': 0, 'max_betweenness': 0,
                'avg_pagerank': 0, 'std_pagerank': 0,
                'avg_eigenvector': 0, 'std_eigenvector': 0,
                'avg_clustering': 0, 'std_clustering': 0,
                'transitivity': 0, 'triangle_count': 0,
                'reciprocity': 0, 'modularity': 0,
                'num_cycles': 0,
                'spectral_radius': 0, 'fiedler_value': 0,
                'edge_weight_entropy': 0,
                'avg_katz': 0, 'std_katz': 0,
                'avg_harmonic_closeness': 0, 'std_harmonic_closeness': 0
            }
            team_key = (match_id, team)
            G = _build_pass_graph(
                edges_by_team.get(team_key), players_by_team.get(team_key)
            )
            if G is not None:
                for func in metric_funcs:
                    metrics.update(func(G))
            network_summaries.append(metrics)
    return pd.DataFrame(network_summaries)

def check_goal_in_match(df: pd.DataFrame, match_id: int, team: str) -> bool:
    """Tell whether *team*'s goal tally changed across its rows of a match.

    The team's side is read from the first matching row's ``home_or_away``;
    ``home_goals`` is used when that value is ``'HOME'``, ``away_goals``
    otherwise. The result is True when the maximum and minimum of that column
    differ over the team's rows.

    Args:
        df: Event rows with ``match_id``, ``team``, ``home_or_away``,
            ``home_goals`` and ``away_goals`` columns.
        match_id: Match to inspect.
        team: Team name to inspect.

    Returns:
        A plain Python ``bool``; False when the team has no rows in the match.
    """
    team_rows = df[(df['match_id'] == match_id) & (df['team'] == team)]
    if team_rows.empty:
        return False
    side = team_rows['home_or_away'].iat[0]
    goals = team_rows['home_goals'] if side == 'HOME' else team_rows['away_goals']
    # The comparison yields numpy.bool_; convert so both return paths hand
    # back the same built-in type the annotation promises.
    return bool((goals.max() - goals.min()) > 0)

def get_match_final_score(df: pd.DataFrame, match_id: int) -> str:
    """Return the score string recorded for a match.

    When both ``index`` and ``score_momentum`` columns exist, the
    ``score_momentum`` of the first row carrying the highest ``index`` is
    returned. Otherwise the score is assembled from the maxima of
    ``home_goals`` and ``away_goals``.

    Args:
        df: Event rows containing a ``match_id`` column.
        match_id: Match to look up.

    Returns:
        The score text, or ``"0 x 0"`` when the match has no rows in *df*.
    """
    match_rows = df[df['match_id'] == match_id]
    if match_rows.empty:
        return "0 x 0"

    has_event_order = {'index', 'score_momentum'}.issubset(match_rows.columns)
    if has_event_order:
        latest_index = match_rows['index'].max()
        final_rows = match_rows.loc[match_rows['index'] == latest_index]
        if not final_rows.empty:
            return final_rows['score_momentum'].iat[0]

    home_total = match_rows['home_goals'].max()
    away_total = match_rows['away_goals'].max()
    return f"{home_total} x {away_total}"

def calculate_basic_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Report size, density and weighted-degree statistics of a pass graph.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        ``edge_count`` and ``network_density`` always; mean and standard
        deviation of the weighted in- and out-degrees when the graph has at
        least one node.
    """
    stats = {
        'edge_count': G.number_of_edges(),
        'network_density': nx.density(G),
    }

    weighted_in = np.array([degree for _, degree in G.in_degree(weight='weight')])
    weighted_out = np.array([degree for _, degree in G.out_degree(weight='weight')])
    if not weighted_in.size:
        # No nodes: leave the degree statistics out of the result.
        return stats

    stats['avg_in_degree'] = weighted_in.mean()
    stats['std_in_degree'] = weighted_in.std()
    stats['avg_out_degree'] = weighted_out.mean()
    stats['std_out_degree'] = weighted_out.std()
    return stats

def calculate_centrality_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Summarise node centralities of a pass graph.

    Computes the mean and standard deviation (plus the maximum for
    betweenness) of betweenness, PageRank, eigenvector, Katz and harmonic
    centrality over all nodes. Eigenvector centrality is taken on the
    undirected version of the graph.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        Mapping of metric name to value. The Katz entries are omitted when
        that computation fails.
    """
    m = {}
    # NOTE(review): networkx reads `weight` as a distance when it searches
    # shortest paths, so a larger pass share makes a link count as "longer"
    # here - confirm this is the intended interpretation.
    bt = nx.betweenness_centrality(G, weight='weight', normalized=True)
    vals = np.array(list(bt.values()))
    if vals.size:
        m['avg_betweenness'], m['std_betweenness'], m['max_betweenness'] = vals.mean(), vals.std(), vals.max()
    pr = np.array(list(nx.pagerank(G, weight='weight').values()))
    m['avg_pagerank'], m['std_pagerank'] = pr.mean(), pr.std()
    Gu = G.to_undirected()
    try:
        ev = nx.eigenvector_centrality_numpy(Gu, weight='weight')
    except nx.exception.AmbiguousSolution:
        # Fall back to the largest connected component and give every node
        # outside it a centrality of 0.
        comps = list(nx.connected_components(Gu))
        lc = max(comps, key=len)
        ev_sub = nx.eigenvector_centrality_numpy(Gu.subgraph(lc), weight='weight')
        ev = {n: ev_sub.get(n, 0) for n in Gu.nodes()}
    evv = np.array(list(ev.values()))
    m['avg_eigenvector'], m['std_eigenvector'] = evv.mean(), evv.std()
    try:
        kz = np.array(list(nx.katz_centrality_numpy(G, weight='weight', alpha=0.005).values()))
        m['avg_katz'], m['std_katz'] = kz.mean(), kz.std()
    except Exception:
        # Katz centrality stays best-effort: on failure its keys are simply
        # left out of the result. `Exception` (rather than a bare except)
        # lets KeyboardInterrupt and SystemExit propagate.
        pass
    hc = np.array(list(nx.harmonic_centrality(G).values()))
    m['avg_harmonic_closeness'], m['std_harmonic_closeness'] = hc.mean(), hc.std()
    return m

def calculate_clustering_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Measure clustering, reciprocity and community structure of a pass graph.

    All metrics except reciprocity are computed on the undirected version of
    the graph.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        Mapping with clustering statistics, transitivity, triangle count,
        reciprocity, modularity of the greedy community partition and the
        size of the cycle basis.
    """
    undirected = G.to_undirected()
    result = {}

    local_clustering = np.array(
        list(nx.clustering(undirected, weight='weight').values())
    )
    if local_clustering.size:
        result['avg_clustering'] = local_clustering.mean()
        result['std_clustering'] = local_clustering.std()

    result['transitivity'] = nx.transitivity(undirected)

    # Each triangle is reported once for each of its three corners.
    triangles_per_node = nx.triangles(undirected)
    result['triangle_count'] = sum(triangles_per_node.values()) // 3

    result['reciprocity'] = nx.reciprocity(G) or 0

    communities = list(
        nx_comm.greedy_modularity_communities(undirected, weight='weight')
    )
    if communities:
        result['modularity'] = nx_comm.modularity(
            undirected, communities, weight='weight'
        )

    result['num_cycles'] = len(nx.cycle_basis(undirected))
    return result

def calculate_spectral_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Compute spectral summaries of the undirected pass graph.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        ``spectral_radius``: largest absolute eigenvalue of the weighted
        adjacency matrix (when the graph has nodes).
        ``fiedler_value``: second-smallest eigenvalue of the weighted
        normalized Laplacian (when there are at least two nodes).
    """
    m = {}
    Gu = G.to_undirected()

    # Both matrices describe an undirected graph and are therefore symmetric.
    # eigvalsh is the solver for that case: it always returns real
    # eigenvalues in ascending order, whereas the general eigvals may return
    # complex values that cannot be sorted or cast to float reliably.
    adjacency = nx.to_numpy_array(Gu, weight='weight')
    adjacency_eigs = np.linalg.eigvalsh(adjacency)
    if adjacency_eigs.size:
        m['spectral_radius'] = float(np.max(np.abs(adjacency_eigs)))

    laplacian = nx.normalized_laplacian_matrix(Gu, weight='weight').toarray()
    laplacian_eigs = np.linalg.eigvalsh(laplacian)
    if laplacian_eigs.size > 1:
        # Already ascending, so index 1 is the second-smallest eigenvalue.
        m['fiedler_value'] = float(laplacian_eigs[1])
    return m

def calculate_advanced_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Compute the Shannon entropy (in bits) of the edge-weight distribution.

    Edge weights are normalised to sum to 1 and zero shares are skipped.

    Args:
        G: Directed graph whose edges carry a ``weight`` attribute.

    Returns:
        ``{'edge_weight_entropy': value}``, or an empty mapping when the
        weights do not add up to a positive total.
    """
    edge_weights = np.array([attrs['weight'] for *_, attrs in G.edges(data=True)])
    weight_sum = edge_weights.sum()
    if not weight_sum > 0:
        return {}

    shares = edge_weights / weight_sum
    positive_shares = shares[shares > 0]
    entropy = -np.sum(positive_shares * np.log2(positive_shares))
    return {'edge_weight_entropy': float(entropy)}

## **6. Network Creation Pipeline**

In [15]:
def create_football_networks(pass_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Run the full network pipeline on a table of pass events.

    Args:
        pass_events: Pass rows as expected by the position, edge and metric
            builders.

    Returns:
        ``(network_summary, network_edges, player_positions)`` in that order.
    """
    positions = calculate_player_positions(pass_events)
    edges = create_network_edges(pass_events)
    # The metrics step needs both tables built above.
    summary = calculate_network_metrics(pass_events, edges, positions)
    return summary, edges, positions

# Build all three tables from the filtered pass events.
network_summary, network_edges, player_positions = create_football_networks(pass_events)

# Report the row count of each resulting table.
print("\n✓ Network creation completed!")
print(f"  Networks created: {len(network_summary)}")
print(f"  Total edges: {len(network_edges)}")
print(f"  Total player positions: {len(player_positions)}")


✓ Network creation completed!
  Networks created: 4050
  Total edges: 479129
  Total player positions: 56979


## **7. Data Preparation for Analysis**

In [16]:
# Labels that receive a numeric code; any other label (for instance a draw,
# if the data contains one) becomes NaN.
RESULT_ENCODING = {'W': 1, 'L': 0}

# Encode only while the column still holds raw labels. Mapping an already
# numeric column again would turn every value into NaN, so re-running this
# cell must be a no-op.
if not pd.api.types.is_numeric_dtype(network_summary['final_result']):
    network_summary['final_result'] = network_summary['final_result'].map(RESULT_ENCODING)
print("✓ Final result converted to binary format (Win=1, Loss=0)")

✓ Final result converted to binary format (Win=1, Loss=0)


## **8. Network Analysis**

In [17]:
print("\n=== NETWORK ANALYSIS ===")

# Metrics reported below; the first three are also compared by match result.
key_metrics = [
    'edge_count',
    'network_density',
    'avg_betweenness',
    'avg_pagerank',
    'avg_clustering',
    'transitivity',
]

print("\nSummary statistics for key network metrics:")
for metric in key_metrics:
    if metric not in network_summary.columns:
        continue
    values = network_summary[metric]
    print(f"\n{metric}:")
    print(f"  Mean: {values.mean():.4f}")
    print(f"  Median: {values.median():.4f}")
    print(f"  Std Dev: {values.std():.4f}")
    print(f"  Min: {values.min():.4f}")
    print(f"  Max: {values.max():.4f}")

print("\n\nMetrics comparison by match result:")
for metric in key_metrics[:3]:
    if metric not in network_summary.columns:
        continue
    # Rows encoded 1 are wins and rows encoded 0 are losses; anything else
    # is left out of the comparison.
    wins = network_summary.loc[network_summary['final_result'] == 1, metric]
    losses = network_summary.loc[network_summary['final_result'] == 0, metric]
    print(f"\n{metric}:")
    print(f"  Wins - Mean: {wins.mean():.4f}, Std: {wins.std():.4f}")
    print(f"  Losses - Mean: {losses.mean():.4f}, Std: {losses.std():.4f}")


=== NETWORK ANALYSIS ===

Summary statistics for key network metrics:

edge_count:
  Mean: 118.3035
  Median: 119.0000
  Std Dev: 13.2752
  Min: 61.0000
  Max: 166.0000

network_density:
  Mean: 0.6403
  Median: 0.6429
  Std Dev: 0.0753
  Min: 0.3250
  Max: 0.8939

avg_betweenness:
  Mean: 0.0701
  Median: 0.0694
  Std Dev: 0.0090
  Min: 0.0435
  Max: 0.1106

avg_pagerank:
  Mean: 0.0710
  Median: 0.0714
  Std Dev: 0.0037
  Min: 0.0588
  Max: 0.0909

avg_clustering:
  Mean: 0.1660
  Median: 0.1693
  Std Dev: 0.0555
  Min: 0.0567
  Max: 0.3810

transitivity:
  Mean: 0.8303
  Median: 0.8355
  Std Dev: 0.0471
  Min: 0.5899
  Max: 0.9677


Metrics comparison by match result:

edge_count:
  Wins - Mean: 117.1867, Std: 13.8986
  Losses - Mean: 119.4202, Std: 12.5254

network_density:
  Wins - Mean: 0.6379, Std: 0.0785
  Losses - Mean: 0.6426, Std: 0.0719

avg_betweenness:
  Wins - Mean: 0.0722, Std: 0.0092
  Losses - Mean: 0.0680, Std: 0.0084


## **9. Export Network Data**

In [18]:
print("\n=== EXPORTING NETWORK DATA ===")

# Each table is written as a Parquet file into the processed-data directory
# without its DataFrame index; the size on disk is reported afterwards.

# Per-team network summary (one row per match/team).
networks_path = PROCESSED_DATA_PATH / "on_ball_networks.parquet"
network_summary.to_parquet(networks_path, index=False)
print(f"✓ Network summary saved to: {networks_path}")
print(f"  File size: {networks_path.stat().st_size / (1024*1024):.2f} MB")

# Weighted passer -> recipient edges.
edges_path = PROCESSED_DATA_PATH / "network_edges.parquet"
network_edges.to_parquet(edges_path, index=False)
print(f"\n✓ Network edges saved to: {edges_path}")
print(f"  File size: {edges_path.stat().st_size / (1024*1024):.2f} MB")

# Average player locations.
positions_path = PROCESSED_DATA_PATH / "player_positions.parquet"
player_positions.to_parquet(positions_path, index=False)
print(f"\n✓ Player positions saved to: {positions_path}")
print(f"  File size: {positions_path.stat().st_size / (1024*1024):.2f} MB")

print("\n✓ All network data exported successfully!")


=== EXPORTING NETWORK DATA ===
✓ Network summary saved to: ..\data\processed\on_ball_networks.parquet
  File size: 0.65 MB

✓ Network edges saved to: ..\data\processed\network_edges.parquet
  File size: 2.27 MB

✓ Player positions saved to: ..\data\processed\player_positions.parquet
  File size: 1.17 MB

✓ All network data exported successfully!
