# **Network Creation from Football Event Data**

## **Objective**
Create complex networks from football event data to analyze team performance patterns.

## **Steps**
1. Load processed event data
2. Filter ball-related events (passes, dribbles, shots, fouls won)
3. Create passing networks for each team in each match
4. Calculate comprehensive network metrics
5. Export network data for analysis

## **Output**
- Network summary with metrics for each team/match
- Network edges (pass connections between players, plus per-player self-loops counting individual actions such as shots and dribbles)
- Player positions (average field positions)

## **1. Environment Setup**

In [1]:
import warnings
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Any

import numpy as np
import pandas as pd
import networkx as nx
from networkx.exception import AmbiguousSolution
from networkx.algorithms import community as nx_comm

warnings.filterwarnings('ignore')

## **2. Data Loading**

In [2]:
# Paths are relative to the notebook's working directory.
DATA_PATH = Path("../data")
PROCESSED_DATA_PATH = DATA_PATH.joinpath("processed")

# Read the processed event table and report its size and schema.
print("Loading processed event data...")
events_file = PROCESSED_DATA_PATH.joinpath("events_processed.parquet")
events_df = pd.read_parquet(events_file)

record_count = len(events_df)
column_names = list(events_df.columns)
print(f"✓ Data loaded: {record_count:,} records")
print(f"\nAvailable columns: {column_names}")

Loading processed event data...
✓ Data loaded: 7,209,091 records

Available columns: ['match_id', 'period', 'index', 'timestamp', 'type', 'team', 'team_id', 'player', 'player_id', 'pass_outcome', 'pass_recipient', 'pass_recipient_id', 'location_x', 'location_y', 'home_or_away', 'home_abbrev_name', 'away_abbrev_name', 'home_goals', 'away_goals', 'score_momentum', 'game_state', 'scoresheet', 'score_final', 'final_result']


## **3. Event Filtering**

In [3]:
# Event types treated as on-ball actions by the rest of the notebook.
ON_BALL_EVENTS = ['Pass', 'Dribble', 'Shot', 'Foul Won']

# Keep only on-ball rows; .copy() detaches the result from events_df.
is_on_ball = events_df['type'].isin(ON_BALL_EVENTS)
on_ball_events = events_df.loc[is_on_ball].copy()

kept_rows = len(on_ball_events)
print(f"✓ On-ball events filtered: {kept_rows:,} records")
print(f"  Percentage of total: {kept_rows/len(events_df)*100:.1f}%")
print("\nEvent type distribution:")
type_distribution = on_ball_events['type'].value_counts()
print(type_distribution)

✓ On-ball events filtered: 2,193,999 records
  Percentage of total: 30.4%

Event type distribution:
type
Pass        2016542
Dribble       69318
Foul Won      57267
Shot          50872
Name: count, dtype: int64


## **4. Network Construction Functions**

In [4]:
def calculate_player_positions(on_ball_events: pd.DataFrame) -> pd.DataFrame:
    """Average each player's event coordinates per match and team.

    Rows are grouped by (match_id, team, player_id, player). The x/y
    locations are averaged; the match-level descriptive columns are taken
    from the first row of each group.

    Args:
        on_ball_events: Event rows containing the grouping columns, the
            ``location_x``/``location_y`` coordinates and the descriptive
            columns listed in the body.

    Returns:
        One row per player/team/match with the grouping columns first,
        followed by the aggregated columns.
    """
    print("\nCalculating player positions...")

    group_keys = ['match_id', 'team', 'player_id', 'player']
    carried_columns = [
        'team_id',
        'home_or_away',
        'home_abbrev_name',
        'away_abbrev_name',
        'score_final',
        'final_result',
    ]

    # Coordinates are averaged; every other column keeps its first value.
    aggregations = {'location_x': 'mean', 'location_y': 'mean'}
    aggregations.update(dict.fromkeys(carried_columns, 'first'))

    grouped = on_ball_events.groupby(group_keys)
    player_positions = grouped.agg(aggregations).reset_index()

    print(f"  ✓ Positions calculated for {len(player_positions)} player-match combinations")
    return player_positions


def create_network_edges(on_ball_events: pd.DataFrame) -> pd.DataFrame:
    """Create network edges from passing and individual action data.

    For every (match_id, team) group two kinds of edge records are emitted:

    * pass edges ``source_id -> target_id`` whose ``weight`` is the number of
      passes from source to target divided by the number of 'Pass' rows of
      that team in that match (rows without a recipient still count in the
      denominator);
    * self-loops ``player_id -> player_id`` whose ``weight`` is the raw count
      of that player's individual actions.

    Args:
        on_ball_events: Event rows with at least the columns ``match_id``,
            ``team``, ``type``, ``player_id`` and ``pass_recipient_id``.

    Returns:
        DataFrame with the columns ``match_id``, ``team``, ``source_id``,
        ``target_id`` and ``weight``. The columns are present even when no
        edge was produced, so callers can always filter on them.
    """
    edge_columns = ['match_id', 'team', 'source_id', 'target_id', 'weight']
    # 'Carry' only contributes if the caller's frame still contains such rows.
    individual_action_types = ['Shot', 'Carry', 'Dribble']

    print("\nCreating network edges...")

    network_edges = []
    groups = on_ball_events.groupby(['match_id', 'team'])
    total_groups = len(groups)

    for idx, ((match_id, team), group) in enumerate(groups):
        passes = group[group['type'] == 'Pass']
        total_passes = len(passes)

        # Count passer -> recipient pairs. Iterating the two columns directly
        # avoids building a Series per row as iterrows() does; the dict keeps
        # pairs in order of first appearance.
        linked_passes = passes.dropna(subset=['player_id', 'pass_recipient_id'])
        pass_counts = defaultdict(int)
        for pair in zip(linked_passes['player_id'], linked_passes['pass_recipient_id']):
            pass_counts[pair] += 1

        # Count individual actions per player (stored as self-loops).
        actions = group[group['type'].isin(individual_action_types)]
        individual_counts = defaultdict(int)
        for player_id in actions['player_id'].dropna():
            individual_counts[player_id] += 1

        for (source, target), count in pass_counts.items():
            weight = count / total_passes if total_passes > 0 else 0
            network_edges.append({
                'match_id': match_id,
                'team': team,
                'source_id': source,
                'target_id': target,
                'weight': weight,
            })

        for player_id, count in individual_counts.items():
            network_edges.append({
                'match_id': match_id,
                'team': team,
                'source_id': player_id,
                'target_id': player_id,
                'weight': count,
            })

        # Progress update
        if (idx + 1) % 100 == 0:
            print(f"  Processed {idx + 1}/{total_groups} team-match combinations...")

    # Passing the column list keeps the schema stable for an empty result.
    edges_df = pd.DataFrame(network_edges, columns=edge_columns)
    print(f"  ✓ Total edges created: {len(edges_df)}")
    return edges_df

## **5. Network Metrics Calculation**

In [5]:
# Metric columns, in output order, that stay at 0 when a team has no pass edges.
_NETWORK_METRIC_NAMES = (
    'edge_count',
    'network_density',
    'avg_in_degree',
    'std_in_degree',
    'avg_out_degree',
    'std_out_degree',
    'avg_betweenness',
    'std_betweenness',
    'max_betweenness',
    'avg_pagerank',
    'std_pagerank',
    'avg_eigenvector',
    'std_eigenvector',
    'avg_clustering',
    'std_clustering',
    'transitivity',
    'triangle_count',
    'reciprocity',
    'assortativity',
    'modularity',
    'num_cycles',
    'spectral_radius',
    'fiedler_value',
    'edge_weight_entropy',
    'avg_katz',
    'std_katz',
    'avg_harmonic_closeness',
    'std_harmonic_closeness',
)


def _split_by_team(frame: pd.DataFrame) -> Dict[Tuple[Any, Any], pd.DataFrame]:
    """Index the rows of *frame* by their (match_id, team) pair."""
    if frame.empty or not {'match_id', 'team'}.issubset(frame.columns):
        return {}
    return dict(iter(frame.groupby(['match_id', 'team'])))


def _build_pass_graph(team_players, pass_edges: pd.DataFrame) -> nx.DiGraph:
    """Build the directed, weighted pass graph for one team in one match."""
    G = nx.DiGraph()
    # Every player with a recorded position becomes a node, even without passes.
    if team_players is not None:
        G.add_nodes_from(team_players['player_id'])
    G.add_weighted_edges_from(
        zip(pass_edges['source_id'], pass_edges['target_id'], pass_edges['weight'])
    )
    return G


def calculate_network_metrics(on_ball_events: pd.DataFrame, 
                            edges_df: pd.DataFrame, 
                            player_positions: pd.DataFrame) -> pd.DataFrame:
    """Calculate comprehensive network metrics for each team-match combination.

    Args:
        on_ball_events: Event rows; grouped by (match_id, team) to drive the
            loop and to derive the match information columns.
        edges_df: Edge records with ``match_id``, ``team``, ``source_id``,
            ``target_id`` and ``weight``. Self-loops (source equals target)
            are ignored when building the graph.
        player_positions: Player rows with ``match_id``, ``team`` and
            ``player_id``; they supply the graph's node set.

    Returns:
        One row per (match_id, team) group holding the match information
        followed by every metric column. Metrics remain 0 for teams without
        any pass edge.
    """
    print("\nCalculating network metrics...")

    # Split each input once up front. Filtering the complete frames inside
    # the loop would rescan every row for every team-match group.
    final_scores = {
        match_id: get_match_final_score(match_events, match_id)
        for match_id, match_events in on_ball_events.groupby('match_id')
    }
    edges_by_team = _split_by_team(edges_df)
    players_by_team = _split_by_team(player_positions)

    network_summaries = []

    for (match_id, team), group in on_ball_events.groupby(['match_id', 'team']):
        if group.empty:
            continue

        first_row = group.iloc[0]
        metrics = {
            # Match information
            'match_id': match_id,
            'team': team,
            'team_id': first_row.get('team_id'),
            'home_or_away': first_row.get('home_or_away'),
            'home_abbrev_name': first_row.get('home_abbrev_name'),
            'away_abbrev_name': first_row.get('away_abbrev_name'),
            # `group` already holds exactly this team's rows for this match.
            'goal_in_match': check_goal_in_match(group, match_id, team),
            'score_final': final_scores[match_id],
            'final_result': first_row.get('final_result'),
        }
        # Initialize all metrics to zero
        metrics.update(dict.fromkeys(_NETWORK_METRIC_NAMES, 0))

        team_key = (match_id, team)
        team_edges = edges_by_team.get(team_key)
        if team_edges is not None:
            # Exclude self-loops for main network
            pass_edges = team_edges[team_edges['source_id'] != team_edges['target_id']]

            if not pass_edges.empty:
                G = _build_pass_graph(players_by_team.get(team_key), pass_edges)

                # Calculate all metrics
                metrics.update(calculate_basic_metrics(G))
                metrics.update(calculate_centrality_metrics(G))
                metrics.update(calculate_clustering_metrics(G))
                metrics.update(calculate_spectral_metrics(G))
                metrics.update(calculate_advanced_metrics(G))

        network_summaries.append(metrics)

    summary_df = pd.DataFrame(network_summaries)
    print(f"  ✓ Metrics calculated for {len(summary_df)} networks")
    return summary_df


def check_goal_in_match(df: pd.DataFrame, match_id: int, team: str) -> bool:
    """Check if the team scored in the match.

    The team's side is read from ``home_or_away`` on its first row; the
    matching goal column (``home_goals`` for 'HOME', otherwise
    ``away_goals``) is then checked for any change across the team's rows.

    Args:
        df: Event rows with ``match_id``, ``team``, ``home_or_away``,
            ``home_goals`` and ``away_goals``.
        match_id: Match to inspect.
        team: Team name as stored in the ``team`` column.

    Returns:
        True when the goal column's maximum exceeds its minimum over the
        team's rows; False otherwise, including when the team has no rows.
    """
    team_data = df[(df['match_id'] == match_id) & (df['team'] == team)]
    if team_data.empty:
        return False

    side = team_data['home_or_away'].iloc[0]
    goals_column = 'home_goals' if side == 'HOME' else 'away_goals'
    goals = team_data[goals_column]

    # The comparison yields numpy.bool_; convert so callers get a real bool.
    return bool((goals.max() - goals.min()) > 0)


def get_match_final_score(df: pd.DataFrame, match_id: int) -> str:
    """Return the final score recorded for a match.

    Preference order: the ``score_momentum`` value of the row with the
    highest ``index``; otherwise a string built from the maximum
    ``home_goals`` and ``away_goals``; otherwise "0 x 0".

    Args:
        df: Event rows containing a ``match_id`` column.
        match_id: Match to look up.

    Returns:
        The score value described above ("0 x 0" when the match has no rows).
    """
    default_score = "0 x 0"

    match_rows = df.loc[df['match_id'] == match_id]
    if match_rows.empty:
        return default_score

    columns = match_rows.columns

    # Preferred source: score_momentum on the row with the highest index.
    if all(name in columns for name in ('index', 'score_momentum')):
        event_index = match_rows['index']
        final_rows = match_rows.loc[event_index == event_index.max()]
        if not final_rows.empty:
            return final_rows['score_momentum'].iloc[0]

    # Fallback: largest goal values seen for each side.
    if all(name in columns for name in ('home_goals', 'away_goals')):
        home_total = match_rows['home_goals'].max()
        away_total = match_rows['away_goals'].max()
        return f"{home_total} x {away_total}"

    return default_score


def calculate_basic_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Return edge count, density and weighted degree statistics of ``G``.

    Degree values are weighted by the ``weight`` edge attribute. The four
    degree statistics are omitted when the graph has no nodes.
    """
    weighted_in = np.array([degree for _node, degree in G.in_degree(weight='weight')])
    weighted_out = np.array([degree for _node, degree in G.out_degree(weight='weight')])

    metrics = {
        'edge_count': G.number_of_edges(),
        'network_density': nx.density(G),
    }

    # Without nodes there is nothing to average.
    if weighted_in.size == 0:
        return metrics

    metrics['avg_in_degree'] = weighted_in.mean()
    metrics['std_in_degree'] = weighted_in.std()
    metrics['avg_out_degree'] = weighted_out.mean()
    metrics['std_out_degree'] = weighted_out.std()
    return metrics


def calculate_centrality_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Summarise node centralities of ``G`` as mean/std (and max) values.

    Computes betweenness, PageRank, eigenvector (on the undirected view),
    Katz and harmonic centrality, each weighted by the ``weight`` edge
    attribute except harmonic centrality, which is called without a distance.

    Args:
        G: Directed pass graph with a ``weight`` attribute on every edge.

    Returns:
        Dict of metric name to value. ``avg_katz``/``std_katz`` are omitted
        when the Katz computation fails, so a caller's defaults stay in place.
    """
    metrics = {}

    # Betweenness centrality
    # NOTE(review): networkx treats `weight` as a path length here, so heavily
    # used pass links count as "long" paths. Confirm this is intended.
    betweenness = nx.betweenness_centrality(G, weight='weight', normalized=True)
    betweenness_values = np.array(list(betweenness.values()))

    if len(betweenness_values) > 0:
        metrics['avg_betweenness'] = betweenness_values.mean()
        metrics['std_betweenness'] = betweenness_values.std()
        metrics['max_betweenness'] = betweenness_values.max()

    # PageRank
    pagerank = nx.pagerank(G, weight='weight')
    pagerank_values = np.array(list(pagerank.values()))
    metrics['avg_pagerank'] = pagerank_values.mean()
    metrics['std_pagerank'] = pagerank_values.std()

    # Eigenvector centrality (on undirected version)
    G_undirected = G.to_undirected()
    try:
        eigenvector = nx.eigenvector_centrality_numpy(G_undirected, weight='weight')
    except AmbiguousSolution:
        # Disconnected graph: score the largest component only and give every
        # node outside it a centrality of 0.
        components = list(nx.connected_components(G_undirected))
        largest_component = max(components, key=len)
        subgraph = G_undirected.subgraph(largest_component)
        eigenvector_sub = nx.eigenvector_centrality_numpy(subgraph, weight='weight')
        eigenvector = {n: eigenvector_sub.get(n, 0) for n in G_undirected.nodes()}

    eigenvector_values = np.array(list(eigenvector.values()))
    metrics['avg_eigenvector'] = eigenvector_values.mean()
    metrics['std_eigenvector'] = eigenvector_values.std()

    # Katz centrality is best-effort: only solver and networkx failures are
    # tolerated, so interrupts and programming errors still surface.
    try:
        katz = nx.katz_centrality_numpy(G, weight='weight', alpha=0.005)
    except (nx.NetworkXException, np.linalg.LinAlgError):
        pass
    else:
        katz_values = np.array(list(katz.values()))
        metrics['avg_katz'] = katz_values.mean()
        metrics['std_katz'] = katz_values.std()

    # Harmonic closeness centrality
    harmonic = nx.harmonic_centrality(G)
    harmonic_values = np.array(list(harmonic.values()))
    metrics['avg_harmonic_closeness'] = harmonic_values.mean()
    metrics['std_harmonic_closeness'] = harmonic_values.std()

    return metrics


def calculate_clustering_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Calculate clustering and community metrics.

    Clustering, transitivity, triangles, modularity and the cycle count are
    computed on the undirected view of ``G``; reciprocity and assortativity
    use the directed graph.

    Args:
        G: Directed pass graph with a ``weight`` attribute on every edge.

    Returns:
        Dict of metric name to value. Undefined (NaN) assortativity is
        reported as 0.
    """
    metrics = {}
    # NOTE(review): to_undirected() keeps a single weight per player pair when
    # both directions exist rather than summing them. Confirm this is intended.
    G_undirected = G.to_undirected()

    # Clustering coefficient
    clustering = nx.clustering(G_undirected, weight='weight')
    clustering_values = np.array(list(clustering.values()))

    if len(clustering_values) > 0:
        metrics['avg_clustering'] = clustering_values.mean()
        metrics['std_clustering'] = clustering_values.std()

    # Transitivity
    metrics['transitivity'] = nx.transitivity(G_undirected)

    # Triangle count: every triangle is reported once per corner node.
    triangles = nx.triangles(G_undirected)
    metrics['triangle_count'] = sum(triangles.values()) // 3

    # Reciprocity (for directed graph)
    metrics['reciprocity'] = nx.reciprocity(G) or 0

    # Assortativity. NaN is truthy, so `value or 0` would let it through;
    # test for it explicitly to apply the zero default.
    assortativity = nx.degree_assortativity_coefficient(G, weight='weight')
    if assortativity is None or np.isnan(assortativity):
        assortativity = 0
    metrics['assortativity'] = assortativity

    # Modularity
    communities = list(nx_comm.greedy_modularity_communities(G_undirected, weight='weight'))
    if communities:
        metrics['modularity'] = nx_comm.modularity(G_undirected, communities, weight='weight')

    # Number of cycles
    metrics['num_cycles'] = len(nx.cycle_basis(G_undirected))

    return metrics


def calculate_spectral_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Calculate spectral graph metrics on the undirected view of ``G``.

    Args:
        G: Directed pass graph with a ``weight`` attribute on every edge.

    Returns:
        Dict with ``spectral_radius`` (largest absolute eigenvalue of the
        weighted adjacency matrix) and, when the graph has more than one
        node, ``fiedler_value`` (second-smallest eigenvalue of the weighted
        normalized Laplacian).
    """
    metrics = {}
    G_undirected = G.to_undirected()

    # Both matrices are symmetric for an undirected graph, so use the
    # symmetric solver: it returns real eigenvalues in ascending order,
    # whereas the general solver may return complex values.
    adjacency_matrix = nx.to_numpy_array(G_undirected, weight='weight')
    eigenvalues = np.linalg.eigvalsh(adjacency_matrix)

    if len(eigenvalues) > 0:
        metrics['spectral_radius'] = float(np.max(np.abs(eigenvalues)))

    # Fiedler value (algebraic connectivity)
    laplacian_matrix = nx.normalized_laplacian_matrix(G_undirected, weight='weight').toarray()
    laplacian_eigenvalues = np.linalg.eigvalsh(laplacian_matrix)

    if len(laplacian_eigenvalues) > 1:
        # Already sorted ascending; index 1 is the second-smallest value.
        metrics['fiedler_value'] = float(laplacian_eigenvalues[1])

    return metrics


def calculate_advanced_metrics(G: nx.DiGraph) -> Dict[str, float]:
    """Return the Shannon entropy (base 2) of the edge-weight distribution.

    Edge weights are normalised to sum to 1 and zero shares are skipped. The
    result dict is empty when the total weight is not positive.
    """
    edge_weights = np.array([attrs['weight'] for *_endpoints, attrs in G.edges(data=True)])
    weight_sum = edge_weights.sum()

    # No edges (or no positive total weight) means there is nothing to report.
    if not weight_sum > 0:
        return {}

    shares = edge_weights / weight_sum
    positive_shares = shares[shares > 0]
    entropy = -np.sum(positive_shares * np.log2(positive_shares))
    return {'edge_weight_entropy': float(entropy)}

## **6. Network Creation Pipeline**

In [6]:
def create_football_networks(on_ball_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Run the full network-building pipeline on the given events.

    Returns:
        A ``(network_summary, network_edges, player_positions)`` tuple.
    """
    print("\n=== CREATING FOOTBALL NETWORKS ===")

    # Positions and edges are both derived straight from the events; the
    # metrics step then consumes both.
    positions = calculate_player_positions(on_ball_events)
    edges = create_network_edges(on_ball_events)
    summary = calculate_network_metrics(on_ball_events, edges, positions)

    return summary, edges, positions


# Run the pipeline and keep its three result tables as notebook globals.
(network_summary,
 network_edges,
 player_positions) = create_football_networks(on_ball_events)

print("\n✓ Network creation completed!")
print(
    f"  Networks created: {len(network_summary)}\n"
    f"  Total edges: {len(network_edges)}\n"
    f"  Total player positions: {len(player_positions)}"
)


=== CREATING FOOTBALL NETWORKS ===

Calculating player positions...
  ✓ Positions calculated for 57085 player-match combinations

Creating network edges...
  Processed 100/4050 team-match combinations...
  Processed 200/4050 team-match combinations...
  Processed 300/4050 team-match combinations...
  Processed 400/4050 team-match combinations...
  Processed 500/4050 team-match combinations...
  Processed 600/4050 team-match combinations...
  Processed 700/4050 team-match combinations...
  Processed 800/4050 team-match combinations...
  Processed 900/4050 team-match combinations...
  Processed 1000/4050 team-match combinations...
  Processed 1100/4050 team-match combinations...
  Processed 1200/4050 team-match combinations...
  Processed 1300/4050 team-match combinations...
  Processed 1400/4050 team-match combinations...
  Processed 1500/4050 team-match combinations...
  Processed 1600/4050 team-match combinations...
  Processed 1700/4050 team-match combinations...
  Processed 1800/4050 team-match combinations...
  [... output truncated ...]

## **7. Data Preparation for Analysis**

In [7]:
# Convert final_result to binary (W=1, L=0). Any other value becomes NaN.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# numeric column again would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(network_summary['final_result']):
    network_summary['final_result'] = network_summary['final_result'].map({'W': 1, 'L': 0})
print("✓ Final result converted to binary format (Win=1, Loss=0)")

✓ Final result converted to binary format (Win=1, Loss=0)


## **8. Network Analysis**

In [8]:
print("\n=== NETWORK ANALYSIS ===")

# Metrics reported below; the result comparison uses only the first three.
key_metrics = [
    'edge_count',
    'network_density',
    'avg_betweenness',
    'avg_pagerank',
    'avg_clustering',
    'transitivity',
]

print("\nSummary statistics for key network metrics:")
for metric in key_metrics:
    # Skip metrics that are missing from the summary table.
    if metric not in network_summary.columns:
        continue
    values = network_summary[metric]
    print(f"\n{metric}:")
    print(f"  Mean: {values.mean():.4f}")
    print(f"  Median: {values.median():.4f}")
    print(f"  Std Dev: {values.std():.4f}")
    print(f"  Min: {values.min():.4f}")
    print(f"  Max: {values.max():.4f}")

# Compare the same metrics between rows encoded as wins (1) and losses (0).
print("\n\nMetrics comparison by match result:")
is_win = network_summary['final_result'] == 1
is_loss = network_summary['final_result'] == 0
for metric in key_metrics[:3]:
    if metric not in network_summary.columns:
        continue
    wins = network_summary.loc[is_win, metric]
    losses = network_summary.loc[is_loss, metric]
    print(f"\n{metric}:")
    print(f"  Wins - Mean: {wins.mean():.4f}, Std: {wins.std():.4f}")
    print(f"  Losses - Mean: {losses.mean():.4f}, Std: {losses.std():.4f}")


=== NETWORK ANALYSIS ===

Summary statistics for key network metrics:

edge_count:
  Mean: 118.3035
  Median: 119.0000
  Std Dev: 13.2752
  Min: 61.0000
  Max: 166.0000

network_density:
  Mean: 0.6401
  Median: 0.6429
  Std Dev: 0.0753
  Min: 0.3250
  Max: 0.8939

avg_betweenness:
  Mean: 0.0765
  Median: 0.0760
  Std Dev: 0.0091
  Min: 0.0468
  Max: 0.1170

avg_pagerank:
  Mean: 0.0710
  Median: 0.0714
  Std Dev: 0.0037
  Min: 0.0588
  Max: 0.0909

avg_clustering:
  Mean: 0.1738
  Median: 0.1715
  Std Dev: 0.0425
  Min: 0.0569
  Max: 0.3443

transitivity:
  Mean: 0.8303
  Median: 0.8355
  Std Dev: 0.0471
  Min: 0.5899
  Max: 0.9677


Metrics comparison by match result:

edge_count:
  Wins - Mean: 117.1867, Std: 13.8986
  Losses - Mean: 119.4202, Std: 12.5254

network_density:
  Wins - Mean: 0.6377, Std: 0.0785
  Losses - Mean: 0.6425, Std: 0.0720

avg_betweenness:
  Wins - Mean: 0.0788, Std: 0.0092
  Losses - Mean: 0.0742, Std: 0.0084


## **9. Export Network Data**

In [9]:
def _file_size_mb(path):
    """Return the on-disk size of *path* in mebibytes."""
    return path.stat().st_size / (1024*1024)


print("\n=== EXPORTING NETWORK DATA ===")

# Each table is written to its own parquet file, then its size is reported.

# Network summary
networks_path = PROCESSED_DATA_PATH / "on_ball_networks.parquet"
network_summary.to_parquet(networks_path, index=False)
print(f"✓ Network summary saved to: {networks_path}")
print(f"  File size: {_file_size_mb(networks_path):.2f} MB")

# Network edges
edges_path = PROCESSED_DATA_PATH / "network_edges.parquet"
network_edges.to_parquet(edges_path, index=False)
print(f"\n✓ Network edges saved to: {edges_path}")
print(f"  File size: {_file_size_mb(edges_path):.2f} MB")

# Player positions
positions_path = PROCESSED_DATA_PATH / "player_positions.parquet"
player_positions.to_parquet(positions_path, index=False)
print(f"\n✓ Player positions saved to: {positions_path}")
print(f"  File size: {_file_size_mb(positions_path):.2f} MB")

print("\n✓ All network data exported successfully!")


=== EXPORTING NETWORK DATA ===
✓ Network summary saved to: ..\data\processed\on_ball_networks.parquet
  File size: 0.84 MB

✓ Network edges saved to: ..\data\processed\network_edges.parquet
  File size: 2.51 MB

✓ Player positions saved to: ..\data\processed\player_positions.parquet
  File size: 1.20 MB

✓ All network data exported successfully!
