In [91]:
import importlib
import src.utils.config_loader
importlib.reload(src.utils.config_loader)

from src.utils.config_loader import ConfigLoader

# Load the project configuration and pull out the three sections used by later cells.
config_loader = ConfigLoader()
all_configs = config_loader.load_configs()
base_configs, graph_configs, community_cfg = (
    config_loader.get_section(all_configs, section_name)
    for section_name in ("base", "graph", "community")
)

In [32]:
import torch
# Report the installed PyTorch / CUDA build, then prefer the GPU whenever CUDA is usable.
cuda_ready = torch.cuda.is_available()
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {cuda_ready}",
    f"CUDA version: {torch.version.cuda}",  # type: ignore
    sep="\n",
)

device = torch.device("cuda") if cuda_ready else torch.device("cpu")
device

PyTorch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8


device(type='cuda')

In [33]:
# Preprocess data
import importlib
import src.modules.data_processor
importlib.reload(src.modules.data_processor)

from src.modules.data_processor import DataProcessor

# Echo the active base configuration so the run records the settings it used.
print(base_configs)

# Build the processor from the base config and run it on the selected device,
# asking for the summary tables as well.
data_processor = DataProcessor(base_configs=base_configs)
processed_data = data_processor.run(summarize=True, device=device)

{'paths': {'raw': 'data/raw/deb_label.csv', 'processed': 'data/processed'}, 'labels': {0: 'disagree', 1: 'neutral', 2: 'agree'}, 'subreddits': {'brexit': 0, 'blacklivesmatter': 1, 'climate': 2, 'democrats': 3, 'republican': 4}, 'required_columns': ['label', 'msg_id_parent', 'msg_id_child', 'submission_id', 'body_parent', 'body_child', 'submission_text', 'subreddit', 'author_parent', 'author_child', 'datetime', 'agreement_fraction', 'individual_kappa'], 'cleaning': {'normalize_subreddits': True, 'rename_columns': {'author_child': 'src_author', 'author_parent': 'dst_author', 'msg_id_child': 'src_comment_id', 'msg_id_parent': 'dst_comment_id', 'body_child': 'src_comment_text', 'body_parent': 'dst_comment_text', 'datetime': 'timestamp'}, 'timestamp_parsing': {'primary_format': '%d/%m/%Y %H:%M', 'dayfirst': True, 'error_handling': 'coerce', 'fallback_formats': ['%Y-%m-%d %H:%M:%S', '%m/%d/%Y %H:%M', '%Y-%m-%d']}, 'remove_self_replies': True}, 'temporal': {'infer_parent_comment_time': {'enab

Unnamed: 0,subreddit,Earliest Date,Latest Date,# Replies,% Self-Replies,% Disagree,% Neutral,% Agree,# Unique Comments,# Unique Authors,Total Timesteps,Avg Timestep Window Size (days),Mean Confidence
0,All,2015-01-03,2021-05-19,42770,0.0,39.6,26.0,34.4,75034,27420,77,69.3,0.525
1,blacklivesmatter,2020-05-27,2021-05-19,1916,0.0,32.9,22.4,44.7,3453,2691,10,33.5,0.557
2,brexit,2016-06-21,2021-05-18,15703,0.0,41.6,29.2,29.2,28039,1400,23,76.0,0.498
3,climate,2015-01-03,2021-05-18,5754,0.0,40.6,27.7,31.7,10131,5377,26,87.0,0.543
4,democrats,2020-01-02,2021-05-19,9606,0.0,35.9,21.9,42.2,16499,8187,9,55.0,0.534
5,republican,2020-01-02,2021-05-19,9791,0.0,40.7,24.6,34.7,16912,10154,9,54.9,0.544


Subreddit timestep info for replies:
Subreddit: blacklivesmatter (replies)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2020-05-27 - 2021-05-19,357,1916
1,0,2020-05-27 - 2020-06-25,29,604
2,1,2020-06-26 - 2020-07-25,29,328
3,2,2020-07-26 - 2020-08-24,29,174
4,3,2020-08-25 - 2020-09-23,29,183
5,4,2020-09-24 - 2020-10-23,29,167
6,5,2020-10-24 - 2020-11-22,29,140
7,6,2020-11-23 - 2020-12-22,28,71
8,7,2020-12-23 - 2021-01-21,29,77
9,8,2021-01-23 - 2021-03-17,53,61


Subreddit: brexit (replies)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2016-06-21 - 2021-05-18,1792,15703
1,0,2016-06-21 - 2017-10-13,479,122
2,1,2017-10-14 - 2017-12-12,54,96
3,2,2017-12-13 - 2018-02-10,58,62
4,3,2018-02-11 - 2018-04-11,59,125
5,4,2018-04-12 - 2018-06-10,58,150
6,5,2018-06-11 - 2018-08-09,58,311
7,6,2018-08-10 - 2018-10-08,59,198
8,7,2018-10-09 - 2018-12-07,59,190
9,8,2018-12-08 - 2019-02-05,59,640


Subreddit: climate (replies)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2015-01-03 - 2021-05-18,2327,5754
1,0,2015-01-03 - 2015-04-02,88,120
2,1,2015-04-03 - 2015-07-01,85,110
3,2,2015-07-02 - 2015-09-29,88,127
4,3,2015-09-30 - 2015-12-28,87,111
5,4,2015-12-29 - 2016-03-27,89,112
6,5,2016-03-28 - 2016-06-25,85,101
7,6,2016-06-26 - 2016-09-23,87,122
8,7,2016-09-24 - 2016-12-22,89,222
9,8,2016-12-23 - 2017-03-22,86,149


Subreddit: democrats (replies)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2020-01-02 - 2021-05-19,503,9606
1,0,2020-01-02 - 2020-03-01,59,1125
2,1,2020-03-02 - 2020-04-30,59,1423
3,2,2020-05-01 - 2020-06-29,59,1456
4,3,2020-06-30 - 2020-08-28,59,1114
5,4,2020-08-29 - 2020-10-27,59,1108
6,5,2020-10-28 - 2020-12-26,59,1370
7,6,2020-12-27 - 2021-02-24,59,961
8,7,2021-02-25 - 2021-04-25,59,669
9,8,2021-04-26 - 2021-05-19,23,380


Subreddit: republican (replies)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2020-01-02 - 2021-05-19,503,9791
1,0,2020-01-02 - 2020-03-01,59,470
2,1,2020-03-02 - 2020-04-30,59,766
3,2,2020-05-01 - 2020-06-29,59,1158
4,3,2020-06-30 - 2020-08-28,59,1383
5,4,2020-08-29 - 2020-10-27,59,1698
6,5,2020-10-28 - 2020-12-26,59,1909
7,6,2020-12-27 - 2021-02-24,59,839
8,7,2021-02-25 - 2021-04-25,59,1015
9,8,2021-04-26 - 2021-05-19,22,553


Subreddit timestep info for comments:
Subreddit: blacklivesmatter (comments)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2020-05-27 - 2021-05-19,357,3453
1,0,2020-05-27 - 2020-06-25,29,1097
2,1,2020-06-26 - 2020-07-25,29,587
3,2,2020-07-26 - 2020-08-24,29,317
4,3,2020-08-25 - 2020-09-23,29,332
5,4,2020-09-24 - 2020-10-23,29,292
6,5,2020-10-24 - 2020-11-22,29,250
7,6,2020-11-23 - 2020-12-22,28,128
8,7,2020-12-23 - 2021-01-21,29,143
9,8,2021-01-23 - 2021-03-17,53,114


Subreddit: brexit (comments)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2016-06-21 - 2021-05-18,1792,28039
1,0,2016-06-21 - 2017-10-13,479,225
2,1,2017-10-14 - 2017-12-12,54,173
3,2,2017-12-13 - 2018-02-10,58,114
4,3,2018-02-11 - 2018-04-11,59,222
5,4,2018-04-12 - 2018-06-10,58,269
6,5,2018-06-11 - 2018-08-09,58,568
7,6,2018-08-10 - 2018-10-08,59,352
8,7,2018-10-09 - 2018-12-07,59,358
9,8,2018-12-08 - 2019-02-05,59,1158


Subreddit: climate (comments)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2015-01-03 - 2021-05-18,2327,10131
1,0,2015-01-03 - 2015-04-02,88,220
2,1,2015-04-03 - 2015-07-01,85,205
3,2,2015-07-02 - 2015-09-29,88,233
4,3,2015-09-30 - 2015-12-28,87,201
5,4,2015-12-29 - 2016-03-27,89,206
6,5,2016-03-28 - 2016-06-25,85,181
7,6,2016-06-26 - 2016-09-23,87,222
8,7,2016-09-24 - 2016-12-22,89,392
9,8,2016-12-23 - 2017-03-22,86,273


Subreddit: democrats (comments)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2020-01-02 - 2021-05-19,503,16499
1,0,2020-01-02 - 2020-03-01,59,1936
2,1,2020-03-02 - 2020-04-30,59,2402
3,2,2020-05-01 - 2020-06-29,59,2476
4,3,2020-06-30 - 2020-08-28,59,1920
5,4,2020-08-29 - 2020-10-27,59,1931
6,5,2020-10-28 - 2020-12-26,59,2365
7,6,2020-12-27 - 2021-02-24,59,1658
8,7,2021-02-25 - 2021-04-25,59,1155
9,8,2021-04-26 - 2021-05-19,23,656


Subreddit: republican (comments)


Unnamed: 0,timestep,interval,actual_window_size,total_count
0,All,2020-01-02 - 2021-05-19,503,16912
1,0,2020-01-02 - 2020-03-01,59,810
2,1,2020-03-02 - 2020-04-30,59,1316
3,2,2020-05-01 - 2020-06-29,59,1995
4,3,2020-06-30 - 2020-08-28,59,2399
5,4,2020-08-29 - 2020-10-27,59,2937
6,5,2020-10-28 - 2020-12-26,59,3289
7,6,2020-12-27 - 2021-02-24,59,1448
8,7,2021-02-25 - 2021-04-25,59,1730
9,8,2021-04-26 - 2021-05-19,22,988


In [85]:
# Build graphs
import importlib
import src.modules.graph_processor
importlib.reload(src.modules.graph_processor)

from src.modules.graph_processor import GraphProcessor

# The printed base config keeps the output directory under paths.processed, so use
# that entry when no top-level 'processed_path' key exists; the literal default is
# only the last resort.
processed_path = (
    base_configs.get('processed_path')
    or (base_configs.get('paths') or {}).get('processed', 'data/processed')
)
pairs = processed_data.user_pairs
comments = processed_data.comments

# Build the graph snapshots from the user pairs; the comments table is passed as
# the embeddings source.
graph_processor = GraphProcessor(graph_configs=graph_configs, processed_path=processed_path)
graph_data = graph_processor.run(pairs=pairs, embeddings_source=comments)

Building node features with pooling: mean
    + Total unique authors in pairs: 35257
    + Total pooled vectors: 35212
    + Pooled vector dimension: 384
Building graph snapshots: directed=True, wcc_mode=min_edges, edge_attrs=['mean_confidence', 'net_vector']
[Subreddit 0, T0] [WCC:min_edges] Nodes 32->29, Edges 86->84 (2.3% edges removed)
[Subreddit 0, T1] [WCC:min_edges] Nodes 30->30, Edges 79->79 (0.0% edges removed)
[Subreddit 0, T2] [WCC:min_edges] Nodes 31->31, Edges 59->59 (0.0% edges removed)
[Subreddit 0, T3] [WCC:min_edges] Nodes 43->43, Edges 107->107 (0.0% edges removed)
[Subreddit 0, T4] [WCC:min_edges] Nodes 53->53, Edges 134->134 (0.0% edges removed)
[Subreddit 0, T5] [WCC:min_edges] Nodes 71->71, Edges 266->266 (0.0% edges removed)
[Subreddit 0, T6] [WCC:min_edges] Nodes 79->79, Edges 187->187 (0.0% edges removed)
[Subreddit 0, T7] [WCC:min_edges] Nodes 96->94, Edges 177->176 (0.6% edges removed)
[Subreddit 0, T8] [WCC:min_edges] Nodes 187->187, Edges 611->611 (0.0% edg

In [104]:
# Run community detection
import importlib
import src.modules.community_processor
importlib.reload(src.modules.community_processor)

from src.modules.community_processor import LeidenCommunityProcessor

graph_dict = graph_data.graph_dict

print(community_cfg)
# The printed base config keeps the output directory under paths.processed, so use
# that entry when no top-level 'processed_path' key exists; the literal default is
# only the last resort.
processed_path = (
    base_configs.get('processed_path')
    or (base_configs.get('paths') or {}).get('processed', 'data/processed')
)
leiden_processor = LeidenCommunityProcessor(community_configs=community_cfg, output_dir=processed_path)

# Run optimization analysis
print("=============================================================")
print("Optimization Analysis for Community Detection")
print("=============================================================")
opt_meta, scans_df, best_df, summary_df = leiden_processor.analyze_optimal_community_parameters(graph_dict)
print("Optimization meta:", opt_meta)
print("\nBest resolutions per graph (head):")
# Either frame may be None; show a literal "None" instead of failing in that case.
display(best_df.head() if best_df is not None else "None")
print("Resolution summary:")
display(summary_df if summary_df is not None else "None")

# Community detection results using best parameters
print("=============================================================")
print("Community Detection Results using Best Parameters")
print("=============================================================")
community_result = leiden_processor.run_community_detection(
    graph_dict=graph_dict,
    use_optimization=True,
    save=True
)

print("\nPartitions:")
display(community_result.partitions)
print("Meta:")
print(community_result.meta)

{'algorithm': 'leiden', 'seed': 42, 'weights': {'strategy': 'agreement_diff'}, 'optimization': {'mode': 'per_graph', 'metric': 'modularity', 'min_communities': 2, 'min_community_size': 3, 'max_communities_factor': 0.8, 'resolution': {'grid': [0.01, 0.05, 0.07, 0.1, 0.2, 0.3, 0.5, 0.8, 1.0], 'early_stop': {'window': 5, 'delta': 0.001}}}, 'algorithms': {'leiden': {'partition_type': 'RBConfigurationVertexPartition'}}}
Optimization Analysis for Community Detection
Optimization meta: {'mode': 'per_graph', 'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'agreement_diff', 'uniform_choice': None, 'uniform_scores': [], 'res_grid': [0.01, 0.05, 0.07, 0.1, 0.2, 0.3, 0.5, 0.8, 1.0]}

Best resolutions per graph (head):


Unnamed: 0,subreddit_id,timestep,resolution,num_nodes,num_edges,num_communities,min_size,max_size,mean_size,metric_value,quality,modularity,fragmentation_limit
0,0,0,0.8,29,84,3,6,13,9.666667,52.361905,52.361905,0.237812,4.308132
1,0,1,0.5,30,79,2,9,21,15.0,83.860759,83.860759,0.213427,4.38178
2,0,2,0.3,31,59,2,3,28,15.5,83.472881,83.472881,0.064206,4.454211
3,0,3,0.8,43,107,4,4,14,10.75,78.179439,78.179439,0.304786,5.245951
4,0,4,0.8,53,134,4,11,14,13.25,101.498507,101.498507,0.327885,5.824088


Resolution summary:


Unnamed: 0,resolution,metric_mean,metric_std,modularity_mean,num_communities_mean,num_communities_min,num_communities_max,community_size_mean,community_size_min,community_size_max
0,0.01,782.815153,822.514678,0.455226,15.943396,2,52,23.330765,3,1208
1,0.05,706.566415,753.940743,0.371887,9.363636,3,18,31.229781,4,1029
2,0.07,721.019937,771.882067,0.426374,9.7,3,19,30.200844,4,1020
3,0.1,675.010291,777.979569,0.495861,9.578947,2,22,27.973152,4,930
4,0.2,669.793299,726.444759,0.654328,10.4,2,23,29.956534,4,351
5,0.3,739.042511,679.176472,0.439491,8.173913,2,25,46.639884,3,262
6,0.5,597.964195,646.296226,0.474333,8.318182,2,31,33.526057,3,312
7,0.8,447.317388,366.367383,0.420027,7.103448,3,28,26.569063,3,88
8,1.0,442.882568,266.610819,0.354034,8.590909,5,13,21.132914,3,50


Community Detection Results using Best Parameters
Saved partitions to data/processed\communities/partitions.parquet
Saved labels arrays to data/processed\communities/labels_arrays.npz
Saved index mapping to data/processed\communities/labels_index.json
Saved name mapping to data/processed\communities/labels_name.json

Partitions:


Unnamed: 0,subreddit_id,timestep,resolution_used,num_nodes,num_edges,num_communities,modularity,community_sizes
0,0,0,0.80,29,84,3,0.237812,"[13, 10, 6]"
1,0,1,0.50,30,79,2,0.213427,"[21, 9]"
2,0,2,0.30,31,59,2,0.064206,"[28, 3]"
3,0,3,0.80,43,107,4,0.304786,"[14, 13, 12, 4]"
4,0,4,0.80,53,134,4,0.327885,"[14, 14, 14, 11]"
...,...,...,...,...,...,...,...,...
72,4,4,0.01,1267,1314,52,0.332251,"[975, 18, 16, 13, 12, 10, 10, 7, 7, 6, 6, 6, 6..."
73,4,5,0.01,1462,1538,49,0.249497,"[1208, 13, 12, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6..."
74,4,6,0.01,633,610,39,0.566009,"[385, 16, 14, 12, 11, 10, 9, 9, 9, 8, 8, 7, 7,..."
75,4,7,0.01,797,819,27,0.211530,"[679, 7, 7, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4..."


Meta:
{'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'agreement_diff', 'use_optimization': True, 'force_resolution': None, 'optimization_meta': {'mode': 'per_graph', 'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'agreement_diff', 'uniform_choice': None, 'uniform_scores': [], 'res_grid': [0.01, 0.05, 0.07, 0.1, 0.2, 0.3, 0.5, 0.8, 1.0]}}


In [37]:
# Load trained model and extract GNN node embeddings
import importlib
import src.utils.gnn_checkpointing
importlib.reload(src.utils.gnn_checkpointing)

from src.utils.gnn_checkpointing import load_model_checkpoint

# Checkpoint file and the dotted path of the model class to load it into.
ckpt_path = "checkpoints/best_model_2508152029.pth"
model_class_path = "src.models.multitask_debate_gnn.MultitaskDebateGNN"

gnn_model, gnn_ckpt = load_model_checkpoint(ckpt_path, device=device, model_class_path=model_class_path)

# Put the model in evaluation mode before it is used for embedding extraction.
gnn_model.eval()
print(f"Loaded: {type(gnn_model).__name__}")
print(f"Checkpoint keys: {[*gnn_ckpt.keys()]}")

Loading model checkpoint from checkpoints/best_model_2508152029.pth
Loaded: MultitaskDebateGNN
Checkpoint keys: ['state_dict', 'model_args', 'train_args']


In [None]:
import umap
import numpy as np

def reduce_embeddings_umap(
    embeddings,
    target_dim: int = 50,
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    metric: str = 'euclidean',
    random_state: int = 42,
    verbose: bool = False
) -> np.ndarray:
    """Project a 2-D embedding matrix down to ``target_dim`` columns with UMAP.

    Args:
        embeddings: 2-D embeddings as a ``torch.Tensor``, a NumPy array, or any
            array-like (nested lists are converted with ``np.asarray``).
        target_dim: Number of output dimensions; must be smaller than the
            input's second dimension.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        metric: Forwarded to ``umap.UMAP``.
        random_state: Forwarded to ``umap.UMAP``.
        verbose: Forwarded to ``umap.UMAP``; also prints the shape change.

    Returns:
        The reduced embeddings as a NumPy array.

    Raises:
        ValueError: If the input is not 2-D, or ``target_dim`` is not smaller
            than the input dimension.
    """
    if isinstance(embeddings, torch.Tensor):
        # detach() first: Tensor.numpy() refuses tensors that still require grad.
        embeddings = embeddings.detach().cpu().numpy()
    elif not hasattr(embeddings, "ndim"):
        # Plain Python sequences have no ndim/shape; objects that already expose
        # them are passed through untouched.
        embeddings = np.asarray(embeddings)

    # Validate input:
    if embeddings.ndim != 2:
        raise ValueError(f"Expected 2D embeddings, got shape {embeddings.shape}")
    if target_dim >= embeddings.shape[1]:
        raise ValueError(f"Target dimension {target_dim} must be less than input dimension {embeddings.shape[1]}.")

    # Initialize UMAP
    reducer = umap.UMAP(
        n_components=target_dim,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        metric=metric,
        random_state=random_state,
        verbose=verbose
    )
    # Fit and transform
    reduced_embeddings = reducer.fit_transform(embeddings)

    # If the result is not a numpy ndarray, convert it
    if not isinstance(reduced_embeddings, np.ndarray):
        reduced_embeddings = np.array(reduced_embeddings)
    if verbose:
        print(f"Reduced embeddings from {embeddings.shape} to {reduced_embeddings.shape}")

    return reduced_embeddings

In [203]:
# Set-up for the per-graph echo-chamber analysis: inputs, imports, warning filters,
# the community processor, and the result container.

import warnings

import numpy as np
import networkx as nx
from torch_geometric.utils import to_networkx

from src.baselines.echogae import EchoChamberMeasure, EchoGAE_algorithm
from src.modules.community_processor import LeidenCommunityProcessor

pyg_graphs = graph_data.pyg_graphs
graph_dict = graph_data.graph_dict
pyg_node_map = graph_data.pyg_node_map

# Silence two known, repeated warnings so the per-graph log stays readable.
warnings.filterwarnings("ignore", message="Converting a tensor with requires_grad=True to a scalar")
warnings.filterwarnings("ignore", message="'train_test_split_edges' is deprecated")

# Initialize community processor.
# The printed base config keeps the output directory under paths.processed, so use
# that entry when no top-level 'processed_path' key exists; the literal default is
# only the last resort.
processed_path = (
    base_configs.get('processed_path')
    or (base_configs.get('paths') or {}).get('processed', 'data/processed')
)
leiden_processor = LeidenCommunityProcessor(community_configs=community_cfg, output_dir=processed_path)

# Results keyed as processed_dict[subreddit_id][timestep].
processed_dict = {}

# The constraint thresholds and the id -> name lookup are the same for every graph,
# so resolve them once instead of on each iteration.
optimization_cfg = community_cfg.get('optimization', {})
min_communities_required = optimization_cfg.get('min_communities', 2)
min_size_required = optimization_cfg.get('min_community_size', 10)
theoretical_min_nodes = min_communities_required * min_size_required
subreddit_map = {v: k for k, v in base_configs['subreddits'].items()}

for pyg_graph in pyg_graphs:
    sub = getattr(pyg_graph, 'subreddit_id', None)
    ts = getattr(pyg_graph, 'local_timestep', None)
    print(f"Processing graph for subreddit: {sub}, timestep: {ts}")

    if sub is None or ts is None:
        print("Skipping - missing subreddit_id or timestep")
        continue

    # Node features as a NumPy array (None when the snapshot carries no features).
    node_x = getattr(pyg_graph, 'x', None)
    node_features = node_x.detach().cpu().numpy() if node_x is not None else None

    # Keep edge direction in the NetworkX copy; EchoGAE is given an undirected view.
    nx_graph = to_networkx(pyg_graph, to_undirected=False)
    num_nodes = nx_graph.number_of_nodes()
    num_edges = nx_graph.number_of_edges()

    # Get EchoGAE node embeddings
    echogae_embeddings, _, _, _, _ = EchoGAE_algorithm(
        G=nx_graph.to_undirected(),
        user_embeddings=node_features,
        show_progress=False,
        epochs=50,
        hidden_channels=100,
        out_channels=50,
        seed=42
    )

    # Get GNN node embeddings
    gnn_embeddings = gnn_model.embed(pyg_graph, device=device, eval_mode=True) if gnn_model else None

    # Run optimization for this single graph.
    # NOTE(review): the returned frames are not used below; the call is kept ahead of
    # run_community_detection(use_optimization=True), which presumably relies on the
    # processor having run the scan -- confirm against LeidenCommunityProcessor.
    single_graph_dict = {sub: {ts: graph_dict[sub][ts]}}
    opt_meta, scans_df, best_df, summary_df = leiden_processor.analyze_optimal_community_parameters(single_graph_dict)

    # Run community detection with optimization results
    community_result = leiden_processor.run_community_detection(
        graph_dict=single_graph_dict,
        use_optimization=True,
        save=False
    )

    # Get community labels and partition
    comm_labels = community_result.labels_array[sub][ts]
    partition_nx = community_result.labels_name_dict[sub][ts]  # node_id -> community_id

    # One pass yields both the community ids and their sizes (as plain Python ints).
    unique_communities, community_counts = np.unique(comm_labels, return_counts=True)
    comm_sizes = community_counts.tolist()
    num_communities = len(unique_communities)
    min_comm_size = min(comm_sizes) if comm_sizes else 0
    max_comm_size = max(comm_sizes) if comm_sizes else 0
    mean_comm_size = np.mean(comm_sizes) if comm_sizes else 0.0

    print(f"  Graph stats: {num_nodes} nodes, {num_edges} edges")
    print(f"  Communities found: {num_communities} (min required: {min_communities_required})")
    print(f"  Community sizes (min={min_comm_size}): {comm_sizes} (min size required: {min_size_required})")

    # Check if constraints can theoretically be satisfied
    if num_nodes < theoretical_min_nodes:
        print(f"  WARNING: Graph has {num_nodes} nodes but needs at least {theoretical_min_nodes} to satisfy constraints!")

    # Modularity comes from the partitions table; it stays None when no row matches.
    nx_modularity = None
    if hasattr(community_result, 'partitions') and not community_result.partitions.empty:
        partitions = community_result.partitions
        partition_row = partitions[
            (partitions['subreddit_id'] == sub) &
            (partitions['timestep'] == ts)
        ]
        if not partition_row.empty:
            nx_modularity = partition_row['modularity'].iloc[0]

    # Formatting None with ':.3f' raises TypeError, so fall back to a placeholder.
    modularity_text = f"{nx_modularity:.3f}" if nx_modularity is not None else "n/a"
    print(f"  Modularity: {modularity_text}")

    # Group member ids by community, straight from the partition (user_id -> community_id).
    nx_comm_nodes = {comm_id: [] for comm_id in unique_communities}
    for user_id, comm_id in partition_nx.items():
        if comm_id in nx_comm_nodes:
            nx_comm_nodes[comm_id].append(user_id)

    # Print the per-community member counts so they can be compared with comm_sizes.
    comm_nodes_sizes = {comm_id: len(users) for comm_id, users in nx_comm_nodes.items()}
    print(f"  Community node counts: {comm_nodes_sizes}")

    # Detailed community information
    community_info = {
        'subreddit_id': sub,
        'subreddit': subreddit_map.get(sub, f'Unknown_{sub}'),
        'timestep': ts,
        'num_nodes': num_nodes,
        'num_edges': num_edges,
        'num_communities': num_communities,
        'modularity': nx_modularity,
        'min_comm_size': min_comm_size,
        'max_comm_size': max_comm_size,
        'mean_comm_size': mean_comm_size,
        'partition': partition_nx,
        'comm_labels': comm_labels,
        'comm_sizes': comm_sizes,
        'comm_nodes': nx_comm_nodes,
    }

    # Overall and per-community echo chamber index (ECI) for EchoGAE
    echogae_ecm = EchoChamberMeasure(
        users_representations=echogae_embeddings,
        labels=comm_labels,
    )
    echogae_eci = echogae_ecm.echo_chamber_index()
    echogae_comm_eci = [echogae_ecm.community_echo_chamber_index(comm_id) for comm_id in unique_communities]
    print(f"  EchoGAE ECI: {echogae_eci:.4f} | Community ECIs: {[f'{eci:.4f}' for eci in echogae_comm_eci]}")

    # Overall and per-community echo chamber index (ECI) for DebateGNN
    debgnn_ecm = EchoChamberMeasure(
        users_representations=gnn_embeddings,
        labels=comm_labels,
    )
    debgnn_eci = debgnn_ecm.echo_chamber_index()
    debgnn_comm_eci = [debgnn_ecm.community_echo_chamber_index(comm_id) for comm_id in unique_communities]
    print(f"  DebateGNN ECI: {debgnn_eci:.4f} | Community ECIs: {[f'{eci:.4f}' for eci in debgnn_comm_eci]}")

    # Store all data for this graph
    processed_dict.setdefault(sub, {})[ts] = {
        'echogae_embeddings': echogae_embeddings,
        'gnn_embeddings': gnn_embeddings.detach().cpu().numpy() if gnn_embeddings is not None else None,
        'node_features': node_features,
        'community_info': community_info,
        'echogae_eci': echogae_eci,
        'echogae_comm_eci': echogae_comm_eci,
        'debgnn_eci': debgnn_eci,
        'debgnn_comm_eci': debgnn_comm_eci,
    }

print(f"\nProcessed {len(processed_dict)} subreddits")

Processing graph for subreddit: 0, timestep: 0
Using device: cuda
  Graph stats: 29 nodes, 84 edges
  Communities found: 3 (min required: 2)
  Community sizes (min=6): [np.int64(13), np.int64(10), np.int64(6)] (min size required: 3)
  Modularity: 0.238
  Community node counts: {np.int64(0): 13, np.int64(1): 10, np.int64(2): 6}
  EchoGAE ECI: 0.5073 | Community ECIs: ['0.4814', '0.5453', '0.5001']
  DebateGNN ECI: 0.5192 | Community ECIs: ['0.5976', '0.4440', '0.4745']
Processing graph for subreddit: 0, timestep: 1
Using device: cuda
  Graph stats: 30 nodes, 79 edges
  Communities found: 2 (min required: 2)
  Community sizes (min=9): [np.int64(21), np.int64(9)] (min size required: 3)
  Modularity: 0.213
  Community node counts: {np.int64(0): 21, np.int64(1): 9}
  EchoGAE ECI: 0.5563 | Community ECIs: ['0.5736', '0.5157']
  DebateGNN ECI: 0.5208 | Community ECIs: ['0.5063', '0.5548']
Processing graph for subreddit: 0, timestep: 2
Using device: cuda
  Graph stats: 31 nodes, 59 edges
  Com

In [204]:
from pathlib import Path

import pandas as pd

# Flatten processed_dict (subreddit -> timestep -> results) into one row per graph.
combined_data = []

for subreddit_id, timesteps in processed_dict.items():
    for timestep, data in timesteps.items():
        community_info = data['community_info']
        modularity = community_info['modularity']

        combined_data.append({
            'subreddit_id': subreddit_id,
            'subreddit': community_info['subreddit'],
            'timestep': timestep,
            'num_nodes': community_info['num_nodes'],
            'num_edges': community_info['num_edges'],
            'num_communities': community_info['num_communities'],
            'modularity': round(modularity, 3) if modularity is not None else None,
            'min_comm_size': community_info['min_comm_size'],
            'max_comm_size': community_info['max_comm_size'],
            'mean_comm_size': round(community_info['mean_comm_size'], 1),
            'echogae_eci': round(data['echogae_eci'], 3),
            'debgnn_eci': round(data['debgnn_eci'], 3),
            # Convert numpy scalars to native Python types so the lists print and serialize cleanly
            'comm_sizes': [int(size) for size in community_info['comm_sizes']],
            'echogae_comm_eci': [round(float(eci), 3) for eci in data['echogae_comm_eci']],
            'debgnn_comm_eci': [round(float(eci), 3) for eci in data['debgnn_comm_eci']]
        })

# One sort is enough: adding the delta columns below does not reorder rows.
eci_df = (
    pd.DataFrame(combined_data)
    .sort_values(['subreddit_id', 'timestep'])
    .reset_index(drop=True)
)

# Timestep-to-timestep change in ECI within each subreddit. The inputs are already
# rounded to 3 decimals; rounding the difference to 4 only strips float noise.
eci_df['delta_echogae_eci'] = eci_df.groupby('subreddit_id')['echogae_eci'].diff().round(4)
eci_df['delta_debgnn_eci'] = eci_df.groupby('subreddit_id')['debgnn_eci'].diff().round(4)

print(f"Created combined DataFrame with {len(eci_df)} rows")
print("\nDataFrame columns:", list(eci_df.columns))
print("\nFirst few rows:")
display(eci_df.head(11))

# Save results to csv; create the folder first because to_csv does not create it.
results_file = Path('results/eci_results.csv')
results_file.parent.mkdir(parents=True, exist_ok=True)
eci_df.to_csv(results_file, index=False)
print("Saved results to results/eci_results.csv")

Created combined DataFrame with 77 rows

DataFrame columns: ['subreddit_id', 'subreddit', 'timestep', 'num_nodes', 'num_edges', 'num_communities', 'modularity', 'min_comm_size', 'max_comm_size', 'mean_comm_size', 'echogae_eci', 'debgnn_eci', 'comm_sizes', 'echogae_comm_eci', 'debgnn_comm_eci', 'delta_echogae_eci', 'delta_debgnn_eci']

First few rows:


Unnamed: 0,subreddit_id,subreddit,timestep,num_nodes,num_edges,num_communities,modularity,min_comm_size,max_comm_size,mean_comm_size,echogae_eci,debgnn_eci,comm_sizes,echogae_comm_eci,debgnn_comm_eci,delta_echogae_eci,delta_debgnn_eci
0,0,brexit,0,29,84,3,0.238,6,13,9.7,0.507,0.519,"[13, 10, 6]","[0.481, 0.545, 0.5]","[0.598, 0.444, 0.474]",,
1,0,brexit,1,30,79,2,0.213,9,21,15.0,0.556,0.521,"[21, 9]","[0.574, 0.516]","[0.506, 0.555]",0.049,0.002
2,0,brexit,2,31,59,2,0.064,3,28,15.5,0.436,0.51,"[28, 3]","[0.404, 0.734]","[0.502, 0.582]",-0.12,-0.011
3,0,brexit,3,43,107,4,0.305,4,14,10.8,0.48,0.486,"[14, 13, 12, 4]","[0.468, 0.477, 0.452, 0.612]","[0.509, 0.435, 0.495, 0.544]",0.044,-0.024
4,0,brexit,4,53,134,4,0.328,11,14,13.2,0.444,0.489,"[14, 14, 14, 11]","[0.382, 0.695, 0.358, 0.313]","[0.452, 0.538, 0.465, 0.502]",-0.036,0.003
5,0,brexit,5,71,266,3,0.279,22,26,23.7,0.489,0.484,"[26, 23, 22]","[0.488, 0.567, 0.408]","[0.527, 0.446, 0.474]",0.045,-0.005
6,0,brexit,6,79,187,2,0.296,36,43,39.5,0.513,0.512,"[43, 36]","[0.503, 0.524]","[0.53, 0.49]",0.024,0.028
7,0,brexit,7,94,176,2,0.033,4,90,47.0,0.541,0.527,"[90, 4]","[0.537, 0.653]","[0.527, 0.53]",0.028,0.015
8,0,brexit,8,187,611,6,0.339,12,48,31.2,0.411,0.438,"[48, 38, 34, 32, 23, 12]","[0.37, 0.371, 0.364, 0.418, 0.609, 0.445]","[0.407, 0.474, 0.412, 0.408, 0.464, 0.551]",-0.13,-0.089
9,0,brexit,9,271,1090,6,0.303,11,70,45.2,0.456,0.46,"[70, 68, 65, 36, 21, 11]","[0.444, 0.431, 0.439, 0.538, 0.491, 0.446]","[0.454, 0.424, 0.48, 0.517, 0.442, 0.453]",0.045,0.022


Saved results to results/eci_results.csv


In [205]:
from pathlib import Path

# Per-subreddit summary: number of timesteps plus mean/min/max of each ECI series
# and of its timestep-to-timestep delta.
eci_stats = ['mean', 'min', 'max']
delta_eci_summary = eci_df.groupby('subreddit').agg({
    'timestep': 'count',
    'echogae_eci': eci_stats,
    'delta_echogae_eci': eci_stats,
    'debgnn_eci': eci_stats,
    'delta_debgnn_eci': eci_stats,
})
display(delta_eci_summary)

# Save results to csv; create the folder first because to_csv does not create it.
summary_file = Path('results/delta_eci_summary.csv')
summary_file.parent.mkdir(parents=True, exist_ok=True)
delta_eci_summary.to_csv(summary_file)
print("Saved summary to results/delta_eci_summary.csv")

Unnamed: 0_level_0,timestep,echogae_eci,echogae_eci,echogae_eci,delta_echogae_eci,delta_echogae_eci,delta_echogae_eci,debgnn_eci,debgnn_eci,debgnn_eci,delta_debgnn_eci,delta_debgnn_eci,delta_debgnn_eci
Unnamed: 0_level_1,count,mean,min,max,mean,min,max,mean,min,max,mean,min,max
subreddit,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
blacklivesmatter,10,0.4852,0.436,0.524,0.005889,-0.025,0.035,0.3794,0.244,0.524,0.018667,-0.112,0.094
brexit,23,0.45987,0.305,0.63,0.000364,-0.183,0.22,0.474,0.376,0.577,-0.001273,-0.13,0.112
climate,26,0.463269,0.406,0.526,-0.00188,-0.061,0.046,0.375346,0.285,0.516,0.00108,-0.154,0.162
democrats,9,0.409778,0.365,0.445,0.00325,-0.037,0.08,0.280444,0.22,0.339,0.007,-0.073,0.119
republican,9,0.419111,0.389,0.45,-0.0015,-0.034,0.046,0.264222,0.208,0.338,-0.00275,-0.088,0.06


Saved summary to results/delta_eci_summary.csv


### **Community Evolution**
By Subreddit

In [206]:
# Compute Jaccard evolution for a subreddit
def compute_jaccard_similarity(c1, c2):
    """Return |A ∩ B| / |A ∪ B| for two collections of users.

    Duplicates within either collection are ignored. Two empty collections
    give 0.0 rather than a division by zero.
    """
    members_a = set(c1)
    members_b = set(c2)
    combined = members_a | members_b
    if not combined:
        return 0.0
    return len(members_a & members_b) / len(combined)

def compute_community_jaccard_matrix(comms_t1, comms_t2):
    """Build the pairwise Jaccard matrix between two community mappings.

    Both arguments map a community id to its members. Rows follow the sorted
    ids of ``comms_t1`` and columns the sorted ids of ``comms_t2``.

    Returns:
        (matrix, row_ids, col_ids), where ``matrix[r, c]`` is the Jaccard
        similarity of community ``row_ids[r]`` and community ``col_ids[c]``.
    """
    row_ids = sorted(comms_t1.keys())
    col_ids = sorted(comms_t2.keys())

    scores = [
        [compute_jaccard_similarity(comms_t1[row_id], comms_t2[col_id]) for col_id in col_ids]
        for row_id in row_ids
    ]
    # The reshape keeps the (rows, cols) shape even when one side has no communities.
    jaccard_matrix = np.array(scores, dtype=float).reshape(len(row_ids), len(col_ids))

    return jaccard_matrix, row_ids, col_ids

def analyze_user_migration_statistics(comms_t1, comms_t2):
    """Placeholder with no implementation in this cell; always returns ``None``."""
    return None


def find_hungarian_best_matches(comms_t1, comms_t2):
    """Placeholder with no implementation in this cell; always returns ``None``."""
    return None


def _print_jaccard_matrix(jaccard_matrix, t1, t2):
    """Placeholder with no implementation in this cell; prints nothing."""
    return None


def _print_migration_stats(migration_stats):
    """Placeholder with no implementation in this cell; prints nothing."""
    return None

def compute_community_jaccard_evolution(subreddit_data, verbose=True):
    """Draft skeleton of the per-subreddit community evolution analysis.

    Walks consecutive timestep pairs of `subreddit_data` but does not yet
    compute anything for them: the returned lists are always empty.
    A fuller function with the same name is defined in a later cell.

    Args:
        subreddit_data: dict keyed by timestep; every value must contain a
            'community_info' entry (a missing entry raises KeyError).
        verbose: whether to print progress messages.

    Returns:
        None when fewer than two timesteps are present, otherwise a dict
        with the keys 'ts_pair', 'jaccard_matrices', 'best_matches' and
        'migration_stats', each mapped to an empty list.
    """
    timeline = list(subreddit_data.keys())
    timeline.sort()

    if len(timeline) < 2:
        if verbose:
            print("Not enough timesteps to compute evolution.")
        return None

    if verbose:
        print(f"Computing evolution for {len(timeline)} timesteps: {timeline}")

    result_keys = ('ts_pair', 'jaccard_matrices', 'best_matches', 'migration_stats')
    evolution_data = {key: [] for key in result_keys}

    for earlier, later in zip(timeline, timeline[1:]):
        # Not used yet; the lookups are kept so that a timestep without
        # 'community_info' still fails loudly with a KeyError.
        comm_data_t1 = subreddit_data[earlier]['community_info']
        comm_data_t2 = subreddit_data[later]['community_info']

    return evolution_data


In [219]:
import numpy as np
from scipy.optimize import linear_sum_assignment

def compute_jaccard_similarity(c1, c2):
    """Jaccard similarity of two communities given as iterables of users.

    The inputs are de-duplicated by converting them to sets. Returns the
    size of the intersection divided by the size of the union, or 0.0 when
    the union is empty.
    """
    left = set(c1)
    right = set(c2)

    n_union = len(left | right)
    if n_union == 0:
        return 0.0

    n_shared = len(left & right)
    return n_shared / n_union

def compute_community_jaccard_matrix(comms_t1, comms_t2):
    """
    Pairwise Jaccard similarities between the communities of two timesteps.

    Args:
        comms_t1: dict {community_id: list_of_users} for the earlier timestep
        comms_t2: dict {community_id: list_of_users} for the later timestep

    Returns:
        jaccard_matrix: 2D numpy array, [i, j] = Jaccard(comm_i_t1, comm_j_t2)
        comm_ids_t1: sorted community IDs of comms_t1 (row order)
        comm_ids_t2: sorted community IDs of comms_t2 (column order)
    """
    comm_ids_t1 = sorted(comms_t1.keys())
    comm_ids_t2 = sorted(comms_t2.keys())

    # Resolve the member lists once, in the same order as the id lists.
    row_members = [comms_t1[comm_id] for comm_id in comm_ids_t1]
    col_members = [comms_t2[comm_id] for comm_id in comm_ids_t2]

    jaccard_matrix = np.zeros((len(row_members), len(col_members)))
    for i, users_t1 in enumerate(row_members):
        for j, users_t2 in enumerate(col_members):
            jaccard_matrix[i, j] = compute_jaccard_similarity(users_t1, users_t2)

    return jaccard_matrix, comm_ids_t1, comm_ids_t2

def find_best_matches_hungarian(jaccard_matrix, comm_ids_t1, comm_ids_t2, comms_t1, comms_t2,
                                min_jaccard=0.0):
    """
    Find the community matching that maximizes the total Jaccard similarity.

    The assignment is solved with scipy's `linear_sum_assignment`, which
    accepts rectangular matrices directly: when the two timesteps have a
    different number of communities, the surplus communities are simply left
    unassigned and reported as 'disappeared' (t1 side) or 'emerged' (t2 side).

    Args:
        jaccard_matrix: 2D array, [i, j] = Jaccard(comm_ids_t1[i], comm_ids_t2[j])
        comm_ids_t1, comm_ids_t2: community IDs labelling rows / columns
        comms_t1, comms_t2: dicts {community_id: users}, used for sizes
        min_jaccard: assigned pairs whose Jaccard score is below this value
            are not reported as matches; both communities are then listed as
            'disappeared' / 'emerged' instead. The default of 0.0 keeps every
            assigned pair (including zero-overlap pairs). Pass a small
            positive value to drop pairs that share no users.

    Returns:
        matches: list of dicts with keys 't1_comm', 't2_comm', 'jaccard',
            't1_size', 't2_size' and 'assignment_type' ('matched',
            'disappeared' or 'emerged'); matched entries come first, then
            disappeared, then emerged.
        total_jaccard: sum (plain float) of the Jaccard scores of all
            'matched' entries.
    """
    jaccard_matrix = np.asarray(jaccard_matrix, dtype=float)

    matches = []
    total_jaccard = 0.0
    matched_t1, matched_t2 = set(), set()

    # With no communities on one side there is nothing to assign.
    if jaccard_matrix.size:
        # maximize=True solves the max-similarity problem directly, so no
        # (1 - Jaccard) cost conversion or square padding is required.
        row_indices, col_indices = linear_sum_assignment(jaccard_matrix, maximize=True)

        for i, j in zip(row_indices, col_indices):
            # Store plain Python floats rather than numpy scalars.
            jaccard_score = float(jaccard_matrix[i, j])
            if jaccard_score < min_jaccard:
                continue  # too weak: handled as disappeared / emerged below

            comm_t1 = comm_ids_t1[i]
            comm_t2 = comm_ids_t2[j]
            matches.append({
                't1_comm': comm_t1,
                't2_comm': comm_t2,
                'jaccard': jaccard_score,
                't1_size': len(comms_t1[comm_t1]),
                't2_size': len(comms_t2[comm_t2]),
                'assignment_type': 'matched'
            })
            matched_t1.add(comm_t1)
            matched_t2.add(comm_t2)
            total_jaccard += jaccard_score

    # t1 communities without an accepted partner (disappeared)
    for comm_t1 in comm_ids_t1:
        if comm_t1 not in matched_t1:
            matches.append({
                't1_comm': comm_t1,
                't2_comm': None,
                'jaccard': 0.0,
                't1_size': len(comms_t1[comm_t1]),
                't2_size': 0,
                'assignment_type': 'disappeared'
            })

    # t2 communities without an accepted partner (emerged)
    for comm_t2 in comm_ids_t2:
        if comm_t2 not in matched_t2:
            matches.append({
                't1_comm': None,
                't2_comm': comm_t2,
                'jaccard': 0.0,
                't1_size': 0,
                't2_size': len(comms_t2[comm_t2]),
                'assignment_type': 'emerged'
            })

    return matches, total_jaccard

def analyze_user_migration_statistics(comms_t1, comms_t2):
    """Summarize how the overall user base changed between two timesteps.

    Args:
        comms_t1: dict {community_id: users} for the earlier timestep
        comms_t2: dict {community_id: users} for the later timestep

    Returns:
        dict with the user counts 'total_t1', 'total_t2', 'retained', 'new'
        and 'lost', plus 'retention_rate' (retained / total_t1) and
        'growth_rate' (new / total_t1). Both rates are 0 when the earlier
        timestep has no users.
    """
    # Flatten every community into one set of users per timestep.
    before = {user for members in comms_t1.values() for user in members} if comms_t1 else set()
    after = {user for members in comms_t2.values() for user in members} if comms_t2 else set()

    n_retained = len(before & after)
    n_new = len(after - before)
    n_lost = len(before - after)

    n_before = len(before)
    if n_before:
        retention_rate = n_retained / n_before
        growth_rate = n_new / n_before
    else:
        retention_rate = growth_rate = 0

    return {
        'total_t1': n_before,
        'total_t2': len(after),
        'retained': n_retained,
        'new': n_new,
        'lost': n_lost,
        'retention_rate': retention_rate,
        'growth_rate': growth_rate
    }

def _print_jaccard_matrix(jaccard_matrix, t1, t2, comm_ids_t1, comm_ids_t2, max_display=None):
    """Print formatted Jaccard matrix with proper alignment."""
    print(f"\nJaccard Similarity Matrix (T{t1} -> T{t2}):")
    print(f"Rows: T{t1} communities, Columns: T{t2} communities")
    print(f"Shape: {jaccard_matrix.shape}")
    
    # Determine how many rows/cols to display
    if max_display is None:
        max_rows = len(comm_ids_t1)
        max_cols = len(comm_ids_t2)
        print("Full matrix:")
    else:
        max_rows = min(max_display, len(comm_ids_t1))
        max_cols = min(max_display, len(comm_ids_t2))
        print(f"Matrix (first {max_rows}x{max_cols}):")
    
    # Create column headers with actual timestep values
    col_headers = [f"T{t2}_C{comm_ids_t2[j]}" for j in range(max_cols)]
    header_str = "      " + " ".join([f"{header:>8}" for header in col_headers])
    print(header_str)
    
    # Print rows with actual timestep values
    display_matrix = jaccard_matrix[:max_rows, :max_cols]
    for i, row in enumerate(display_matrix):
        if i < len(comm_ids_t1):
            row_label = f"T{t1}_C{comm_ids_t1[i]}"
            row_str = " ".join([f"{val:>8.3f}" for val in row])
            print(f"{row_label:>5}: {row_str}")

def _print_migration_stats(migration_stats):
    """Print user migration statistics."""
    print(f"User Migration Statistics:")
    print(f"  Total users t1: {migration_stats['total_t1']}")
    print(f"  Total users t2: {migration_stats['total_t2']}")
    print(f"  Retained: {migration_stats['retained']} ({migration_stats['retention_rate']:.2%})")
    print(f"  New: {migration_stats['new']} ({migration_stats['growth_rate']:.2%})")
    print(f"  Lost: {migration_stats['lost']}")

def _print_hungarian_matches(matches, total_jaccard, t1=None, t2=None):
    """Print Hungarian algorithm matching results."""
    print(f"Hungarian Matching Results (Total Jaccard: {total_jaccard:.3f}):")
    
    # Use actual timestep values if provided
    t1_label = f"T{t1}" if t1 is not None else "T1"
    t2_label = f"T{t2}" if t2 is not None else "T2"
    
    # Group by assignment type
    matched = [m for m in matches if m['assignment_type'] == 'matched']
    disappeared = [m for m in matches if m['assignment_type'] == 'disappeared']
    emerged = [m for m in matches if m['assignment_type'] == 'emerged']
    
    print("  Matched communities:")
    for match in sorted(matched, key=lambda x: x['jaccard'], reverse=True):
        print(f"    {t1_label}_C{match['t1_comm']}({match['t1_size']}) -> {t2_label}_C{match['t2_comm']}({match['t2_size']}) | J={match['jaccard']:.3f}")
    
    if disappeared:
        print("  Disappeared communities:")
        for match in disappeared:
            print(f"    {t1_label}_C{match['t1_comm']}({match['t1_size']}) -> DISAPPEARED")
    
    if emerged:
        print("  Emerged communities:")
        for match in emerged:
            print(f"    EMERGED -> {t2_label}_C{match['t2_comm']}({match['t2_size']})")

def compute_community_jaccard_evolution(subreddit_data, verbose=True, matrix_display_limit=None):
    """
    Track how one subreddit's communities change between consecutive timesteps.

    For every pair of neighbouring timesteps this computes the Jaccard
    matrix, the Hungarian matching and the user migration statistics.

    Args:
        subreddit_data: dict {timestep: {'community_info': {...}, ...}};
            each 'community_info' must provide 'comm_nodes'
            ({comm_id: [users]}), and the first one is also read for
            'subreddit' when verbose is True.
        verbose: whether to print a report for every transition
        matrix_display_limit: passed to the matrix printer as max_display;
            None prints each full matrix

    Returns:
        None when fewer than two timesteps exist, otherwise a dict with
        'timesteps' (the sorted timesteps) and the parallel per-transition
        lists 'ts_pairs', 'jaccard_matrices', 'hungarian_matches',
        'total_jaccards', 'migration_stats', 'comm_ids_t1', 'comm_ids_t2'.
    """
    timeline = sorted(subreddit_data.keys())
    if len(timeline) < 2:
        if verbose:
            print("Not enough timesteps to compute evolution.")
        return None

    per_step_keys = (
        'ts_pairs',
        'jaccard_matrices',
        'hungarian_matches',
        'total_jaccards',
        'migration_stats',
        'comm_ids_t1',
        'comm_ids_t2',
    )
    evolution_data = {'timesteps': timeline}
    evolution_data.update({key: [] for key in per_step_keys})

    if verbose:
        subreddit_name = subreddit_data[timeline[0]]['community_info']['subreddit']
        print(f"Computing evolution for {subreddit_name} across {len(timeline)} timesteps: {timeline}")

    for prev_t, next_t in zip(timeline, timeline[1:]):
        info_prev = subreddit_data[prev_t]['community_info']
        info_next = subreddit_data[next_t]['community_info']

        # {comm_id: [users]} on each side of the transition
        members_prev = info_prev['comm_nodes']
        members_next = info_next['comm_nodes']

        similarity, ids_prev, ids_next = compute_community_jaccard_matrix(members_prev, members_next)
        pairing, pairing_score = find_best_matches_hungarian(
            similarity, ids_prev, ids_next, members_prev, members_next
        )
        migration = analyze_user_migration_statistics(members_prev, members_next)

        step_record = {
            'ts_pairs': (prev_t, next_t),
            'jaccard_matrices': similarity,
            'hungarian_matches': pairing,
            'total_jaccards': pairing_score,
            'migration_stats': migration,
            'comm_ids_t1': ids_prev,
            'comm_ids_t2': ids_next,
        }
        for key, value in step_record.items():
            evolution_data[key].append(value)

        if verbose:
            print(f"\n--- Timestep {prev_t} -> {next_t} ---")
            print(f"Communities: {len(members_prev)} -> {len(members_next)}")
            _print_jaccard_matrix(similarity, prev_t, next_t, ids_prev, ids_next, max_display=matrix_display_limit)
            _print_hungarian_matches(pairing, pairing_score, prev_t, next_t)
            _print_migration_stats(migration)

    return evolution_data

# Example usage: run the evolution analysis on processed_dict[0] and print the
# report for every timestep transition, showing at most a 10x10 corner of each
# Jaccard matrix. The result is assigned to `_`, presumably so the returned dict
# is not echoed as the cell's output.
# NOTE(review): processed_dict comes from an earlier cell that is not visible
# here; key 0 is presumably the 'brexit' subreddit id from the base config - confirm.
_ = compute_community_jaccard_evolution(processed_dict[0], verbose=True, matrix_display_limit=10)

Computing evolution for brexit across 23 timesteps: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]

--- Timestep 0 -> 1 ---
Communities: 3 -> 2

Jaccard Similarity Matrix (T0 -> T1):
Rows: T0 communities, Columns: T1 communities
Shape: (3, 2)
Matrix (first 3x2):
         T1_C0    T1_C1
T0_C0:    0.259    0.100
T0_C1:    0.348    0.118
T0_C2:    0.038    0.071
Hungarian Matching Results (Total Jaccard: 0.448):
  Matched communities:
    T0_C1(10) -> T1_C0(21) | J=0.348
    T0_C0(13) -> T1_C1(9) | J=0.100
  Disappeared communities:
    T0_C2(6) -> DISAPPEARED
User Migration Statistics:
  Total users t1: 29
  Total users t2: 30
  Retained: 21 (72.41%)
  New: 9 (31.03%)
  Lost: 8

--- Timestep 1 -> 2 ---
Communities: 2 -> 2

Jaccard Similarity Matrix (T1 -> T2):
Rows: T1 communities, Columns: T2 communities
Shape: (2, 2)
Matrix (first 2x2):
         T2_C0    T2_C1
T1_C0:    0.400    0.043
T1_C1:    0.194    0.000
Hungarian Matching Results (Total Jaccard