In [43]:
# Load the project configuration and split it into the sections used by the
# later cells. The loader module is reloaded first so that source edits made
# since the last import take effect without restarting the kernel.
import importlib
import src.utils.config_loader

# importlib.reload returns the re-executed module, so the class can be taken
# straight from its return value.
ConfigLoader = importlib.reload(src.utils.config_loader).ConfigLoader

config_loader = ConfigLoader()
all_configs = config_loader.load_configs()

# Sections are fetched in the order base -> graph -> community.
base_configs, graph_configs, community_cfg = (
    config_loader.get_section(all_configs, section_name)
    for section_name in ("base", "graph", "community")
)

In [44]:
# Report the PyTorch / CUDA environment and pick the compute device
# ("cuda" when available, otherwise "cpu") used by the later cells.
import torch

cuda_available = torch.cuda.is_available()

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
print(f"CUDA version: {torch.version.cuda}") # type: ignore

device_name = "cuda" if cuda_available else "cpu"
device = torch.device(device_name)

PyTorch version: 2.5.1
CUDA available: True
CUDA version: 12.4


In [45]:
# Preprocess data
# The data processor module is reloaded before use so that on-disk edits are
# picked up without restarting the kernel.
import importlib
import src.modules.data_processor

# importlib.reload returns the re-executed module; take the class from it.
DataProcessor = importlib.reload(src.modules.data_processor).DataProcessor

# Echo the base configuration that is handed to the processor.
print(base_configs)

data_processor = DataProcessor(base_configs=base_configs)
processed_data = data_processor.run(
    device=device,
    summarize=False,
)

{'paths': {'raw': 'data/raw/deb_label.csv', 'processed': 'data/processed'}, 'labels': {0: 'disagree', 1: 'neutral', 2: 'agree'}, 'subreddits': {'brexit': 0, 'blacklivesmatter': 1, 'climate': 2, 'democrats': 3, 'republican': 4}, 'required_columns': ['label', 'msg_id_parent', 'msg_id_child', 'submission_id', 'body_parent', 'body_child', 'submission_text', 'subreddit', 'author_parent', 'author_child', 'datetime', 'agreement_fraction', 'individual_kappa'], 'cleaning': {'normalize_subreddits': True, 'rename_columns': {'author_child': 'src_author', 'author_parent': 'dst_author', 'msg_id_child': 'src_comment_id', 'msg_id_parent': 'dst_comment_id', 'body_child': 'src_comment_text', 'body_parent': 'dst_comment_text', 'datetime': 'timestamp'}, 'timestamp_parsing': {'primary_format': '%d/%m/%Y %H:%M', 'dayfirst': True, 'error_handling': 'coerce', 'fallback_formats': ['%Y-%m-%d %H:%M:%S', '%m/%d/%Y %H:%M', '%Y-%m-%d']}, 'remove_self_replies': True}, 'temporal': {'infer_parent_comment_time': {'enab

In [46]:
# Build graphs
# The graph processor module is reloaded first so that on-disk edits are
# picked up without restarting the kernel.
import importlib
import src.modules.graph_processor
importlib.reload(src.modules.graph_processor)

from src.modules.graph_processor import GraphProcessor

# The processed-data directory lives under base_configs['paths']['processed']
# (as shown by the base config printed in the preprocessing cell). There is no
# top-level 'processed_path' key, so looking that up always fell through to the
# hard-coded default and silently ignored the configured value.
# 'data/processed' is kept as the fallback when the section or key is missing.
processed_path = (base_configs.get('paths') or {}).get('processed', 'data/processed')

# Inputs produced by the preprocessing cell.
pairs = processed_data.user_pairs
comments = processed_data.comments

graph_processor = GraphProcessor(graph_configs=graph_configs, processed_path=processed_path)
# The comments object is passed as the source of embeddings for the graph build.
graph_data = graph_processor.run(pairs=pairs, embeddings_source=comments)

Building node features with pooling: mean
    + Total unique authors in pairs: 35257
    + Total pooled vectors: 35212
    + Pooled vector dimension: 384
Building graph snapshots: directed=True, wcc_mode=topk_coverage, edge_attrs=['mean_confidence', 'net_vector']
[Subreddit 0, T0] [WCC:topk_coverage] Nodes 32->29, Edges 86->84 (2.3% edges removed)
[Subreddit 0, T1] [WCC:topk_coverage] Nodes 30->30, Edges 79->79 (0.0% edges removed)
[Subreddit 0, T2] [WCC:topk_coverage] Nodes 31->31, Edges 59->59 (0.0% edges removed)
[Subreddit 0, T3] [WCC:topk_coverage] Nodes 43->43, Edges 107->107 (0.0% edges removed)
[Subreddit 0, T4] [WCC:topk_coverage] Nodes 53->53, Edges 134->134 (0.0% edges removed)
[Subreddit 0, T5] [WCC:topk_coverage] Nodes 71->71, Edges 266->266 (0.0% edges removed)
[Subreddit 0, T6] [WCC:topk_coverage] Nodes 79->79, Edges 187->187 (0.0% edges removed)
[Subreddit 0, T7] [WCC:topk_coverage] Nodes 96->94, Edges 177->176 (0.6% edges removed)
[Subreddit 0, T8] [WCC:topk_coverage] 

In [47]:
# Run community detection
# The community processor module is reloaded first so that on-disk edits are
# picked up without restarting the kernel.
import importlib
import src.modules.community_processor
importlib.reload(src.modules.community_processor)

from src.modules.community_processor import LeidenCommunityProcessor

graph_dict = graph_data.graph_dict

print(community_cfg)

# The processed-data directory lives under base_configs['paths']['processed']
# (as shown by the base config printed in the preprocessing cell). There is no
# top-level 'processed_path' key, so looking that up always fell through to the
# hard-coded default and silently ignored the configured output location.
# 'data/processed' is kept as the fallback when the section or key is missing.
processed_path = (base_configs.get('paths') or {}).get('processed', 'data/processed')
leiden_processor = LeidenCommunityProcessor(community_configs=community_cfg, output_dir=processed_path)

# Horizontal rule printed above and below each section title.
section_rule = "============================================================="

# Run optimization analysis
print(section_rule)
print("Optimization Analysis for Community Detection")
print(section_rule)
opt_meta, scans_df, best_df, summary_df = leiden_processor.analyze_optimal_community_parameters(graph_dict)
print("Optimization meta:", opt_meta)
print("\nBest resolutions per graph (head):")
# Either frame may be None; show a placeholder string in that case.
display(best_df.head() if best_df is not None else "None")
print("Resolution summary:")
display(summary_df if summary_df is not None else "None")

# Community detection results using best parameters
print(section_rule)
print("Community Detection Results using Best Parameters")
print(section_rule)
# save=True asks the processor to persist its results (under output_dir, judging
# by the "Saved ..." lines it prints).
community_result = leiden_processor.run_community_detection(
    graph_dict=graph_dict,
    use_optimization=True,
    save=True
)

print("\nPartitions:")
display(community_result.partitions)
print("Meta:")
print(community_result.meta)

{'algorithm': 'leiden', 'seed': 42, 'weights': {'strategy': 'low_neutrality'}, 'optimization': {'mode': 'per_graph', 'metric': 'modularity', 'min_communities': 3, 'min_community_size': 10, 'max_communities_factor': 0.8, 'resolution': {'grid': [0.05, 0.1, 0.2, 0.4, 0.6, 0.9, 1.0], 'early_stop': {'window': 3, 'delta': 0.001}}}, 'algorithms': {'leiden': {'partition_type': 'RBConfigurationVertexPartition'}}}
Optimization Analysis for Community Detection
Optimization meta: {'mode': 'per_graph', 'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'low_neutrality', 'uniform_choice': None, 'uniform_scores': [], 'res_grid': [0.05, 0.1, 0.2, 0.4, 0.6, 0.9, 1.0]}

Best resolutions per graph (head):


Unnamed: 0,subreddit_id,timestep,resolution,num_nodes,num_edges,num_communities,min_size,max_size,mean_size,metric_value,modularity
0,0,5,1.0,71,266,4,15,23,17.75,0.288117,0.288117
1,0,7,0.9,94,176,7,10,18,13.428571,0.475223,0.475223
2,0,8,1.0,187,611,9,16,30,20.777778,0.350063,0.350063
3,0,10,1.0,214,555,9,15,30,23.777778,0.402388,0.402388
4,0,11,1.0,261,943,9,13,46,29.0,0.325407,0.325407


Resolution summary:


Unnamed: 0,resolution,metric_mean,metric_std,modularity_mean,num_communities_mean,num_communities_min,num_communities_max,community_size_mean,community_size_min,community_size_max
0,0.05,0.110628,0.042378,0.110628,4.333333,3,6,207.916667,11,833
1,0.1,0.521998,0.151723,0.521998,4.4,3,7,130.566667,11,635
2,0.2,0.674607,0.114137,0.674607,7.0,3,10,80.949405,11,407
3,0.4,0.773443,0.065553,0.773443,12.714286,5,17,51.476494,11,133
4,0.6,0.659049,0.20818,0.659049,12.6,3,21,43.199365,11,111
5,0.9,0.455786,0.205215,0.455786,12.0,4,27,30.622484,10,76
6,1.0,0.383382,0.129674,0.383382,10.357143,4,28,26.563013,10,54


Community Detection Results using Best Parameters
Saved partitions to data/processed/communities/partitions.parquet
Saved labels arrays to data/processed/communities/labels_arrays.npz
Saved index mapping to data/processed/communities/labels_index.json
Saved name mapping to data/processed/communities/labels_name.json

Partitions:


Unnamed: 0,subreddit_id,timestep,resolution_used,num_nodes,num_edges,num_communities,modularity,community_sizes
0,0,0,1.0,29,84,3,0.249787,"[11, 9, 9]"
1,0,1,1.0,30,79,4,0.326791,"[9, 9, 6, 6]"
2,0,2,1.0,31,59,6,0.402183,"[7, 5, 5, 5, 5, 4]"
3,0,3,1.0,43,107,6,0.312997,"[12, 10, 7, 7, 4, 3]"
4,0,4,1.0,53,134,6,0.337464,"[12, 11, 9, 8, 7, 6]"
...,...,...,...,...,...,...,...,...
72,4,4,1.0,1195,1260,60,0.879579,"[90, 47, 47, 45, 45, 44, 43, 39, 39, 38, 38, 3..."
73,4,5,1.0,1317,1426,45,0.870164,"[86, 60, 55, 52, 50, 48, 47, 46, 45, 45, 41, 4..."
74,4,6,1.0,657,626,62,0.926102,"[35, 34, 31, 27, 26, 25, 24, 24, 23, 23, 22, 2..."
75,4,7,1.0,719,760,31,0.879852,"[56, 45, 43, 40, 39, 38, 38, 37, 34, 30, 30, 2..."


Meta:
{'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'low_neutrality', 'use_optimization': True, 'force_resolution': None, 'optimization_meta': {'mode': 'per_graph', 'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'low_neutrality', 'uniform_choice': None, 'uniform_scores': [], 'res_grid': [0.05, 0.1, 0.2, 0.4, 0.6, 0.9, 1.0]}}


In [48]:
# Distinct values of the num_communities column across the partitions table.
community_result.partitions["num_communities"].unique()

array([  3,   4,   6,   7,   9,  10,  11,   8, 119,  77,  57,  46,  44,
        32,  29,  39,  30,  26,   5,  12,  15,  25,  14,  13,  19,  16,
        20,  38,  21,  24,  27,  28,  49,  60,  45,  62,  31])