In [33]:
import importlib
import src.utils.config_loader
# Reload first so edits to the loader module are picked up without a kernel restart.
importlib.reload(src.utils.config_loader)

from src.utils.config_loader import ConfigLoader

# Load every configuration once, then pull out the sections used by later cells.
config_loader = ConfigLoader()
all_configs = config_loader.load_configs()
base_configs, graph_configs, community_cfg = (
    config_loader.get_section(all_configs, section)
    for section in ("base", "graph", "community")
)

In [2]:
import torch

# Report the installed PyTorch build and whether CUDA is usable from it.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}") # type: ignore

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

PyTorch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8


In [3]:
# Preprocess data
import importlib
import src.modules.data_processor
# Reload before the from-import so the class below comes from the fresh module.
importlib.reload(src.modules.data_processor)

from src.modules.data_processor import DataProcessor

# Show the base configuration the processor is about to receive.
print(base_configs)
data_processor = DataProcessor(base_configs=base_configs)
processed_data = data_processor.run(summarize=False, device=device)

{'paths': {'raw': 'data/raw/deb_label.csv', 'processed': 'data/processed'}, 'labels': {0: 'disagree', 1: 'neutral', 2: 'agree'}, 'subreddits': {'brexit': 0, 'blacklivesmatter': 1, 'climate': 2, 'democrats': 3, 'republican': 4}, 'required_columns': ['label', 'msg_id_parent', 'msg_id_child', 'submission_id', 'body_parent', 'body_child', 'submission_text', 'subreddit', 'author_parent', 'author_child', 'datetime', 'agreement_fraction', 'individual_kappa'], 'cleaning': {'normalize_subreddits': True, 'rename_columns': {'author_child': 'src_author', 'author_parent': 'dst_author', 'msg_id_child': 'src_comment_id', 'msg_id_parent': 'dst_comment_id', 'body_child': 'src_comment_text', 'body_parent': 'dst_comment_text', 'datetime': 'timestamp'}, 'timestamp_parsing': {'primary_format': '%d/%m/%Y %H:%M', 'dayfirst': True, 'error_handling': 'coerce', 'fallback_formats': ['%Y-%m-%d %H:%M:%S', '%m/%d/%Y %H:%M', '%Y-%m-%d']}, 'remove_self_replies': True}, 'temporal': {'infer_parent_comment_time': {'enab

In [58]:
# Build graphs
import importlib
import src.modules.graph_processor
# Reload before the from-import so the class below comes from the fresh module.
importlib.reload(src.modules.graph_processor)

from src.modules.graph_processor import GraphProcessor

# The printed base config keeps the processed directory under
# base_configs['paths']['processed']; no top-level 'processed_path' key is
# visible in that (truncated) print, so looking up only 'processed_path'
# always fell back to the literal default and ignored the configured path.
# An explicit 'processed_path' still wins if one is ever provided.
# NOTE(review): assumes base_configs behaves like a plain dict — confirm.
processed_path = (
    base_configs.get('processed_path')
    or (base_configs.get('paths') or {}).get('processed', 'data/processed')
)
pairs = processed_data.user_pairs
comments = processed_data.comments

# Graph snapshots are built from the user pairs; the comments object is passed
# as the embeddings source.
graph_processor = GraphProcessor(graph_configs=graph_configs, processed_path=processed_path)
graph_data = graph_processor.run(pairs=pairs, embeddings_source=comments)

Building node features with pooling: mean
    + Total unique authors in pairs: 35257
    + Total pooled vectors: 35212
    + Pooled vector dimension: 384
Building graph snapshots: directed=True, use_wcc=True, edge_attrs=['mean_confidence', 'net_vector']
    + [Subreddit 0, T0] 2 edges filtered by WCC (86 -> 84)
    + [Subreddit 0, T7] 1 edges filtered by WCC (177 -> 176)
    + [Subreddit 0, T11] 1 edges filtered by WCC (944 -> 943)
    + [Subreddit 0, T13] 4 edges filtered by WCC (701 -> 697)
    + [Subreddit 0, T15] 1 edges filtered by WCC (1195 -> 1194)
    + [Subreddit 0, T16] 2 edges filtered by WCC (608 -> 606)
    + [Subreddit 0, T18] 1 edges filtered by WCC (948 -> 947)
    + [Subreddit 0, T19] 1 edges filtered by WCC (893 -> 892)
    + [Subreddit 0, T21] 2 edges filtered by WCC (544 -> 542)
    + [Subreddit 0, T22] 4 edges filtered by WCC (421 -> 417)
    + [Subreddit 1, T0] 456 edges filtered by WCC (599 -> 143)
    + [Subreddit 1, T1] 281 edges filtered by WCC (323 -> 42)
   

In [76]:
# Run community detection
import importlib
import src.modules.community_processor
# Reload before the from-import so the class below comes from the fresh module.
importlib.reload(src.modules.community_processor)

from src.modules.community_processor import LeidenCommunityProcessor

graph_dict = graph_data.graph_dict

print(community_cfg)
# The printed base config keeps the processed directory under
# base_configs['paths']['processed']; no top-level 'processed_path' key is
# visible in that (truncated) print, so looking up only 'processed_path'
# always fell back to the literal default and ignored the configured path.
# An explicit 'processed_path' still wins if one is ever provided.
# NOTE(review): assumes base_configs behaves like a plain dict — confirm.
processed_path = (
    base_configs.get('processed_path')
    or (base_configs.get('paths') or {}).get('processed', 'data/processed')
)
leiden_processor = LeidenCommunityProcessor(community_configs=community_cfg, output_dir=processed_path)

# Run optimization analysis
print("=============================================================")
print("Optimization Analysis for Community Detection")
print("=============================================================")
# The analysis returns four values; scans_df is unpacked but not shown here.
opt_meta, scans_df, best_df, summary_df = leiden_processor.analyze_optimal_community_parameters(graph_dict)
print("Optimization meta:", opt_meta)

# Either frame may be None, in which case the literal string "None" is displayed.
print("\nBest resolutions per graph (head):")
if best_df is None:
    display("None")
else:
    display(best_df.head())

print("Resolution summary:")
if summary_df is None:
    display("None")
else:
    display(summary_df)

# Community detection results using best parameters
print("=============================================================")
print("Community Detection Results using Best Parameters")
print("=============================================================")
# save=True: the captured output of this cell lists the files written
# under the processor's output directory.
community_result = leiden_processor.run_community_detection(
    save=True,
    use_optimization=True,
    graph_dict=graph_dict,
)

# Show the per-snapshot partition table, then the run metadata.
print("\nPartitions:")
display(community_result.partitions)
print("Meta:")
print(community_result.meta)

{'algorithm': 'leiden', 'seed': 42, 'weights': {'strategy': 'agreement_diff'}, 'optimization': {'mode': 'per_graph', 'metric': 'modularity', 'min_communities': 2, 'min_community_size': 3, 'max_communities_factor': 1.5, 'resolution': {'grid': [0.01, 0.05, 0.1, 0.25, 0.4, 0.6, 0.9, 1.0], 'early_stop': {'window': 3, 'delta': 0.001}}}, 'algorithms': {'leiden': {'partition_type': 'RBConfigurationVertexPartition'}}}
Optimization Analysis for Community Detection
Optimization meta: {'mode': 'per_graph', 'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'agreement_diff', 'uniform_choice': None, 'uniform_scores': [], 'res_grid': [0.01, 0.05, 0.1, 0.25, 0.4, 0.6, 0.9, 1.0]}

Best resolutions per graph (head):


Unnamed: 0,subreddit_id,timestep,resolution,num_nodes,num_edges,num_communities,min_size,max_size,mean_size,metric_value,modularity
0,0,2,0.9,31,59,5,4,9,6.2,0.403045,0.403045
1,0,10,1.0,214,555,9,15,30,23.777778,0.402388,0.402388
2,0,13,1.0,265,697,11,16,33,24.090909,0.414099,0.414099
3,0,16,1.0,206,606,9,16,37,22.888889,0.365495,0.365495
4,0,18,0.9,264,947,8,18,50,33.0,0.340207,0.340207


Resolution summary:


Unnamed: 0,resolution,metric_mean,metric_std,modularity_mean,num_communities_mean,num_communities_min,num_communities_max,community_size_mean,community_size_min,community_size_max
0,0.01,0.074971,,0.074971,2.0,2,2,604.0,53,1155
1,0.05,0.317899,0.168212,0.317899,3.028571,2,8,129.067891,11,1053
2,0.1,0.531722,0.166008,0.531722,4.0,2,10,72.835229,6,906
3,0.25,0.591583,0.262751,0.591583,5.810345,2,13,44.453785,3,363
4,0.4,0.679227,0.205097,0.679227,8.240741,2,19,28.956283,3,206
5,0.6,0.694737,0.165395,0.694737,10.122807,2,25,24.089772,3,112
6,0.9,0.706966,0.157605,0.706966,13.178571,3,31,18.290026,3,90
7,1.0,0.712826,0.156541,0.712826,14.333333,4,32,17.754046,3,84


Community Detection Results using Best Parameters
Saved partitions to data/processed\communities/partitions.parquet
Saved labels arrays to data/processed\communities/labels_arrays.npz
Saved index mapping to data/processed\communities/labels_index.json
Saved name mapping to data/processed\communities/labels_name.json

Partitions:


Unnamed: 0,subreddit_id,timestep,resolution_used,num_nodes,num_edges,num_communities,modularity,community_sizes
0,0,0,1.0,29,84,3,0.249787,"[11, 9, 9]"
1,0,1,1.0,30,79,4,0.326791,"[9, 9, 6, 6]"
2,0,2,0.9,31,59,5,0.403045,"[9, 7, 6, 5, 4]"
3,0,3,1.0,43,107,6,0.312997,"[12, 10, 7, 7, 4, 3]"
4,0,4,1.0,53,134,6,0.337464,"[12, 11, 9, 8, 7, 6]"
...,...,...,...,...,...,...,...,...
72,4,4,0.9,975,1073,27,0.852925,"[90, 54, 51, 49, 45, 44, 42, 40, 38, 37, 37, 3..."
73,4,5,0.9,1208,1332,31,0.859669,"[59, 55, 53, 52, 52, 49, 47, 43, 43, 42, 42, 4..."
74,4,6,0.9,385,400,19,0.868500,"[31, 27, 25, 25, 24, 23, 23, 22, 21, 20, 20, 1..."
75,4,7,0.9,679,727,23,0.873055,"[51, 45, 44, 43, 41, 39, 38, 37, 30, 29, 28, 2..."


Meta:
{'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'agreement_diff', 'use_optimization': True, 'force_resolution': None, 'optimization_meta': {'mode': 'per_graph', 'algorithm': 'leiden', 'partition_type': 'RBConfigurationVertexPartition', 'weight_strategy': 'agreement_diff', 'uniform_choice': None, 'uniform_scores': [], 'res_grid': [0.01, 0.05, 0.1, 0.25, 0.4, 0.6, 0.9, 1.0]}}


In [None]:
# Load saved community detection artifacts
import os, json
import pandas as pd
import numpy as np

# The printed base config keeps the processed directory under
# base_configs["paths"]["processed"]; no top-level "processed_path" key is
# visible in that (truncated) print, so looking up only "processed_path"
# always fell back to the literal default and ignored the configured path.
# An explicit "processed_path" still wins if one is ever provided.
# NOTE(review): assumes base_configs behaves like a plain dict — confirm.
base_processed = (
    base_configs.get("processed_path")
    or (base_configs.get("paths") or {}).get("processed", "data/processed")
)
comm_dir = os.path.join(base_processed, "communities")

# 1. Partitions table
partitions = pd.read_parquet(os.path.join(comm_dir, "partitions.parquet"))

# 2. Label arrays (each key: f"{sub_id}_{timestep}")
# The archive handle stays open for the rest of the notebook because
# get_labels_array reads from it on demand.
labels_npz = np.load(os.path.join(comm_dir, "labels_arrays.npz"))
# Example: get array for subreddit 12 timestep 3
def get_labels_array(sub_id, ts):
    """Look up the saved labels array for one (subreddit, timestep) snapshot.

    The archive key is built as ``f"{sub_id}_{ts}"``. The result is whatever
    ``labels_npz.get`` yields for that key.
    """
    snapshot_key = f"{sub_id}_{ts}"
    return labels_npz.get(snapshot_key)

# 3. Index mapping stored next to the label arrays
# Access: index_map[str(sub_id)][str(ts)] -> {graph_index (as string): value}
# NOTE(review): originally described as graph_index -> original_node_id, but the
# values printed further down for snapshot ('0', '0') match the labels array
# element for element — confirm what the writer actually stores here.
with open(os.path.join(comm_dir, "labels_index.json"), mode="r") as f:
    index_map = json.loads(f.read())

# 4. Name -> community id mapping (if stored)
# Access: name_map[str(sub_id)][str(ts)] -> {original_node_id (as string): community_id}
with open(os.path.join(comm_dir, "labels_name.json"), mode="r") as f:
    name_map = json.loads(f.read())

# Helper: safe numeric cast
def _maybe_int(x):
    try:
        return int(x)
    except:
        return x  # keep string (e.g., username)

# From name_map directly (only if you need it)
def node_communities_from_name_map(sub_id, ts):
    """Build a per-node community table for one snapshot from ``name_map``.

    Args:
        sub_id: Subreddit id; stringified to index ``name_map``.
        ts: Timestep; stringified to index ``name_map[str(sub_id)]``.

    Returns:
        DataFrame with columns ``node_id``, ``community``, ``subreddit_id``
        and ``timestep``, one row per entry of the snapshot's mapping.

    Raises:
        KeyError: If ``name_map`` has no entry for the requested snapshot.
    """
    nm = name_map[str(sub_id)][str(ts)]
    # Keys may be non-numeric (e.g. usernames).
    df = pd.DataFrame(list(nm.items()), columns=["node_id", "community"])
    # The frame has no "node" column, so indexing df["node"] raised KeyError;
    # the numeric cast belongs on "node_id".
    df["node_id"] = df["node_id"].map(_maybe_int)
    df["subreddit_id"] = sub_id
    df["timestep"] = ts
    return df

# Example usage: take the snapshot in the first row of the partitions table
example_sub = partitions["subreddit_id"].iloc[0]
example_ts = partitions["timestep"].iloc[0]
# Labels array for that snapshot, then the first rows of its node table.
print(get_labels_array(example_sub, example_ts))
display(node_communities_from_name_map(example_sub, example_ts).head(7))


[2 1 0 0 1 0 1 0 2 1 1 0 1 0 2 0 0 1 2 0 0 0 1 2 1 2 2 2 2]


Unnamed: 0,node_id,community,graph_index,subreddit_id,timestep
0,2,2,0,0,0
1,1,1,1,0,0
2,0,0,2,0,0
3,0,0,3,0,0
4,1,1,4,0,0
5,0,0,5,0,0
6,1,1,6,0,0
7,0,0,7,0,0
8,2,2,8,0,0
9,1,1,9,0,0


Unnamed: 0,node_id,community,subreddit_id,timestep
0,APB2710,2,0,0
1,AnomalyNexus,1,0,0
2,ArchbishopMegatronQC,0,0,0
3,Bozata1,0,0,0
4,EthiczGradient,1,0,0
5,Greengoblingogo,0,0,0
6,Heruss100,1,0,0


In [103]:
# Inspect one index_map entry (the example snapshot chosen earlier)
idx_map_raw = index_map[str(example_sub)][str(example_ts)]
# Only the first ten values are printed to keep the output short.
print("Sample original ids:", [*idx_map_raw.values()][:10])

Sample original ids: [2, 1, 0, 0, 1, 0, 1, 0, 2, 1]


In [107]:
# Display the full index mapping for subreddit "0", timestep "0".
index_map["0"]["0"]

{'0': 2,
 '1': 1,
 '2': 0,
 '3': 0,
 '4': 1,
 '5': 0,
 '6': 1,
 '7': 0,
 '8': 2,
 '9': 1,
 '10': 1,
 '11': 0,
 '12': 1,
 '13': 0,
 '14': 2,
 '15': 0,
 '16': 0,
 '17': 1,
 '18': 2,
 '19': 0,
 '20': 0,
 '21': 0,
 '22': 1,
 '23': 2,
 '24': 1,
 '25': 2,
 '26': 2,
 '27': 2,
 '28': 2}