In [1]:
# Reload package
# Re-import the config loader module so that edits made to its source are
# picked up without restarting the kernel.
import importlib
import src.utils.config_loader
importlib.reload(src.utils.config_loader)

# Import the class only after the reload so the name is bound from the
# freshly reloaded module.
from src.utils.config_loader import ConfigLoader

config_loader = ConfigLoader()
# The loaded configuration object is only used here as the input to
# get_section() below.
all_configs = config_loader.load_configs()

# Extract the "base" and "graph" sections; later cells read `base_configs`
# (data processing, output path) and `graph_configs` (graph construction).
base_configs = config_loader.get_section(all_configs, "base")
graph_configs = config_loader.get_section(all_configs, "graph")

In [2]:
# Preprocess data
from src.modules.data_processor import DataProcessor
import torch
# Report the PyTorch / CUDA environment before picking a device.
cuda_available = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_available}")
print(f"CUDA version: {torch.version.cuda}")

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if cuda_available else torch.device("cpu")

# Run the project's preprocessing step on the chosen device.
data_processor = DataProcessor(base_configs=base_configs)
processed_data = data_processor.process_data(device=device)

PyTorch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8
Processed files already exist, loading ['comments.pkl', 'replies.pkl', 'user_pairs.pkl', 'submissions.pkl'] from 'data/processed/'


In [None]:
import networkx as nx
import pandas as pd
from collections import defaultdict
import numpy as np
from typing import Dict, Any, NamedTuple, Tuple
from torch_geometric.data import Data
import torch
import os

class ProcessedData(NamedTuple):
    """Bundle of artifacts returned by ``GraphProcessor.process_data``."""
    # Pooled node embeddings: {subreddit_id: {timestep: {author: embedding}}}
    node_dict: Dict[int, Dict[int, Dict[str, Any]]]
    # One NetworkX graph per snapshot: {subreddit_id: {timestep: graph}}
    graph_dict: Dict[int, Dict[int, nx.Graph]]
    # PyG ``Data`` objects, one per non-empty snapshot, ordered by
    # (subreddit_id, timestep)
    pyg_graphs: list[Data]
    # {(subreddit_id, timestep): {node name: index of that node in the
    # snapshot's sorted node list}}
    pyg_node_map: Dict[Tuple[int, int], Dict[str, int]]

class GraphProcessor:
    """Turn user-pair interactions and comment embeddings into graph snapshots.

    ``process_data`` runs three stages:

    1. pool comment embeddings per (subreddit, timestep, author),
    2. build one NetworkX graph per (subreddit, timestep),
    3. convert every non-empty graph to a PyG ``Data`` object and save the list.
    """

    # Edge attributes used when neither the config nor the caller names any.
    DEFAULT_EDGE_ATTRS = ('mean_confidence', 'net_vector')
    # Width of the zero vector given to a node without an embedding when no
    # other node of the same graph reveals the real width.
    DEFAULT_EMBEDDING_DIM = 384
    # Supported pooling names -> NumPy reduction applied over stacked vectors.
    _POOLERS = {'mean': np.mean, 'sum': np.sum, 'max': np.max}

    def __init__(self, graph_configs, processed_path: str = "data/processed"):
        """Store the graph configuration and the directory used for saved output.

        Args:
            graph_configs: Mapping read with ``.get('node_features', {})`` and
                ``.get('construction', {})`` in ``process_data``.
            processed_path: Directory the PyG graph list is saved into.
        """
        self.processed_path = processed_path
        self.graph_configs = graph_configs
        self.graphs = {}

    def build_node_features(self, embeddings_source: pd.DataFrame, pairs: pd.DataFrame, configs: Dict[str, Any]) -> Tuple[Dict[int, Dict[int, Dict[str, np.ndarray]]], int]:
        """Pool embeddings per (subreddit_id, timestep, author) from [comments] data.

        Args:
            embeddings_source: Frame with columns ``subreddit_id``, ``timestep``,
                ``author`` and ``embeddings`` (arrays or array-likes).
            pairs: Frame with columns ``subreddit_id``, ``timestep``,
                ``src_author`` and ``dst_author``; only authors appearing here
                are kept.
            configs: Node-feature options; ``pooling`` is one of ``'mean'``,
                ``'sum'`` or ``'max'`` (default ``'mean'``).

        Returns:
            ``({subreddit_id: {timestep: {author: pooled vector}}}, emb_dim)``
            where ``emb_dim`` is 0 when nothing was pooled.

        Raises:
            ValueError: If ``pooling`` is not a supported name. This is checked
                up front so a bad setting is reported even for empty input.
        """
        pooling = configs.get('pooling', 'mean')
        reducer = self._POOLERS.get(pooling)
        if reducer is None:
            raise ValueError(f"Unsupported pooling: {pooling}")
        print(f"Building node features with pooling: {pooling}")

        def _pool(arrs):
            # Stack the group's vectors into a 2-D array and reduce over rows.
            return reducer(np.stack(list(arrs), axis=0), axis=0)

        # 1. Collect all authors from user pairs (both edge endpoints)
        endpoint_frames = [
            pairs[['subreddit_id', 'timestep', column]].rename(columns={column: 'author'})
            for column in ('src_author', 'dst_author')
        ]
        all_users = pd.concat(endpoint_frames, ignore_index=True).drop_duplicates()
        print(f"    + Total unique authors in pairs: {len(all_users)}")

        # 2. Filter embeddings to only those authors & snapshots
        embs_df = embeddings_source[['subreddit_id', 'timestep', 'author', 'embeddings']]
        embs_df = embs_df.merge(all_users, on=['subreddit_id', 'timestep', 'author'], how='inner')
        embs_df['embeddings'] = embs_df['embeddings'].map(
            lambda x: x if isinstance(x, np.ndarray) else np.asarray(x, dtype=float)
        )

        # 3. Pool within each (subreddit, timestep, author)
        pooled = embs_df.groupby(['subreddit_id', 'timestep', 'author'], sort=False)['embeddings'].agg(_pool)
        emb_dim = pooled.iloc[0].shape[0] if not pooled.empty else 0
        print(f"    + Total pooled vectors: {len(pooled)}")
        print(f"    + Pooled vector dimension: {emb_dim}")

        # 4. Convert to nested dict
        node_dict: Dict[int, Dict[int, Dict[str, np.ndarray]]] = defaultdict(lambda: defaultdict(dict))
        for (sub, ts, author), vec in pooled.items():   # type: ignore
            node_dict[int(sub)][int(ts)][str(author)] = vec

        # Return plain dicts so missing keys raise instead of being created.
        return {sub: dict(ts_dict) for sub, ts_dict in node_dict.items()}, emb_dim

    def build_graph_snapshots(self, pairs: pd.DataFrame, node_dict: Dict[int, Dict[int, Dict[str, np.ndarray]]], configs: Dict[str, Any]) -> "Dict[int, Dict[int, nx.Graph]]":
        """Build one NetworkX graph per (subreddit_id, timestep) group of ``pairs``.

        Nodes are the authors present in ``node_dict`` for that snapshot (each
        carrying an ``embedding`` attribute); an edge is added only when both
        endpoints are such nodes.

        Args:
            pairs: Frame with ``subreddit_id``, ``timestep``, ``src_author``,
                ``dst_author`` and, optionally, the edge-attribute columns.
            node_dict: ``{subreddit_id: {timestep: {author: embedding}}}``.
            configs: Construction options: ``directed`` (default True),
                ``use_wcc`` (default False; keep only the largest weakly
                connected / connected component) and ``edge_attrs``.

        Returns:
            ``{subreddit_id: {timestep: graph}}``.
        """
        directed = configs.get('directed', True)
        use_wcc = configs.get('use_wcc', False)
        edge_attrs = configs.get('edge_attrs', list(self.DEFAULT_EDGE_ATTRS))
        print(f"Building graph snapshots: directed={directed}, use_wcc={use_wcc}, edge_attrs={edge_attrs}")
        graph_dict = {}

        # Only attributes that exist as columns can be copied onto edges.
        present_attrs = [attr for attr in edge_attrs if attr in pairs.columns]

        # Build from pairs
        for (subreddit_id, timestep), group in pairs.groupby(['subreddit_id', 'timestep']):
            subreddit_id = int(subreddit_id)
            timestep = int(timestep)

            G = nx.DiGraph() if directed else nx.Graph()

            # Add nodes with features
            ts_nodes = node_dict.get(subreddit_id, {}).get(timestep, {})
            for author, embedding in ts_nodes.items():
                G.add_node(author, embedding=embedding)

            # Add edges; iterate the needed columns directly instead of
            # materialising a Series per row with iterrows().
            attr_columns = [group[attr] for attr in present_attrs]
            for src, dst, *values in zip(group['src_author'], group['dst_author'], *attr_columns):
                # Node keys are produced with str(author) in build_node_features,
                # so look endpoints up with the same coercion.
                src, dst = str(src), str(dst)
                if src in G and dst in G:
                    G.add_edge(src, dst, **dict(zip(present_attrs, values)))

            # Apply weakly connected component filtering if requested
            if use_wcc and len(G.nodes()) > 0:
                edges_before_wcc = len(G.edges())
                if directed:
                    components = nx.weakly_connected_components(G)
                else:
                    components = nx.connected_components(G)

                # A graph with at least one node always has a component.
                largest_component = max(components, key=len)
                G = G.subgraph(largest_component).copy()

                # Log filtered
                edges_after_wcc = len(G.edges())
                edges_filtered = edges_before_wcc - edges_after_wcc
                if edges_filtered > 0:
                    print(f"    + [Subreddit {subreddit_id}, T{timestep}] {edges_filtered} edges filtered by WCC ({edges_before_wcc} -> {edges_after_wcc})")

            graph_dict.setdefault(subreddit_id, {})[timestep] = G

        return graph_dict

    @staticmethod
    def _edge_feature_row(edge_data, edge_attrs):
        """Flatten the listed attributes of one edge into a flat list of numbers."""
        row = []
        for attr in edge_attrs:
            if attr not in edge_data:
                continue
            value = edge_data[attr]
            if isinstance(value, (list, tuple, np.ndarray)):
                row.extend(np.ravel(value).tolist())  # Flatten vector attributes
            else:
                row.append(value)
        return row

    @classmethod
    def _infer_edge_attr_dim(cls, graphs_dict, edge_attrs):
        """Return the edge-feature width of the first edge found in any graph."""
        for ts_dict in graphs_dict.values():
            for G in ts_dict.values():
                for _, _, edge_data in G.edges(data=True):
                    return len(cls._edge_feature_row(edge_data, edge_attrs))
        # No edge anywhere: one column per attribute name.
        return len(edge_attrs)

    def build_pyg_graphs(self, graphs_dict, edge_attrs=None) -> tuple[list, dict]:
        """Convert NetworkX graphs to PyG Data objects.

        Args:
            graphs_dict: ``{subreddit_id: {timestep: graph}}``; empty graphs
                are skipped.
            edge_attrs: Names of edge attributes to export, in column order.
                ``None`` uses ``DEFAULT_EDGE_ATTRS``.

        Returns:
            ``(pyg_graphs, master_node_map)`` where ``pyg_graphs`` is ordered by
            (subreddit_id, timestep) and ``master_node_map`` maps
            ``(subreddit_id, timestep)`` to ``{node name: node index}``.
        """
        print("Converting NetworkX graphs to PyG data objects")
        edge_attrs = list(self.DEFAULT_EDGE_ATTRS if edge_attrs is None else edge_attrs)
        # Edgeless graphs must use the same edge-feature width as graphs that
        # do have edges, otherwise the resulting tensors cannot be combined.
        edge_attr_dim = self._infer_edge_attr_dim(graphs_dict, edge_attrs)

        pyg_graphs = []
        master_node_map = {}  # {(subreddit_id, timestep): {node_name: idx}}

        # Sort by subreddit_id and timestep for consistent ordering
        for sub in sorted(graphs_dict.keys()):
            ts_dict = graphs_dict[sub]
            for ts in sorted(ts_dict.keys()):
                G = ts_dict[ts]
                node_list = sorted(G.nodes())
                if not node_list:
                    continue    # Skip empty graphs

                node_map = {node: idx for idx, node in enumerate(node_list)}
                master_node_map[(sub, ts)] = node_map

                # Extract node features; a missing embedding becomes a zero
                # vector whose width matches the other nodes of this graph.
                embeddings = [G.nodes[node].get('embedding') for node in node_list]
                feature_dim = next(
                    (np.shape(emb)[0] for emb in embeddings if emb is not None),
                    self.DEFAULT_EMBEDDING_DIM,
                )
                node_features = np.stack([
                    np.zeros(feature_dim) if emb is None else np.asarray(emb, dtype=float)
                    for emb in embeddings
                ])
                # Convert one contiguous array rather than a list of arrays.
                x = torch.as_tensor(node_features, dtype=torch.float)

                # Extract edges
                edge_list, edge_rows = [], []
                for src, dst, edge_data in G.edges(data=True):
                    edge_list.append([node_map[src], node_map[dst]])
                    edge_rows.append(self._edge_feature_row(edge_data, edge_attrs))

                # Convert to tensors
                if edge_list:
                    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
                    edge_attr = torch.tensor(edge_rows, dtype=torch.float)
                else:
                    edge_index = torch.empty((2, 0), dtype=torch.long)
                    edge_attr = torch.empty((0, edge_attr_dim), dtype=torch.float)

                # Create PyG data object
                pyg_graph = Data(
                    x=x,
                    edge_index=edge_index,
                    edge_attr=edge_attr,
                    num_nodes=len(node_list),
                    num_edges=len(edge_list),
                    node_map=node_map,
                    subreddit_id=sub,
                    local_timestep=ts
                )
                pyg_graphs.append(pyg_graph)

        print(f"    + Created {len(pyg_graphs)} PyG graphs")
        return pyg_graphs, master_node_map

    def process_data(self, pairs: pd.DataFrame, embeddings_source: pd.DataFrame) -> "ProcessedData":
        """Process data to create node features and graph snapshots.

        Runs the three build stages, saves the PyG graph list to
        ``<processed_path>/pyg_graphs_<dim>D.pt`` and returns every
        intermediate artifact as a ``ProcessedData`` tuple.
        """
        # 1. Build node features
        node_cfg = self.graph_configs.get('node_features', {})
        node_dict, vec_dim = self.build_node_features(embeddings_source, pairs, node_cfg)

        # 2. Build graph snapshots
        construction_cfg = self.graph_configs.get('construction', {})
        graph_dict = self.build_graph_snapshots(pairs, node_dict, construction_cfg)

        # 3. Build PyG graphs
        pyg_graphs, pyg_node_map = self.build_pyg_graphs(graph_dict)

        # 4. Save PyG graphs to file (exist_ok avoids a check-then-create race)
        os.makedirs(self.processed_path, exist_ok=True)
        torch.save(pyg_graphs, os.path.join(self.processed_path, f'pyg_graphs_{vec_dim}D.pt'))
        print(f"Saved PyG graphs to {self.processed_path}/pyg_graphs_{vec_dim}D.pt")

        return ProcessedData(
            node_dict=node_dict,
            graph_dict=graph_dict,
            pyg_graphs=pyg_graphs,
            pyg_node_map=pyg_node_map
        )

# Graph snapshots
pairs = processed_data.user_pairs
comments = processed_data.comments
processed_path = base_configs.get('processed_path', 'data/processed')
# Pass the configured output directory on; previously it was read from the
# config but never handed to the processor, so the default was always used.
graph_processor = GraphProcessor(graph_configs, processed_path=processed_path)
graph_data = graph_processor.process_data(pairs, comments)

Building node features with pooling: mean
    + Total unique authors in pairs: 35257
    + Total pooled vectors: 35212
    + Pooled vector dimension: 384
Building graph snapshots: directed=True, use_wcc=True, edge_attrs=['mean_confidence', 'net_vector']
    + [Subreddit 0, T0] 2 edges filtered by WCC (86 -> 84)
    + [Subreddit 0, T7] 1 edges filtered by WCC (177 -> 176)
    + [Subreddit 0, T11] 1 edges filtered by WCC (944 -> 943)
    + [Subreddit 0, T13] 4 edges filtered by WCC (701 -> 697)
    + [Subreddit 0, T15] 1 edges filtered by WCC (1195 -> 1194)
    + [Subreddit 0, T16] 2 edges filtered by WCC (608 -> 606)
    + [Subreddit 0, T18] 1 edges filtered by WCC (948 -> 947)
    + [Subreddit 0, T19] 1 edges filtered by WCC (893 -> 892)
    + [Subreddit 0, T21] 2 edges filtered by WCC (544 -> 542)
    + [Subreddit 0, T22] 4 edges filtered by WCC (421 -> 417)
    + [Subreddit 1, T0] 456 edges filtered by WCC (599 -> 143)
    + [Subreddit 1, T1] 281 edges filtered by WCC (323 -> 42)
   

In [11]:
import igraph as ig
import leidenalg as la
import networkx as nx

# Convert NetworkX graph to igraph for Leiden algorithm
def nx_to_igraph(G):
    """Build an igraph graph with the same nodes and edges as NetworkX graph ``G``.

    Vertices are numbered by the sorted order of ``G``'s nodes and carry the
    original node as their ``name`` attribute. Edge attributes are not copied.

    Returns:
        ``(ig_graph, node_to_idx, idx_to_node)`` - the igraph graph plus the
        node -> vertex index mapping and its inverse.
    """
    ordered_nodes = sorted(G.nodes())

    # Vertex index <-> node name lookups, both derived from the sorted order
    idx_to_node = dict(enumerate(ordered_nodes))
    node_to_idx = {name: position for position, name in idx_to_node.items()}

    # NOTE(review): the graph is created without a direction argument, so
    # igraph's default applies even when G is a DiGraph - confirm intended.
    ig_graph = ig.Graph()
    ig_graph.add_vertices(len(ordered_nodes))
    ig_graph.add_edges([(node_to_idx[u], node_to_idx[v]) for u, v in G.edges()])

    # Keep the original node identifiers on the vertices
    ig_graph.vs['name'] = ordered_nodes

    return ig_graph, node_to_idx, idx_to_node

# Run Leiden community detection on a single snapshot (subreddit 0, timestep 0)
sample_graph = graph_data.graph_dict[0][0]
print(f"Sample graph: {len(sample_graph.nodes())} nodes, {len(sample_graph.edges())} edges")

# Leiden runs on igraph objects, so convert first
ig_graph, node_to_idx, idx_to_node = nx_to_igraph(sample_graph)

# Modularity-based partition with a fixed seed for repeatable output
partition = la.find_partition(ig_graph, la.ModularityVertexPartition, seed=42)

print(f"Found {len(partition)} communities")
print(f"Modularity: {partition.modularity:.3f}")

# Group node names by the community each vertex was assigned to
communities = {}
for i, community_id in enumerate(partition.membership):
    node_name = idx_to_node[i]
    if community_id not in communities:
        communities[community_id] = []
    communities[community_id].append(node_name)

# Show at most five members per community
for comm_id, members in communities.items():
    print(f"Community {comm_id}: {len(members)} members: {members[:5]}{'...' if len(members) > 5 else ''}")

Sample graph: 29 nodes, 84 edges
Found 3 communities
Modularity: 0.250
Community 2: 9 members: ['APB2710', 'Petemcfuzzbuzz', 'StrixTechnica', 'amgiecorker', 'pikadrew']...
Community 1: 9 members: ['AnomalyNexus', 'EthiczGradient', 'Heruss100', 'Prituh', 'ScarletIT']...
Community 0: 11 members: ['ArchbishopMegatronQC', 'Bozata1', 'Greengoblingogo', 'MrPuddington2', 'SideburnsOfDoom']...


In [17]:
# Run Leiden community detection on all graphs
def analyze_all_communities(graphs_dict, seed=42):
    """Run community detection on all graphs and collect statistics.

    Args:
        graphs_dict: ``{subreddit_id: {timestep: NetworkX graph}}``. Graphs
            without nodes are skipped.
        seed: Seed passed to ``la.find_partition`` for every graph.

    Returns:
        A list with one dict per analysed graph, ordered by
        (subreddit_id, timestep), holding the graph size, number of
        communities, modularity, and community size statistics.
    """
    # Function-scope import: Counter is not otherwise in scope in this cell.
    from collections import Counter

    results = []

    for sub_id in sorted(graphs_dict.keys()):
        ts_dict = graphs_dict[sub_id]
        for ts in sorted(ts_dict.keys()):
            G = ts_dict[ts]

            if len(G.nodes()) == 0:
                continue

            # Convert to igraph; the index maps are not needed here
            ig_graph, _, _ = nx_to_igraph(G)

            # Run Leiden algorithm
            partition = la.find_partition(ig_graph, la.ModularityVertexPartition, seed=seed)

            # Calculate statistics; count members in one pass over the
            # membership list instead of rescanning it once per community.
            num_communities = len(partition)
            modularity = partition.modularity
            members_per_community = Counter(partition.membership)
            community_sizes = [members_per_community[c] for c in range(num_communities)]

            results.append({
                'subreddit_id': sub_id,
                'timestep': ts,
                'num_nodes': len(G.nodes()),
                'num_edges': len(G.edges()),
                'num_communities': num_communities,
                'modularity': modularity,
                'largest_community': max(community_sizes) if community_sizes else 0,
                'smallest_community': min(community_sizes) if community_sizes else 0,
                'avg_community_size': sum(community_sizes) / len(community_sizes) if community_sizes else 0,
                'community_sizes': community_sizes
            })

            print(f"Sub {sub_id}, TS {ts}: {len(G.nodes())} nodes, {len(G.edges())} edges, "
                  f"{num_communities} communities, modularity: {modularity:.3f}")

    return results

# Run analysis on all graphs
print("=== Community Detection Analysis ===")
community_stats = analyze_all_communities(graph_data.graph_dict)

# Summary statistics
import pandas as pd
stats_df = pd.DataFrame(community_stats)
print("\n=== Summary Statistics ===")
print(f"Total graphs analyzed: {len(stats_df)}")
if stats_df.empty:
    # A frame built from an empty list has no columns, so the column lookups
    # in the other branch would raise KeyError.
    print("No non-empty graphs were analyzed; skipping aggregate statistics.")
else:
    print(f"Average modularity: {stats_df['modularity'].mean():.3f}")
    print(f"Average communities per graph: {stats_df['num_communities'].mean():.1f}")
    print(f"Modularity range: {stats_df['modularity'].min():.3f} - {stats_df['modularity'].max():.3f}")

# Show detailed results (at most five community sizes per graph)
print("\n=== Detailed Results ===")
for result in community_stats:
    sizes_str = ', '.join(map(str, result['community_sizes'][:5]))
    if len(result['community_sizes']) > 5:
        sizes_str += '...'
    print(f"Sub {result['subreddit_id']}, TS {result['timestep']}: "
          f"Communities: {result['num_communities']}, "
          f"Mod: {result['modularity']:.3f}, "
          f"Sizes: [{sizes_str}]")

=== Community Detection Analysis ===
Sub 0, TS 0: 29 nodes, 84 edges, 3 communities, modularity: 0.250
Sub 0, TS 1: 30 nodes, 79 edges, 4 communities, modularity: 0.327
Sub 0, TS 2: 31 nodes, 59 edges, 6 communities, modularity: 0.402
Sub 0, TS 3: 43 nodes, 107 edges, 6 communities, modularity: 0.313
Sub 0, TS 4: 53 nodes, 134 edges, 6 communities, modularity: 0.337
Sub 0, TS 5: 71 nodes, 266 edges, 4 communities, modularity: 0.288
Sub 0, TS 6: 79 nodes, 187 edges, 6 communities, modularity: 0.405
Sub 0, TS 7: 94 nodes, 176 edges, 8 communities, modularity: 0.468
Sub 0, TS 8: 187 nodes, 611 edges, 9 communities, modularity: 0.350
Sub 0, TS 9: 271 nodes, 1090 edges, 9 communities, modularity: 0.317
Sub 0, TS 10: 214 nodes, 555 edges, 9 communities, modularity: 0.402
Sub 0, TS 11: 261 nodes, 943 edges, 9 communities, modularity: 0.325
Sub 0, TS 12: 330 nodes, 1414 edges, 10 communities, modularity: 0.292
Sub 0, TS 13: 265 nodes, 697 edges, 11 communities, modularity: 0.414
Sub 0, TS 14: 