In [1]:
import community as community_louvain
import networkx as nx
import pickle
import json
from sklearn.metrics import normalized_mutual_info_score
import matplotlib.pyplot as plt
from matplotlib import colormaps
import matplotlib.patches as mpatches
import numpy as np
from collections import defaultdict, Counter
import pandas as pd

In [2]:
with open('/root/social_computing_group/graph.pkl', 'rb') as f:
    nx_graph = pickle.load(f)

In [3]:
labels = [nx_graph.nodes[node]['label'] for node in nx_graph.nodes()]

In [4]:
best_nmi = -1
best_partition = None

# 运行10次取最优
for _ in range(10):
    current_partition = community_louvain.best_partition(nx_graph)
    current_nmi = normalized_mutual_info_score(labels, list(current_partition.values()))
    
    if current_nmi > best_nmi:
        best_nmi = current_nmi
        best_partition = current_partition

partition = best_partition
partition = {int(k): int(v) for k, v in partition.items()}

In [5]:
print(f"[Louvain] NMI Score: {best_nmi:.4f}")

[Louvain] NMI Score: 0.6347


In [6]:
with open('/root/social_computing_group/community_detection/louvain_communities.json', 'w') as f:
    json.dump(partition, f)

In [7]:
def visualize_large_graph(nx_graph, partition, nmi, output_path):
    """大规模图可视化"""
    # 1. 社区子图采样
    community_sizes = {c: sum(1 for v in partition.values() if v == c) 
                      for c in set(partition.values())}
    top_communities = sorted(community_sizes, key=lambda x: -community_sizes[x])[:5]
    
    # 2. 构建展示子图
    sample_nodes = []
    for comm in top_communities:
        nodes_in_comm = [n for n in nx_graph.nodes() if partition[n] == comm][:100]
        sample_nodes.extend(nodes_in_comm)
    subgraph = nx_graph.subgraph(sample_nodes)
    
    # 3. 计算力导向布局
    print("正在计算布局...")
    pos = nx.kamada_kawai_layout(subgraph)
    
    # 4. 可视化
    plt.figure(figsize=(15, 10))
    node_colors = [top_communities.index(partition[n]) for n in subgraph.nodes()]
    nx.draw(subgraph, pos, node_color=node_colors, cmap='tab20', 
           node_size=30, width=0.1, with_labels=False)
    
    # 5. 添加统计信息
    info_text = '\n'.join([f"Community {c}: {community_sizes[c]} nodes" 
                          for i, c in enumerate(top_communities)])
    plt.gcf().text(0.82, 0.15, info_text, fontsize=10, 
                  bbox=dict(facecolor='white', alpha=0.7))
    
    plt.text(0.5, 0.02, f"NMI: {nmi:.4f}", 
            ha='center', va='bottom', 
            transform=plt.gcf().transFigure,
            fontsize=12,
            bbox=dict(facecolor='white', alpha=0.8))
    
    plt.title("Visualization of Top5 community structure sampling", fontsize=20)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
    print(f"大规模图可视化结果已保存至 {output_path}")

In [8]:
output_path='/root/social_computing_group/community_detection/louvain_visualization.png'
visualize_large_graph(nx_graph, partition, best_nmi, output_path)

正在计算布局...
大规模图可视化结果已保存至 /root/social_computing_group/community_detection/louvain_visualization.png


In [9]:
def community_metrics(partition, true_labels):
    """
    partition: 字典 {节点ID: 社区ID}
    true_labels: 列表 [节点真实标签]
    返回: DataFrame包含每个社区的指标
    """
    # 构建社区-标签分布映射
    comm_label_dist = {}
    for node, comm_id in partition.items():
        true_label = true_labels[node]
        if comm_id not in comm_label_dist:
            comm_label_dist[comm_id] = Counter()
        comm_label_dist[comm_id][true_label] += 1
    
    # 计算指标
    metrics = []
    for comm_id, counter in comm_label_dist.items():
        total = sum(counter.values())
        max_count = max(counter.values())
        probabilities = [c/total for c in counter.values()]
        
        # 纯度
        purity = max_count / total
        
        # 熵
        entropy = -sum(p * np.log(p) for p in probabilities if p > 0)
        
        # 存储结果
        metrics.append({
            "Community ID": comm_id,
            "Nodes": total,
            "Label": counter.most_common(1)[0][0],
            "Purity": purity,
            "Entropy": entropy,
        })
    
    return pd.DataFrame(metrics)

In [10]:
# 计算指标
metrics_df = community_metrics(partition, labels)

# 排序并保存表格
metrics_df.sort_values('Community ID', inplace=True)
metrics_df.to_csv('/root/social_computing_group/community_detection/community_metrics_origin.csv', index=False)

### 尝试改进

In [13]:
def optimized_merge(partition, nx_graph, min_size=100):
    """高性能社区合并函数"""
    # 构建社区节点映射
    comm_nodes = defaultdict(list)
    for node, comm in partition.items():
        comm_nodes[comm].append(node)
    
    # 识别小社区
    small_comms = {c for c, nodes in comm_nodes.items() if len(nodes) < min_size}
    if not small_comms:
        return partition
    
    # 构建社区邻接图（使用边权重统计）
    comm_graph = nx.Graph()
    edge_weights = defaultdict(int)
    for u, v in nx_graph.edges():
        c1, c2 = partition[u], partition[v]
        if c1 != c2:
            edge_weights[(min(c1,c2), max(c1,c2))] += 1
    
    # 添加带权重的边
    for (c1, c2), w in edge_weights.items():
        comm_graph.add_edge(c1, c2, weight=w)
    
    # 批量合并处理
    merged = partition.copy()
    for sc in small_comms:
        # 寻找最佳目标社区
        if sc not in comm_graph:
            target = min(comm_nodes.keys(), key=lambda x: len(comm_nodes[x]))
        else:
            neighbors = list(comm_graph.neighbors(sc))
            if neighbors:
                target = max(neighbors, key=lambda x: comm_graph[sc][x]['weight'])
            else:
                target = min(comm_nodes.keys(), key=lambda x: len(comm_nodes[x]))
        
        # 批量更新节点
        for node in comm_nodes[sc]:
            merged[node] = target
    
    return merged

In [14]:
def multi_resolution_louvain(nx_graph, resolutions=[0.7, 0.8, 0.9, 1.0, 1.1, 1.2], n_runs=5):
    """多分辨率集成优化"""
    best_nmi = -1
    best_part = None
    labels = [nx_graph.nodes[n]['label'] for n in nx_graph.nodes()]
    
    for res in resolutions:
        print(f"Processing resolution {res}...")
        for _ in range(n_runs):
            part = community_louvain.best_partition(nx_graph, resolution=res)
            current_nmi = normalized_mutual_info_score(labels, list(part.values()))
            
            if current_nmi > best_nmi:
                best_nmi = current_nmi
                best_part = part
    
    return best_part

In [15]:
base_partition = multi_resolution_louvain(nx_graph)
final_partition = optimized_merge(base_partition, nx_graph)
final_partition = {int(k): int(v) for k, v in final_partition.items()}

Processing resolution 0.7...
Processing resolution 0.8...
Processing resolution 0.9...
Processing resolution 1.0...
Processing resolution 1.1...
Processing resolution 1.2...


In [16]:
nmi = normalized_mutual_info_score(labels, list(final_partition.values()))
print(f"[Enhanced Louvain] NMI Score: {nmi:.4f}")

[Enhanced Louvain] NMI Score: 0.6656


In [17]:
with open('/root/social_computing_group/community_detection/louvain_communities_enhanced.json', 'w') as f:
    json.dump(final_partition, f)

In [18]:
output_path = '/root/social_computing_group/community_detection/enhanced_louvain_visualization.png'
visualize_large_graph(nx_graph, final_partition, nmi, output_path)

正在计算布局...
大规模图可视化结果已保存至 /root/social_computing_group/community_detection/enhanced_louvain_visualization.png


In [19]:
# 计算指标
metrics_df = community_metrics(final_partition, labels)


# 排序并保存表格
metrics_df.sort_values('Community ID', inplace=True)
metrics_df.to_csv('/root/social_computing_group/community_detection/enhanced_louvain_metrics.csv', index=False)