# Giai đoạn 3: Community Detection (Phát hiện Cộng đồng)


In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from sklearn.metrics import normalized_mutual_info_score
import random
import os
import time
# Edge-list file to analyse, resolved relative to the working directory.
CLEAN_FILE = "facebook_combined_cleaned.txt"

# Parse the edge list into a NetworkX graph, reading node labels as integers.
G = nx.read_edgelist(
    CLEAN_FILE,
    nodetype=int,
)


## Louvain
Chạy thuật toán Louvain, tính modularity, thống kê kích thước các cộng đồng và lưu kết quả gán cộng đồng (assignment) ra file CSV.

In [2]:
!pip install python-louvain



In [3]:
from community.community_louvain import best_partition

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
# Fix random_state so the partition is the same on every run.
partition_louvain = best_partition(G, random_state=42)
run_time = time.perf_counter() - start_time

print(f"Hoàn tất! Thuật toán Louvain chạy trong {run_time:.4f} giây.")

Hoàn tất! Thuật toán Louvain chạy trong 10.3253 giây.


In [5]:
from community.community_louvain import modularity

# Summary statistics for the Louvain partition.
# The values of partition_louvain are the community labels, so counting them
# gives the size of every community; the number of distinct labels is k.
sizes_louvain = Counter(partition_louvain.values())
k_louvain = len(sizes_louvain)
mod_louvain = modularity(partition_louvain, G)

# most_common() lists (community_id, size) pairs from largest to smallest.
sizes_louvain_df = pd.DataFrame(
    sizes_louvain.most_common(),
    columns=["community_id", "size"],
)


In [6]:
# Share of all graph nodes (in percent) held by each community.
sizes_louvain_df["percentage"] = 100 * sizes_louvain_df["size"] / G.number_of_nodes()

# Write the node -> community assignment to CSV, one row per node.
assign_louvain_df = pd.DataFrame(
    list(partition_louvain.items()),
    columns=["node", "community"],
)
assign_louvain_df.to_csv("louvain_assignment.csv", index=False)

# The size table is sorted in descending order, so its first row is the
# largest community.
print(" Number of communities (k):", k_louvain)
print(" Modularity (Q):", round(mod_louvain, 4))
print(" Largest community %:", round(sizes_louvain_df["percentage"].iloc[0], 2))


 Number of communities (k): 16
 Modularity (Q): 0.835
 Largest community %: 13.57


## Leiden
Chạy thuật toán Leiden (igraph + leidenalg), tính modularity, thống kê kích thước các cộng đồng và lưu kết quả gán cộng đồng (assignment) ra file CSV.

In [7]:
!pip install igraph leidenalg


Collecting igraph
  Downloading igraph-1.0.0-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting leidenalg
  Downloading leidenalg-0.11.0-cp38-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Collecting texttable>=1.6.2 (from igraph)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Downloading igraph-1.0.0-cp39-abi3-manylinux_2_28_x86_64.whl (5.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m66.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading leidenalg-0.11.0-cp38-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m59.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph, leidenalg
Successfully installed igraph-1.0.0 leidenalg-0.11.0 texttable-1.7.0


In [8]:
import leidenalg
import igraph as ig
import pandas as pd

Chuyển đồ thị từ networkx sang igraph

In [9]:
# Give every NetworkX node a consecutive index 0..n-1 based on its position
# in sorted order; the igraph graph below is built on these indices.
nodes_sorted = sorted(G.nodes())
node_to_idx = dict(zip(nodes_sorted, range(len(nodes_sorted))))

# Translate each NetworkX edge into a pair of vertex indices.
edges_idx = [
    (node_to_idx[src], node_to_idx[dst])
    for src, dst in G.edges()
]

# Undirected igraph graph with one vertex per NetworkX node.
ig_g = ig.Graph(
    n=len(nodes_sorted),
    edges=edges_idx,
    directed=False,
)

In [10]:
# A fixed seed makes the Leiden result reproducible across runs, in line with
# the fixed random_state passed to Louvain.
partition_leiden = leidenalg.find_partition(
    ig_g,
    leidenalg.RBConfigurationVertexPartition,
    resolution_parameter=1.0,
    seed=42,
)


In [12]:
# Community label of every igraph vertex, in vertex-index order.
leiden_membership = partition_leiden.membership
# Position i of the membership list belongs to nodes_sorted[i], so pairing the
# two sequences maps each original node id to its community.
leiden_labels = dict(zip(nodes_sorted, leiden_membership))

# Community sizes; the number of distinct labels is k.
sizes_leiden = Counter(leiden_membership)
k_leiden = len(sizes_leiden)
mod_leiden = partition_leiden.modularity  # modularity reported by the igraph clustering

# most_common() lists (community_id, size) pairs from largest to smallest.
sizes_leiden_df = pd.DataFrame(
    sizes_leiden.most_common(),
    columns=["community_id", "size"],
)
# Share of all graph nodes (in percent) held by each community.
sizes_leiden_df["percentage"] = 100 * sizes_leiden_df["size"] / G.number_of_nodes()

In [13]:
# Write the node -> community assignment to CSV, one row per node.
assign_leiden_df = pd.DataFrame(
    list(leiden_labels.items()),
    columns=["node", "community"],
)
# Relative path (like the Louvain export) instead of the Colab-only /content
# directory, so the notebook also runs in environments where /content is absent.
assign_leiden_df.to_csv("leiden_assignment.csv", index=False)


In [14]:
# Headline Leiden statistics. The size table is sorted in descending order,
# so its first row is the largest community.
print(" Number of communities (k):", k_leiden)
print(" Modularity (Q):", round(mod_leiden, 4))
print(" Largest community %:", round(sizes_leiden_df["percentage"].iloc[0], 2))

 Number of communities (k): 17
 Modularity (Q): 0.8356
 Largest community %: 13.57


## So sánh Louvain & Leiden

In [15]:

# Read both partitions in one shared node order so that position i of each
# label list refers to the same node.
nodes = sorted(G.nodes())
labels_louvain = [partition_louvain[n] for n in nodes]
labels_leiden = [leiden_labels[n] for n in nodes]

# Normalized mutual information between the two labelings.
nmi_val = normalized_mutual_info_score(labels_louvain, labels_leiden)

# One row per method with its headline statistics; row 0 of each size table
# is the largest community because the tables are sorted in descending order.
comp_df = pd.DataFrame({
    "method": ["Louvain", "Leiden"],
    "num_communities": [k_louvain, k_leiden],
    "modularity": [mod_louvain, mod_leiden],
    "largest_community_percent": [
        sizes_louvain_df.iloc[0]["percentage"],
        sizes_leiden_df.iloc[0]["percentage"],
    ],
})

# Relative path (like the Louvain export) instead of the Colab-only /content
# directory, so the notebook also runs in environments where /content is absent.
comp_df.to_csv("community_comparison.csv", index=False)

print("Comparison Louvain vs Leiden:")
display(comp_df)
print("NMI between partitions:", round(nmi_val, 4))


Comparison Louvain vs Leiden:


Unnamed: 0,method,num_communities,modularity,largest_community_percent
0,Louvain,16,0.83497,13.567715
1,Leiden,17,0.835586,13.567715


NMI between partitions: 0.9621
