## Altmap Experiments
### Compare Altmap to the map equation (Infomap) using NetworkX on real-world networks


In [12]:
import itertools

from clusim.clustering import Clustering

from altmap.altmap_helpers.general import *

# show plots in separate window
# (%pylab selects a GUI matplotlib backend and populates the interactive
# namespace from numpy and matplotlib, as reported in this cell's output)
%pylab

# init rc params
# NOTE(review): init_plt_params is not defined in this notebook; presumably it
# is provided by the wildcard import of altmap.altmap_helpers.general -- confirm.
init_plt_params()

def num_nodes_in_multiple_comms(comm_list):
    """Count the nodes that belong to more than one community.

    Args:
        comm_list: Iterable of communities, each an iterable of hashable,
            mutually sortable node ids.

    Returns:
        The number of distinct node ids that occur in at least two of the
        communities in ``comm_list`` (0 for an empty ``comm_list``).
    """
    # De-duplicate inside each community first, so a node that is listed twice
    # in the same community is not mistaken for an overlapping node.
    memberships = sorted(
        itertools.chain.from_iterable(set(comm) for comm in comm_list)
    )

    # After sorting, all memberships of one node are adjacent. Grouping them
    # needs no "previous node" sentinel, so every id (including -1) is safe.
    return sum(
        1
        for _, group in itertools.groupby(memberships)
        if len(list(group)) > 1
    )
            

Using matplotlib backend: Qt5Agg
Populating the interactive namespace from numpy and matplotlib


In [13]:
# Dataset selector: keep exactly one assignment active.
dataset = 'dblp' # co-authorship (collaboration) network
# dataset = 'youtube'
# dataset = 'amazon'
# dataset = 'lj'
# dataset = 'orkut'

# load groundtruth comms: one community per line, node ids separated by tabs
path = './realworld/com-' + dataset + '.top5000.cmty.txt'
comm_list = []
with open(path, 'r') as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # a blank (e.g. trailing) line would otherwise fail in int('')
            continue
        comm_list.append([int(field) for field in stripped.split('\t')])

comm_sizes = [len(comm) for comm in comm_list]

# take only communities with at least x nodes
min_comm_size = 1
comm_true_list = [comm for comm in comm_list if len(comm) >= min_comm_size]
comm_true_sizes = [len(comm) for comm in comm_true_list]
num_comm_true = len(comm_true_list)

# extract unique node ids
node_ids = list(itertools.chain(*comm_true_list))
unique_node_ids = set(node_ids) # unique node ids part of some top community

if not comm_true_list:
    # np.min / np.max below would fail with an opaque "zero-size array" error
    raise ValueError(f'No ground truth community has at least {min_comm_size} nodes.')

print(f'There are {num_comm_true} admissible ground truth communities.')
print(f'The minimum community size is {np.min(comm_true_sizes)}.')
print(f'The mean community size is {np.mean(comm_true_sizes)}.')
print(f'The maximum community size is {np.max(comm_true_sizes)}.')

# relabelling necessary for the computation of the omega index:
# the sorted node ids are mapped onto the contiguous range 0..n-1
nodes_relabelling_map = {node: idx for idx, node in enumerate(sorted(unique_node_ids))}

# assemble ground truth clusim clustering (community labels start at 1)
comm_list_relabelled = [[nodes_relabelling_map[node] for node in comm] for comm in comm_true_list]
comm_labels = list(range(1, num_comm_true + 1))
clu2elm = dict(zip(comm_labels, comm_list_relabelled))
clustering_true = Clustering(clu2elm_dict=clu2elm)

# load network
path = './realworld/com-' + dataset + '.ungraph.txt'

# The edge list is parsed by hand (instead of a generic networkx reader) so
# that only edges whose two endpoints both belong to a ground truth community
# are kept. Nodes without any retained edge never enter G.
G = nx.Graph()
with open(path, 'r') as file:
    for line in file:
        # skip '#' header/comment lines and blank lines
        if line.startswith('#') or not line.strip():
            continue

        node1, node2 = map(int, line.strip().split('\t'))
        if node1 in unique_node_ids and node2 in unique_node_ids:
            G.add_edge(node1, node2)

print(f'There are {G.number_of_nodes()} nodes in the reduced network')
print(f'There are {G.number_of_edges()} edges in the reduced network')


There are 5000 admissible ground truth communities.
The minimun community size is 3.
The mean community size is 215.7152.
The maximum community size is 4785.


In [7]:
# Overlap and clustering statistics of the reduced network: compute both
# quantities first, then report them.
num_multiple = num_nodes_in_multiple_comms(comm_true_list)
c = nx.average_clustering(G)

print(f'Number of nodes that are part of multiple communities (reduced network): {num_multiple}')
print(f'Fraction of nodes that are part of multiple communities (reduced network): {num_multiple / len(G.nodes())}')
print(f'Average clustering coefficient (reduced network): {c}')

Number of nodes that are part of multiple communities (reduced network): 31450
Fraction of nodes that are part of multiple communities (reduced network): 0.37246263530637863
Average clustering coefficient (reduced network): 0.764008212007451


In [None]:
# RENDC is reported as (number of detected communities / number of ground
# truth communities) - 1. The denominator is num_comm_true rather than a
# literal, so it stays correct when min_comm_size or the dataset changes.

# run community detection for infomap
print('Testing Infomap...')
communities_found_infomap, num_communities_found, _, _ = infomap(G, altmap=False)

print(f'Found {num_communities_found} communities on the reduced network.')
print(f'Achieved RENDC is {num_communities_found / num_comm_true - 1}.')

# run community detection for altmap
# NOTE(review): update_inputfile=False presumably reuses the input file written
# by the Infomap run above, which would make this call depend on it -- confirm.
print('Testing Altmap...')
communities_found_altmap, num_communities_found, _, _ = infomap(G, altmap=True, update_inputfile=False)

print(f'Found {num_communities_found} communities on the reduced network.')
print(f'Achieved RENDC is {num_communities_found / num_comm_true - 1}.')


Testing Infomap...


In [10]:
from clusim.sim import onmi
from clusim.sim import omega_index
from clusim.clusimelement import element_sim # very memory intense

def evaluate_clusterings(communities_found, metrics):
    """Score a detected partition against the ground truth clustering.

    Uses the module-level ``nodes_relabelling_map`` and ``clustering_true``.

    Args:
        communities_found: Mapping from node id to the label of the community
            assigned to that node.
        metrics: Iterable of callables, each invoked as
            ``metric(clustering_found, clustering_true)``.

    Returns:
        List with one score per metric, in the order of ``metrics``.
    """
    # Each relabelled node maps to a one-item list holding its community label.
    elm2clu = {
        nodes_relabelling_map[node]: [label]
        for node, label in communities_found.items()
    }
    clustering_found = Clustering(elm2clu_dict=elm2clu)

    scores = []
    for i, metric in enumerate(metrics):
        # progress message first: a single metric may take a long time
        print (f'Computing metric {i+1}...')
        score = metric(clustering_found, clustering_true)
        scores.append(score)

    return scores


# Order matters: the report below reads scores[0] as onmi and scores[1] as
# the omega index.
metrics = (onmi, omega_index)

# Evaluate both detectors through one code path and keep every result, so the
# Infomap scores remain available after the Altmap ones have been computed.
# `scores` still ends up holding the Altmap scores once the loop is done.
scores_by_method = {}
for method, communities_found in (('Infomap', communities_found_infomap),
                                  ('Altmap', communities_found_altmap)):
    print(f'Computing scores for {method}...')
    scores = evaluate_clusterings(communities_found, metrics)
    scores_by_method[method] = scores
    print(f'{method} scores are: onmi = {scores[0]}, omega_idx = {scores[1]}.')




Computing scores for Infomap...
Computing metric 1...
Computing metric 2...
Infomap scores are: onmi = 0.8754197648885511, omega_idx = 0.6010862039967505.
Computing scores for Altmap...
Computing metric 1...
Computing metric 2...
Altmap scores are: onmi = 0.879330724043965, omega_idx = 0.6611350373008688.
