# Chapter 8 -  Detecting Overlapping communities

The problem of graph clustering is well studied, in particular the case where the vertices are partitioned into
non-overlapping communities.

Here, we look at the problem of graph clustering where:
* vertices can be part of several communities (overlapping communities)
* vertices can be part of no community ("noise" vertices)

We explore the following three methods:
* methods based on finding overlapping cliques (a clique is a complete subgraph)
* methods based on splitting vertices into multiple personae, and
* methods based on clustering the edges.

We also look at some post-processing based on Community Association Strength (CAS) scores, which can be applied to the output of any of the above methods, or to the output of a graph partitioning algorithm (such as ECG or Leiden).

We illustrate those methods using the small Karate Club graph.
Next we consider larger graphs: a word association graph and artificial ABCD benchmark graphs. 


In [None]:
import igraph as ig
import numpy as np
import pandas as pd
#from sklearn.metrics import adjusted_mutual_info_score as AMI
from itertools import combinations
#from IPython.display import display, SVG
#import time
import matplotlib.pyplot as plt
%matplotlib inline
import csv
import subprocess
import partition_igraph
import os
import pickle
import random


## New requirements

* overlapping NMI measure (oNMI): download and compile from: https://github.com/aaronmcdaid/Overlapping-NMI 


In [None]:
## root directory of the datasets read by this notebook
datadir = "../Datasets/"
## path to the compiled overlapping NMI (oNMI) executable; adjust to your local build
oNMI = "../oNMI/onmi"
#oNMI = '/work/home/fcthebe/Tools/oNMI/onmi'


In [None]:
## calls the oNMI executable, format of inputs: list of lists (communities)
def compute_oNMI(First, Second):
    """Compute the overlapping NMI between two covers with the external oNMI tool.

    Each cover is written to its own temporary file, one community per line
    with the members separated by single spaces.  The executable whose path is
    stored in the module-level ``oNMI`` variable is then run on the two files,
    and the files are removed afterwards, even if the executable fails.

    Parameters
    ----------
    First, Second : iterable of iterables
        The two covers; each inner iterable lists the members of one community.

    Returns
    -------
    float
        The second whitespace-separated token of the executable's output.

    Raises
    ------
    subprocess.CalledProcessError
        If the executable exits with a non-zero status.
    """
    import tempfile  ## local import: only this function needs it

    paths = []
    try:
        for cover in (First, Second):
            ## let the OS pick a unique name: names derived from the (seeded)
            ## global random state repeat across runs and can collide
            with tempfile.NamedTemporaryFile(
                mode="w", prefix="__", suffix=".txt", delete=False, newline=""
            ) as f:
                paths.append(f.name)
                csv.writer(f, delimiter=" ").writerows(cover)
        output = subprocess.check_output([oNMI] + paths).decode("utf-8")
        return float(output.split()[1])
    finally:
        ## portable cleanup (no shell), executed on success and on failure
        for path in paths:
            os.remove(path)


In [None]:
## assign colors and shapes w.r.t. overlapping clusters
## white circle: vertex in no cluster
## black square: vertex in two or more clusters (overlap)
## colored circle: vertex in exactly one cluster
def color_nodes(g, communities, greyscale=False):
    """Set the ``color`` and ``shape`` vertex attributes of ``g`` from a cover.

    Parameters
    ----------
    g : igraph graph
        Modified in place: the vertex attributes ``_oc`` (list of community
        indices per vertex), ``shape`` and ``color`` are (re)written.
    communities : sequence of iterables
        Each entry lists the vertices of one community, given as anything
        accepted by ``g.vs.find`` (vertex names in this notebook).
    greyscale : bool
        If True use a white-to-black gradient palette, else a cluster
        coloring palette.
    """
    n_comm = len(communities)

    ## for every vertex, collect the indices of the communities it belongs to
    g.vs['_oc'] = [[] for _ in range(g.vcount())]
    for idx, members in enumerate(communities):
        for node in members:
            g.vs.find(node)['_oc'].append(idx)

    if greyscale:
        ## two extra palette entries and a shift of one; presumably this keeps
        ## the pure white / pure black ends for "no cluster" / "overlap"
        palette = ig.drawing.colors.GradientPalette('white', 'black', n=n_comm + 2)
        shift = 1
    else:
        palette = ig.drawing.colors.ClusterColoringPalette(n=n_comm)
        shift = 0

    g.vs['shape'] = 'circle'
    for vertex in g.vs:
        memberships = vertex['_oc']
        if len(memberships) == 0:
            vertex['color'] = 'white'
        elif len(memberships) > 1:
            vertex['color'] = 'black'
            vertex['shape'] = 'square'
        else:
            vertex['color'] = palette[memberships[0] + shift]


# 1. CPM (clique percolation method)

The first algorithm we consider is the Clique Percolation Method, which can
be summarized as:
* fix the clique size $k$ (typically $k$=3 or 4)
* for each $k$-clique, join all other $k$-cliques with $k-1$ vertices in common, in turn (the percolation)
* continue until all $k$-cliques are exhausted

It is based on:

Derényi I., *et al.*, Clique percolation in random networks, Phys. Rev. Lett., 2005, vol. 94 (pg. 160-202)


In [None]:
def CPM(g, k=3):
    """Clique Percolation Method: overlapping communities from adjacent k-cliques.

    Builds a "clique graph" whose vertices are the k-cliques of ``g``; two
    cliques are linked when they share at least ``k-1`` vertices.  Every
    connected component of that graph yields one community, made of the union
    of the vertices of its cliques.

    Parameters
    ----------
    g : igraph graph with a ``name`` vertex attribute
    k : int
        Clique size.

    Returns
    -------
    list of set
        One set of vertex names per community; a vertex may appear in several
        sets, or in none.
    """
    cliques = [set(c) for c in g.cliques(min=k, max=k)]
    n_cliques = len(cliques)

    ## one self-loop per clique, so that every clique is a vertex of the
    ## clique graph even when it is adjacent to no other clique
    links = [(i, i) for i in range(n_cliques)]
    ## all pairs of cliques are compared: quadratic in the number of k-cliques
    links += [
        (a, b)
        for a, b in combinations(range(n_cliques), 2)
        if len(cliques[a] & cliques[b]) >= (k - 1)
    ]
    clique_graph = ig.Graph(links, directed=False)

    communities = []
    for component in clique_graph.connected_components():
        vertex_ids = set()
        for clique_id in component:
            vertex_ids |= cliques[clique_id]
        communities.append(set(g.vs[vertex_ids]["name"]))
    return communities


#### CPM on the Zachary Graph

We illustrate the different CPM-based algorithms with the well-known Karate Club dataset, which models interactions between 34 members. The 2 communities correspond to groups forming after a split into two "factions". Modularity-based algorithms usually find 4 or 5 communities.
Below, we color the nodes according to the 2 factions after the split.  

In [None]:
## Zachary (Karate Club) graph and its two ground-truth communities
zac = ig.Graph.Read_Ncol(datadir + 'Zachary/zachary.edgelist', directed=False)
c = np.loadtxt(datadir + 'Zachary/zachary.communities', dtype='uint16')
## vertex names are converted to int and used to index the community array
zac.vs['comm'] = [c[int(name)] for name in zac.vs['name']]

## layout stored from Chapter 5 notebook
## this was generated using the Fruchterman-Reingold method
with open(datadir + "Zachary/layout.pkl", "rb") as fn:
    ly_zac = pickle.load(fn)
## re-order the stored coordinates to follow the vertex order of this graph
zac['layout'] = [ly_zac[int(name)] for name in zac.vs['name']]

## plotting parameters
zac.vs['size'] = 12
zac.es['color'] = 'gainsboro'
#pal = ig.drawing.colors.GradientPalette("white","black",n=max(zac.vs['comm'])+1)
pal = ig.drawing.colors.ClusterColoringPalette(n=max(zac.vs['comm']) + 1)
zac.vs['color'] = [pal[label] for label in zac.vs['comm']]

## plot
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))


####  Running the CPM algorithm

* We run the CPM algorithm as is on the Karate graph.
* Nodes that belong to 2 or more clusters are represented as squares.
* You can pass `greyscale=True` to `color_nodes` for greyscale, but the shades are hard to distinguish with several clusters
* We obtain one large community, two small ones and two orphan nodes (shown in white)

In [None]:
## ground-truth communities: one list of vertex names per community label
zac_gt = [
    [v['name'] for v in zac.vs if v['comm'] == label]
    for label in set(zac.vs['comm'])
]


In [None]:
## CPM with k=3
X = CPM(zac, k=3)
color_nodes(zac, X, greyscale=False)
## the oNMI helper expects lists of lists, so each community set is converted
print('oNMI:', compute_oNMI(list(map(list, X)), zac_gt))

## plot
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))
#ig.plot(zac, 'zac_overlap_cpm_1.eps', layout=zac['layout'], bbox=(0,0,300,300))


In [None]:
## CPM with k=4
X = CPM(zac, k=4)
color_nodes(zac, X, greyscale=False)
## the oNMI helper expects lists of lists, so each community set is converted
print('oNMI:', compute_oNMI(list(map(list, X)), zac_gt))

## plot
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))


In [None]:
## filter edges with small ECG weight (threshold or under)
threshold = 0  ## filter edges with NO vote
np.random.seed(123)
random.seed(123)
zac.es['ecg_w'] = zac.community_ecg(ens_size=32, min_weight=0).W
## keep only the edges whose ECG weight is strictly above the threshold
kept_edges = [edge for edge in zac.es if edge['ecg_w'] > threshold]
zac_sg = zac.subgraph_edges(kept_edges)
## cliques are searched in the filtered graph, colors are set on the full graph
X = CPM(zac_sg, k=3)
color_nodes(zac, X, greyscale=False)
print('oNMI:', compute_oNMI(list(map(list, X)), zac_gt))

## plot
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))
#ig.plot(zac, 'zac_overlap_cpm_2.eps', layout=zac['layout'], bbox=(0,0,300,300))


In [None]:
# random clusterings with same sizes as above
np.random.seed(123)
random.seed(123)
## all memberships of the current cover, repeated for vertices in several communities
Nodes = [node for community in X for node in community]
## cumulative community sizes, with a leading 0: slice boundaries into a permutation
Sizes = np.concatenate([[0], np.cumsum([len(community) for community in X])])
Results = []
for rep in range(100):
    P = np.random.permutation(Nodes)
    ## cut the shuffled memberships into consecutive chunks of the original sizes
    R = [P[start:stop] for start, stop in zip(Sizes[:-1], Sizes[1:])]
    Results.append(compute_oNMI(R, zac_gt))
## report mean and stdv
print('mean:', np.mean(Results), 'stdv:', np.std(Results))


# 2. Ego-Splitting method

The Ego-Splitting framework is based on the paper by A. Epasto, S. Lattanzi and R.P. Leme at KDD 2017:

https://www.kdd.org/kdd2017/papers/view/ego-splitting-framework-from-non-overlapping-to-overlapping-clusters


In summary, the steps are:
* For each vertex $v$:
 * build the ego-net for $v$ (minus self)
 * cluster this ego-net using a local method, such as label propagation (LP) or connected components (CC)
 * "split" vertex $v$ into one persona per ego-net cluster
* Cluster this new graph (with duplicated vertices) with some graph partitioning algorithm such as LP or ECG. 
 * We can set a minimum community size to avoid tiny ones.

The original paper uses an LP method based on the Potts model, but we will use the Label Propagation from Raghavan *et al.*, which is implemented in igraph.


In [None]:
def EgoSplit(G, split='CC', algo='LP'):
    """Find overlapping communities with the ego-splitting approach.

    The function works on a copy of ``G``; the input graph is not modified.

    Step 1: every vertex is replaced, in turn, by one "persona" per cluster of
    its ego-net (the subgraph induced by its neighbours, the vertex itself
    excluded).  A persona is named ``<vertex name>.<cluster id>``, keeps the
    name of the vertex it comes from in the ``original`` attribute, and is
    linked to the neighbours falling in its cluster.  Vertices are processed
    one after the other on the same working graph, so the neighbours of a
    vertex can be personae of vertices that were split earlier.

    Step 2: the persona graph is partitioned, and each part is mapped back to
    the set of original vertex names it contains.

    Parameters
    ----------
    G : igraph graph
        Needs a ``name`` vertex attribute holding strings (names are
        concatenated with ``'.'`` to build persona names).
    split : str
        Ego-net clustering: ``'LP'`` for label propagation, any other value
        for connected components.
    algo : str
        Persona-graph clustering: ``'LP'`` for label propagation, any other
        value for ``community_ecg(ens_size=32)`` (presumably provided by the
        ``partition_igraph`` import).

    Returns
    -------
    list of set
        One set of original vertex names per community; a name can appear in
        several sets.
    """
    g = G.copy()
    g.vs['original'] = g.vs['name']
    ## use the vertex names to avoid issues when vertices are re-mapped ...
    names = g.vs['name']
    ## step 1 - ego-net splits
    for nm in names:
        v = g.vs.find(nm).index
        n = g.neighbors(v)
        if not n:
            ## isolated vertex: its ego-net is empty, so there is nothing to
            ## split; keep it as its own single persona instead of failing
            ## on the empty membership list
            continue
        sg = g.subgraph(n)
        if split == 'LP':
            x = sg.community_label_propagation().membership
        else:
            x = sg.connected_components().membership
        if np.min(x) == -1:  ## this should not occur, just being careful
            print('ES issue')
            x = [i + 1 for i in x]
        ## one persona per ego-net cluster
        for j in set(x):
            g.add_vertex(name=nm + '.' + str(j), original=nm)
        ## link every neighbour to the persona of its cluster, in one batch;
        ## new vertices are appended, so index v is still valid afterwards
        g.add_edges([(nm + '.' + str(j), nb) for j, nb in zip(x, sg.vs['name'])])
        g.delete_vertices(v)
    ## step 2 -- cluster w.r.t. multiple personae
    if algo == 'LP':
        cl = g.community_label_propagation()
    else:
        cl = g.community_ecg(ens_size=32)
    return [set(part.vs['original']) for part in cl.subgraphs()]


In [None]:
## ego-split
np.random.seed(123)
random.seed(123)
## pick final algorithm as parameter (LP or ECG); min community size set to 3
X = [set(comm) for comm in EgoSplit(zac, algo='LP') if len(comm) >= 3]
color_nodes(zac, X, greyscale=False)
print('oNMI:', compute_oNMI(list(map(list, X)), zac_gt))
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))
#ig.plot(zac, 'zac_overlap_ego.eps', layout=zac['layout'], bbox=(0,0,300,300))


# 3. Edge Clustering 

We can obtain overlapping communities by clustering edges instead of vertices.
The algorithm can be described as follows:
* for each pair of edges sharing a node, say $(i,k)$ and $(j,k)$, compute some similarity measure between the neighborhoods of vertices $i$ and $j$, such as the Jaccard measure
* perform hierarchical clustering on the edges with this similarity matrix

It is based on:

Ahn, YY., Bagrow, J., Lehmann, S. Link communities reveal multiscale complexity in networks. Nature 466, 761–764 (2010). https://doi.org/10.1038/nature09182

This can be implemented by considering the connected components for the line-graph of the original graph using varying thresholds for the Jaccard measure.
* line graph Lg(G) represents ties between edges of G
* Lg(G) nodes are edges in G
* edges sharing a node in G are linked by an edge in Lg(G)

We pick the "best" clustering in the hierarchy based on the modularity scores on the line graph.


In [None]:
def Jaccard(a,b):
    """Return the Jaccard similarity |a & b| / |a | b| of two iterables.

    Both arguments are converted to sets first, so duplicates are ignored.
    Raises ZeroDivisionError when both are empty.
    """
    left, right = set(a), set(b)
    return len(left & right) / len(left | right)

def weightedLinegraph(g):
    """Return the line graph of ``g`` with Jaccard-based edge weights.

    Each edge of the line graph joins two edges of ``g``.  Its ``weight`` is
    the Jaccard similarity between the neighbourhoods (in ``g``) of the two
    end points that those edges do NOT have in common.
    """
    lg = g.linegraph()
    weights = []
    for link in lg.es:
        first, second = link.tuple
        ends_first = set(g.es[first].tuple)
        ends_second = set(g.es[second].tuple)
        ## symmetric difference = the two non-shared end points
        apart = list(ends_first ^ ends_second)
        weights.append(Jaccard(g.neighbors(apart[0]), g.neighbors(apart[1])))
    lg.es['weight'] = weights
    return lg

def edgeCluster(g):
    """Overlapping communities of ``g`` obtained by clustering its edges.

    For every distinct weight of the weighted line graph, the line-graph edges
    with weight at or below that value are removed and the connected
    components are taken as an edge clustering.  The clustering with the
    highest modularity (computed on the full line graph) is kept and stored in
    the ``lc`` edge attribute of ``g`` (side effect on the input graph).

    Returns
    -------
    list of list
        For each edge cluster, the names of the vertices touched by its edges.
    """
    D = weightedLinegraph(g)
    best_q = -999
    for cut in sorted(set(D.es['weight'])):
        ## filter edges w.r.t. similarity and find CC
        pruned = D.copy()
        pruned.delete_edges([e for e in pruned.es if e['weight'] <= cut])
        membership = pruned.connected_components().membership
        score = D.modularity(membership)
        if score > best_q:
            best_q = score
            g.es['lc'] = membership
    ## Now gather the nodes for each edge cluster
    n_clusters = max(g.es['lc']) + 1
    return [
        g.subgraph_edges([e for e in g.es if e['lc'] == label]).vs['name']
        for label in range(n_clusters)
    ]

In [None]:
## cluster the edges, then keep communities with at least 3 vertices
X = [set(comm) for comm in edgeCluster(zac) if len(comm) >= 3]
print('oNMI:', compute_oNMI(list(map(list, X)), zac_gt))
color_nodes(zac, X, greyscale=False)
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))


### Post-processing with CAS scores

From the previous result, drop community memberships with low CAS scores.


In [None]:
## community association strength for partitions
def cas(G, A):
    """Community Association Strength of every vertex w.r.t. its own community.

    For vertex v in community A_v the score is

        deg_int(v) / deg(v) - (Vol(A_v) - deg(v)) / Vol(G)

    where deg_int(v) counts the neighbours of v carrying the same label,
    Vol(A_v) is the sum of the degrees in v's community and Vol(G) the sum of
    all degrees.

    Parameters
    ----------
    G : igraph graph
    A : sequence of non-negative ints
        Community label of each vertex, indexed by vertex id.

    Returns
    -------
    numpy array with one score per vertex.
    """
    n = G.vcount()
    degree = np.array(G.degree())
    ## number of neighbours sharing the vertex's label
    internal_degree = np.array(
        [sum([A[u] == A[v] for u in G.neighbors(v)]) for v in range(n)]
    )
    total_volume = sum(degree)
    ## volume (sum of degrees) of every community label
    community_volume = np.zeros(max(A) + 1, dtype='int')
    for v, d in enumerate(degree):
        community_volume[A[v]] += d
    own_volume = np.array([community_volume[A[v]] for v in range(n)])
    return internal_degree / degree - (own_volume - degree) / total_volume


In [None]:
## drop community memberships with low scores
threshold = 0.1
Y = []
for community in X:
    ## initialize each node in its own community ...
    zac.vs['_com'] = np.arange(1, 1 + zac.vcount())
    ## ... then give label 0 to the members of the community under study
    for name in community:
        zac.vs.find(name)['_com'] = 0
    c = cas(zac, zac.vs['_com'])
    ## a vertex alone in its community has no same-label neighbour and zero
    ## remaining volume, so only members can pass a positive threshold
    Y.append({v['name'] for v, score in zip(zac.vs, c) if score >= threshold})

## plot and compute oNMI
color_nodes(zac, Y, greyscale=False)
## plot
print('oNMI:', compute_oNMI(list(map(list, Y)), zac_gt))
ig.plot(zac, layout=zac['layout'], bbox=(0, 0, 300, 300))


# Word Association Graph Example

We consider a graph built from the Word Association dataset (U of South Florida) based on:

* G. Palla *et al.*, "Uncovering the overlapping structure of complex networks in nature and society", Nature 435, 814-818 (2005).

In a nutshell, we build a graph with edges between pairs of similar words. We used a threshold of $w^*=.025$ for the association strength and use $k=4$ for the clique size. We use this dataset to illustrate the usefulness of overlapping clusters to discover various contexts of words. 

We look at two versions of CPM:
* using the ECG-based version, and
* using the association strength as edge weight.


In [None]:
## build the graph
wg = ig.Graph.Read_Ncol(
    datadir + 'Words/words.txt', names=True, directed=False, weights=True
)
## merge parallel edges, summing their association strength scores
wg = wg.simplify(combine_edges="sum")
## prune low weight edges
strong_edges = [edge for edge in wg.es if edge['weight'] >= .025]
wg = wg.subgraph_edges(strong_edges)
wg.vs['label'] = wg.vs['name']
print(wg.vcount(),'nodes and',wg.ecount(),'edges')


In [None]:
## use ECG-based weights
np.random.seed(123)
random.seed(123)

## you can try other words; all words are CAPITALIZED
for word in ['MATH', 'DOG', 'MONEY']:

    ## get 2-hop ego-net
    v = wg.vs.find(name=word)
    n = wg.neighborhood(v, order=2)
    sg = wg.subgraph(n)

    ## filter edges w.r.t. ECG score; the seeds are reset for every word
    threshold = 0
    np.random.seed(123)
    random.seed(123)
    sg.es['ecg_w'] = sg.community_ecg(ens_size=32, min_weight=0).W
    voted_edges = [edge for edge in sg.es if edge['ecg_w'] > threshold]
    sg = sg.subgraph_edges(voted_edges)

    ## cluster and show results containing the given word
    X = CPM(sg, k=4)
    print('\nShowing clusters for the word',word)
    for cluster in X:
        if word in cluster:
            print(" ".join(sorted(cluster)))


In [None]:
## use association scores as weights

## you can try other words; all words are CAPITALIZED
for word in ['MATH','DOG','MONEY']:

    ## get 2-hop ego-net
    v = wg.vs.find(name=word)
    n = wg.neighborhood(v, order=2)
    sg = wg.subgraph(n)

    ## filter edges w.r.t. association strength scores (strictly above threshold)
    threshold = 0.025
    strong_edges = [edge for edge in sg.es if edge['weight'] > threshold]
    sg = sg.subgraph_edges(strong_edges)

    ## cluster and show results containing the given word
    X = CPM(sg, k=4)
    print('\nShowing clusters for the word',word)
    for cluster in X:
        if word in cluster:
            print(" ".join(sorted(cluster)))


# Study over ABCD-$o^2$ benchmark graphs

We generated 1,320 ABCD-$o^2$ graphs with the following parameters:
* $n = 1,000$ nodes, no outliers
* power law degree exponent $\tau_1=2.5$ with degrees in range [5,50]
* power law community size exponent $\tau_2=1.5$ with sizes in range [50,200]
* noise parameter $0.1 \le \xi \le 0.65$
* overlap parameter $1.0 \le \eta \le 2.0$
* $d=2$ (dimension of the spatial model for overlaps) and $\rho=0$ (correlation between degree and number of community memberships)

For the CAS-based post-processing, we used thresholds $t_1 = t_2 = 0.1$ (respectively to remove or add nodes to communities).

The configuration file to build the graphs can be found in the ```Datasets/ABCDoo``` subdirectory, as well as a ```pickle``` file that contains the results from the following (commented out) cells.
The graphs can be generated by running: ```julia abcd_sampler_overlap.jl```


In [None]:
## load the results from the experiment above
## NOTE: pickle can execute arbitrary code on load; only open files you trust
with open(datadir + "ABCDoo/abcdoo.pkl", "rb") as results_file:
    df = pickle.load(results_file)


In [None]:
## fix xi
xi = 0.1
#cls = ['blue','green','red']
cls = ['black', 'black', 'black']
algos = ['ego-split', 'ego-split+cas', 'ecg+cas']
stdv = ['ego-split(sd)', 'ego-split+cas(sd)', 'ecg+cas(sd)']
style = [':', '--', '-']
## rows of the results table with the selected xi (exact match)
_df = df[df.xi == xi]
for s, e, colour, line in zip(algos, stdv, cls, style):
    ## curve for the algorithm, with a shaded band of +/- 2 times the "(sd)" column
    plt.plot(_df.eta, _df[s], label=s, color=colour, linestyle=line)
    plt.fill_between(_df.eta, _df[s] + 2 * _df[e], _df[s] - 2 * _df[e],
                     alpha=.1, color=colour)
plt.legend()
plt.grid()
plt.title(r'ABCD-oo graphs with $\xi$=' + str(xi), fontsize=16)
plt.ylabel('oNMI', fontsize=14)
plt.xlabel(r'$\eta$', fontsize=14)
#plt.savefig('abcdoo-xi.pdf')
plt.show()


In [None]:
## fix eta
eta = 1.5
#cls = ['blue','green','red','purple']
cls = ['black', 'black', 'black']
algos = ['ego-split', 'ego-split+cas', 'ecg+cas']
stdv = ['ego-split(sd)', 'ego-split+cas(sd)', 'ecg+cas(sd)']
style = [':', '--', '-']
## rows of the results table with the selected eta (exact match)
_df = df[df.eta == eta]
for s, e, colour, line in zip(algos, stdv, cls, style):
    ## curve for the algorithm, with a shaded band of +/- 2 times the "(sd)" column
    plt.plot(_df.xi, _df[s], label=s, color=colour, linestyle=line)
    plt.fill_between(_df.xi, _df[s] + 2 * _df[e], _df[s] - 2 * _df[e],
                     alpha=.1, color=colour)
plt.legend()
plt.grid()
plt.title(r'ABCD-oo graphs with $\eta$=' + str(eta), fontsize=16)
plt.ylabel('oNMI', fontsize=14)
plt.xlabel(r'$\xi$', fontsize=14)
#plt.savefig('abcdoo-eta.pdf')
plt.show()


In [None]:
## fix eta (smaller amount of overlap than in the previous plot)
eta = 1.1
#cls = ['blue','green','red','purple']
cls = ['black', 'black', 'black']
algos = ['ego-split', 'ego-split+cas', 'ecg+cas']
stdv = ['ego-split(sd)', 'ego-split+cas(sd)', 'ecg+cas(sd)']
style = [':', '--', '-']
## rows of the results table with the selected eta (exact match)
_df = df[df.eta == eta]
for s, e, colour, line in zip(algos, stdv, cls, style):
    ## curve for the algorithm, with a shaded band of +/- 2 times the "(sd)" column
    plt.plot(_df.xi, _df[s], label=s, color=colour, linestyle=line)
    plt.fill_between(_df.xi, _df[s] + 2 * _df[e], _df[s] - 2 * _df[e],
                     alpha=.1, color=colour)
plt.legend()
plt.grid()
plt.title(r'ABCD-oo graphs with $\eta$=' + str(eta), fontsize=16)
plt.ylabel('oNMI', fontsize=14)
plt.xlabel(r'$\xi$', fontsize=14)
#plt.savefig('abcdoo-eta.pdf')
plt.show()
