In [1]:
import igraph as ig
import pandas as pd
import numpy as np
from sklearn.metrics import adjusted_mutual_info_score as AMI
import partition_igraph
from matplotlib import pyplot as plt
from collections import Counter
import random
import pickle
import os
import seaborn as sns
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import roc_curve
from scipy.sparse import csr_matrix
import scipy.sparse as sparse 
import time
from statistics import mode
from abcd_graph import ABCDGraph, ABCDParams
import csv
import subprocess
import sys
sys.path.append('../')
from CAS import *


In [2]:
## this assumes community 0 are outliers and 'gt' exists in graph G
## M: membership matrix
## S: score matrix (C or Beta)
def outliers_AUC(G, M, S):
    """Print outlier-detection diagnostics for a membership and a score matrix.

    G: graph whose vertex attribute 'gt' holds the ground-truth community,
       with 0 meaning "outlier".
    M: membership matrix (one row per vertex); a row summing to 0 is
       counted as a predicted outlier.
    S: sparse score matrix (C or Beta); the row-wise maximum is used as
       the "not an outlier" score fed to the AUC.

    Nothing is returned; three lines of diagnostics are printed.
    """
    inlier_label = [int(label > 0) for label in G.vs['gt']]
    best_score = np.array(S.max(axis=1).todense()).flatten()
    row_total = np.array(M.sum(axis=1)).flatten()
    flagged = (row_total == 0)
    print('With matrix M:\noutliers:', int(np.count_nonzero(flagged)))
    true_outlier = (np.array(G.vs['gt']) == 0)
    print('correct outliers:', int(np.count_nonzero(flagged & true_outlier)))
    print('\nWith scores:\noutlier AUC:', AUC(inlier_label, best_score))
        

## ABCD-oo graphs

Main 2 files:

```
networkfile<xi>_<eta>_<rep>.txt : the edges, 1-based node ids, tab separated
communityfile<xi>_<eta>_<rep>.txt : node <tab> list of communities in [], for example:
1       [24]
2       [3, 28]
3       [10]
```

All files for one case:

```
communityfile0.5_1.5_10.txt
communitysizesfile0.5_1.5_10.txt
degreefile0.5_1.5_10.txt
networkfile0.5_1.5_10.txt
```

All graphs have 250 outlier nodes, which are mapped to community 0 below


In [3]:
path = '/data/ABCDoo/'
#path = '../Datasets/ABCDoo/'
def readGraph(xi=0.5, eta=1.5, rep=1):
    """Load one ABCD-oo graph and its community memberships from `path`.

    The files 'networkfile<xi>_<eta>_<rep>.txt' (tab-separated edge list)
    and 'communityfile<xi>_<eta>_<rep>.txt' (node <tab> [list of
    communities]) are read; node ids and community labels are both
    shifted by -1 to become 0-based.

    Returns an undirected igraph Graph with vertex attributes
    'comms' (list of community labels) and 'n_comms' (its length).
    """
    suffix = f"{xi}_{eta}_{rep}.txt"
    ## edges -> graph
    edge_df = pd.read_csv(f"{path}networkfile{suffix}", sep='\t', header=None) - 1
    G = ig.Graph.DataFrame(edge_df, directed=False)
    ## communities: second tab-separated field looks like "[3, 28]"
    with open(f"{path}communityfile{suffix}", "r") as infile:
        memberships = [
            [int(label) - 1 for label in row.split('\t')[1].rstrip()[1:-1].split(',')]
            for row in infile
        ]
    G.vs['comms'] = memberships
    G.vs['n_comms'] = [len(m) for m in memberships]
    return G
   

In [4]:
#oNMI = '/Users/francois/Book/GraphMiningNotebooks/oNMI/onmi'          ## overlapping NMI executable
oNMI = '/work/home/fcthebe/Tools/oNMI/onmi'          ## overlapping NMI executable

## input format: each argument is a list of communities, each community
## being a list of node ids (written as one space-separated line per community)
def compute_oNMI(First, Second):
    """Run the external overlapping-NMI executable on two community lists.

    First, Second: iterables of communities (each an iterable of node ids).
    Each one is written to a scratch file in the current directory, one
    community per line, and both file names are passed to `oNMI`.

    Returns the second whitespace-separated token of the executable's
    output, converted to float.

    Raises subprocess.CalledProcessError if the executable exits with a
    non-zero status. The scratch files are removed in every case.
    """
    scratch = []
    try:
        for communities in (First, Second):
            fn = '__'+str(random.random())[2:]
            scratch.append(fn)
            ## newline="" is the documented way to open a file for csv.writer
            with open(fn, "w", newline="") as f:
                csv.writer(f, delimiter=" ").writerows(communities)
        out = subprocess.check_output([oNMI, scratch[0], scratch[1]])
        return float(out.decode("utf-8").split()[1])
    finally:
        ## clean up even when writing or the executable failed
        for fn in scratch:
            if os.path.exists(fn):
                os.remove(fn)


In [5]:
def memberships2list(S):
    """Turn a CSR membership matrix into per-row lists of community ids.

    S must expose `indptr` and `indices` (CSR layout). For every row the
    stored column indices are returned shifted by +1 (1-based ids); a row
    with no stored entry is reported as [0], i.e. the outlier community.
    """
    bounds = S.indptr
    per_row = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        if start != stop:
            per_row.append(list(S.indices[start:stop] + 1))  ## 1-based
        else:
            per_row.append([0])  ## no membership == outlier (community 0)
    return per_row

## given list of node memberships, return list of communities
def mems2comms(X):
    """Invert node memberships into a list of communities.

    X: list where X[i] is the list of (non-negative integer) community
       labels of node i.

    Returns a list `comms` of length max(label)+1 where comms[c] holds
    the indices of the nodes belonging to community c, in increasing
    order. Labels that never occur give empty lists. An empty X, or an X
    in which no node has any label, yields [].
    """
    ## default=-1 makes the "no label at all" case produce zero communities
    ## instead of raising on max() of an empty sequence
    largest = max((label for labels in X for label in labels), default=-1)
    comms = [[] for _ in range(largest + 1)]
    for node, labels in enumerate(X):
        for label in labels:
            comms[label].append(node)
    return comms

In [6]:
from itertools import combinations
def CPM(g, k=3):
    """Clique-percolation communities of graph `g`.

    All cliques of exactly `k` vertices are listed; two cliques are linked
    when they share at least k-1 vertices, and every connected component
    of the resulting clique graph gives one community.

    Returns a list of sets of vertex names (the 'name' attribute of `g`).
    """
    cliques = [set(c) for c in g.cliques(min=k, max=k)]
    n_cliques = len(cliques)
    ## one self-loop per clique, so every clique index appears in the edge list
    links = [(a, a) for a in range(n_cliques)]
    links.extend(
        (a, b)
        for a, b in combinations(range(n_cliques), 2)
        if len(cliques[a] & cliques[b]) >= (k-1)
    )
    clique_graph = ig.Graph(links, directed=False)
    communities = []
    for component in clique_graph.connected_components():
        nodes = set()
        for a in component:
            nodes |= cliques[a]
        communities.append(set(g.vs[nodes]["name"]))
    return communities


In [7]:
def EgoSplit(G, split='CC', algo='LP'):
    """Overlapping communities via ego-net splitting.

    Step 1: each vertex is replaced by one "persona" per cluster of its
    ego-net (the subgraph induced by its neighbours); every neighbour is
    re-attached to the persona of the cluster it falls in.
    Step 2: the persona graph is clustered and each cluster is mapped back
    to the original vertex names.

    G:     igraph Graph with a string 'name' vertex attribute. Vertices are
           looked up by name, so names are assumed to be unique. G itself is
           not modified (a copy is used).
    split: 'LP' clusters each ego-net with label propagation; any other
           value (default 'CC') uses connected components.
    algo:  'LP' clusters the persona graph with label propagation; any
           other value calls g.community_ecg(ens_size=32) (presumably
           provided by the partition_igraph import -- confirm).

    Returns a list of sets of original vertex names, one set per cluster.
    A vertex without neighbours is kept as its own single persona.
    Progress is printed every 1000 processed vertices.
    """
    g = G.copy()
    g.vs['original'] = g.vs['name']
    ## use the vertex names to avoid issues when vertices are re-mapped ...
    names = g.vs['name']
    ## step 1 - ego-net splits
    for ctr, nm in enumerate(names, start=1):
        if ctr%1000==0:
            print(ctr)
        v = g.vs.find(nm).index
        nbrs = g.neighbors(v)
        if not nbrs:
            ## empty ego-net: nothing to split (min() of an empty
            ## membership list would raise), keep the vertex untouched
            continue
        sg = g.subgraph(nbrs)
        if split == 'LP':
            x = sg.community_label_propagation().membership
        else:
            x = sg.connected_components().membership
        if min(x) == -1:
            ## shift labels so that none is negative
            x = [i+1 for i in x]
        for j in set(x):
            g.add_vertex(name=nm+'.'+str(j), original=nm)
        ## one batched call, same edges in the same order as adding them one by one
        g.add_edges([(nm+'.'+str(lbl), nb) for lbl, nb in zip(x, sg.vs['name'])])
        g.delete_vertices(v)
    ## step 2 -- cluster w.r.t. multiple personae
    if algo=='LP':
        cl = g.community_label_propagation()
    else:
        cl = g.community_ecg(ens_size=32)
    return [set(part.vs['original']) for part in cl.subgraphs()]


# Compare densities

In [268]:
## LFRo
## Generate one overlapping-LFR benchmark graph with the external executable,
## load it, attach the ground-truth memberships, and score a Leiden partition
## against the ground truth with overlapping NMI.
LFRo = '/work/home/fcthebe/Tools/oLFR/benchmark' ## overlapping LFR executable
mu = .35
## flag meanings are those of the LFR benchmark program (see its README -- confirm);
## the program's standard output is redirected to the file _temp
cmd = LFRo+' -N 10000 -k 20 -maxk 100 -t1 2.5 -minc 50 -maxc 1150 -t2 1.5 -on 5000 -om 3 -mu '+str(mu)  +' >  _temp'
_ = os.system(cmd)
## network.dat and community.dat are read from the current directory
## (presumably written there by the benchmark run above -- confirm)
g = ig.Graph.Read_Ncol('./network.dat',directed=False)
g = g.simplify()## edges are repeated twice with LFRo
fn = 'community.dat'
L = []
with open(fn, "r") as infile:
    for line in infile:
        ## second tab-separated field: space-separated community ids;
        ## the last token is dropped (presumably the trailing newline -- confirm)
        x = line.split('\t')[1]
        L.append([int(i) for i in x.split(' ')[:-1]])
## vertex name -> vertex index in g
v_dct = {j:i for i,j in enumerate(g.vs['name'])}
## line i of community.dat is attached to the vertex named str(i+1)
## (assumes the file lists nodes 1..N in order -- TODO confirm)
for i in range(len(L)):
    g.vs[v_dct[str(i+1)]]['comms'] = L[i]
g.vs['leiden'] = g.community_leiden(objective_function='modularity').membership
## ground-truth communities, dropping list index 0
GT = mems2comms(g.vs['comms'])[1:]
## Leiden partition as communities (each vertex has exactly one label)
LD = mems2comms([[x] for x in g.vs['leiden']])
compute_oNMI(GT,LD)


0.0820062

In [269]:
## number of edges in the LFR graph g
g.ecount()

102059

In [271]:
## overall edge density of g: number of edges divided by the number of
## unordered pairs of distinct vertices
proba = g.ecount()*2/(g.vcount()*(g.vcount()-1))
proba

0.0020413841384138414

In [272]:
## Sample random pairs of distinct vertices of g and record, for each pair,
## whether it is an edge and how many communities the two vertices share.
## Printed: edge frequency among pairs sharing 0, exactly 1, and more than 1
## communities, each divided by the overall density `proba`.
L = []
for i in range(100000):
    ## draw from the actual vertex count rather than a hard-coded 10000
    u, v = np.random.choice(g.vcount(), 2, replace=False)
    shared = set(g.vs[u]['comms']).intersection(g.vs[v]['comms'])
    L.append([g.are_adjacent(u, v), len(shared)])
_df = pd.DataFrame(L,columns=['edge','inter'])
## .mean() gives nan for an empty bucket instead of a ZeroDivisionError
print( _df[_df.inter==0].edge.mean()/proba,
       _df[_df.inter==1].edge.mean()/proba,
       _df[_df.inter>1].edge.mean()/proba)


0.4283382682777887 8.370715558704045 4.9232533295230105


In [273]:
## how many sampled pairs share 0, 1, 2, ... communities
Counter(_df.inter)


Counter({0: 91491, 1: 8310, 2: 196, 3: 3})

In [None]:
### ABCDoo

In [275]:
## Load one ABCD-oo instance, build its ground-truth communities and
## compute its overall edge density (edges / unordered vertex pairs).
G = readGraph(xi=0.35, eta=1.5, rep=5)
#G.vs['name'] = [str(i) for i in np.arange(G.vcount())]
GT = mems2comms(G.vs['comms'])[1:]  ## ignore "outlier community"
## (a bare G.ecount() stood here; its value was discarded, so it was dropped)
proba = G.ecount()*2/(G.vcount()*(G.vcount()-1))
proba

0.002045104510451045

In [277]:
## Sample random pairs of distinct vertices of G and record, for each pair,
## whether it is an edge and how many communities the two vertices share.
## Printed: edge frequency among pairs sharing 0, exactly 1, and more than 1
## communities, each divided by the overall density `proba`.
L = []
for i in range(1000000):
    ## draw from the actual vertex count rather than a hard-coded 10000
    u, v = np.random.choice(G.vcount(), 2, replace=False)
    shared = set(G.vs[u]['comms']).intersection(G.vs[v]['comms'])
    L.append([G.are_adjacent(u, v), len(shared)])
_df = pd.DataFrame(L,columns=['edge','inter'])
## .mean() gives nan for an empty bucket instead of a ZeroDivisionError
print( _df[_df.inter==0].edge.mean()/proba,
       _df[_df.inter==1].edge.mean()/proba,
       _df[_df.inter>1].edge.mean()/proba)


0.37966483319104294 12.353429238590737 7.327146854910125
