# Chapter 6 - Graph Embeddings

In this notebook, we illustrate several graph embedding algorithms, we show how we can compare embeddings using an unsupervised framework, and we look at various applications such as clustering, link prediction and anomaly detection.

### Requirements

- install node2vec code (see https://snap.stanford.edu/node2vec)
- compile GED code (graph embedding divergence), 
  the base implementation of the framework in C (the code is included, and can also be found at      https://github.com/ftheberge/Comparing_Graph_Embeddings) 
- new package to install: 'pip install --no-dependencies graphrole'

Also set the path(s) in the cell below. For Windows, you may need to use "\\" or "\\\\" as delimiters, for example 'C:\\\\node2vec\\\\node2vec.exe'

Also for windows, "cp" should be changed to "copy" when keeping track of best and worst embeddings.

### Non-deterministic results

Some of the results in this notebook may vary from run to run in particular for node2vec (which uses random walks) and for 2-d renditions of high-dimensional embeddings via UMAP.


In [None]:
## the data directory
datadir = '../Datasets/'

## location of the GED code
## use the '-S' option to use split JS divergence
GED = '../GED/GED'

## location of the node2vec code
n2v = '~/Tools/node2vec/node2vec'

In [None]:
import igraph as ig
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from collections import Counter
import os
import umap
import pickle
import partition_igraph
import subprocess
import scipy.sparse.linalg as lg
from sklearn.cluster import KMeans, DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score as AMI
from graphrole import RecursiveFeatureExtractor, RoleExtractor
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import calinski_harabasz_score as CHS
from sklearn.metrics import silhouette_score as SIL
%config Completer.use_jedi = False

## node and edge greyscale colors
cls_edges = 'gainsboro'
cls = ['silver','dimgray','black']

# A few useful functions

In [None]:
## as defined in the node2vec paper
def binary_operator(u, v, op='had'):
    if op=='had':
        return u * v
    if op=='l1':
        return np.abs(u - v)
    if op=='l2':
        return (u - v) ** 2
    if op=='avg':
        return (u + v) / 2.0

## 'N2K' mapping is used to map between node name and key value in graph when reading results from node2vec
def readEmbedding(fn="_embed", N2K=None):
    D = pd.read_csv(fn, sep=' ', skiprows=1, header=None)
    D = D.dropna(axis=1)
    if N2K!=None:
        x = [N2K[i] for i in D[0]]
        D[0] = x    
        D = D.sort_values(by=0)
    Y = np.array(D.iloc[:,1:])
    return Y

## Read embedding from file in node2vec format
## Map to layout format
## for visualization, we use UMAP if dim > 2
def embed2layout(fn="_embed"):
    D = pd.read_csv(fn, sep=' ', skiprows=1, header=None)
    D = D.dropna(axis=1)
    D = D.sort_values(by=0)
    Y = np.array(D.iloc[:,1:])
    if Y.shape[1]>2:
        Y = umap.UMAP().fit_transform(Y)
    ly = []
    for v in range(Y.shape[0]):
        ly.append((Y[v][0],Y[v][1]))
    return ly


## Computing JS divergence with GED code given edgelist, communities and embedding
def JS(edge_file, comm_file, embed_file, entropy=False):
    if entropy:
        x = GED+' -E -g '+edge_file+' -c '+comm_file+' -e '+embed_file
    else:
        x = GED+' -g '+edge_file+' -c '+comm_file+' -e '+embed_file
    s = subprocess.run(x, shell=True, stdout=subprocess.PIPE)
    x = s.stdout.decode().split(' ')
    div = float(x[1])
    return(div)


## Hope embedding with various similarity functions
def Hope(g, sim='katz', dim=2, verbose=False, beta=.01, alpha=.5):
    ## For undirected graphs, embedding as source and target are identical
    if g.is_directed() == False:
        dim = dim*2
    A = np.array(g.get_adjacency().data)
    beta = beta
    alpha = alpha
    n = g.vcount()
    ## Katz
    if sim == 'katz':
        M_g = np.eye(n) - beta * A
        M_l = beta * A
    ## Adamic-Adar
    if sim == 'aa':
        M_g = np.eye(n)
        ## fix bug 1/x and take log();
        D = np.diag([1/np.log(x) if x>1 else 0 for x in g.degree()]) 
        # D = np.diag([1/np.log(max(2,x)) for x in g.degree()]) 
        M_l = np.dot(np.dot(A,D),A)
        np.fill_diagonal(M_l,0)
    ## Common neighbors
    if sim == 'cn':
        M_g = np.eye(n)
        M_l = np.dot(A,A)
    ## presonalized page rank
    if sim == 'ppr':
        P = []
        for i in range(n):
            s = np.sum(A[i])
            if s>0:
                P.append([x/s for x in A[i]])
            else:
                P.append([1/n for x in A[i]])
        P = np.transpose(np.array(P)) ## fix bug - take transpose
        M_g = np.eye(n)-alpha*P
        M_l = (1-alpha)*np.eye(n)
    S = np.dot(np.linalg.inv(M_g), M_l)
    u, s, vt = lg.svds(S, k=dim // 2)
    X1 = np.dot(u, np.diag(np.sqrt(s)))
    X2 = np.dot(vt.T, np.diag(np.sqrt(s)))
    X = np.concatenate((X1, X2), axis=1)
    p_d_p_t = np.dot(u, np.dot(np.diag(s), vt))
    eig_err = np.linalg.norm(p_d_p_t - S)
    if verbose:
        print('SVD error (low rank): %f' % eig_err)
    ## undirected graphs have identical source and target embeddings
    if g.is_directed() == False:
        d = dim//2
        return X[:,:d]
    else:
        return X

## save to disk to compute divergence
def saveEmbedding(X, g, fn='_embed'):
    with open(fn,'w') as f:
        f.write(str(X.shape[0]) + " " + str(X.shape[1])+'\n')
        for i in range(X.shape[0]):
            f.write(g.vs[i]['name']+' ')
            for j in range(X.shape[1]):
                f.write(str(X[i][j])+' ')
            f.write('\n')

## Laplacian eigenmaps embedding
def LE(g, dim=2):
    L_sym = np.array(g.laplacian(normalized=True))
    w, v = lg.eigs(L_sym, k=dim + 1, which='SM')
    idx = np.argsort(w) # sort eigenvalues
    w = w[idx]
    v = v[:, idx]
    X = v[:, 1:]
    return X.real

## Returns a LaTeX bmatrix
def bmatrix(a):
    if len(a.shape) > 2:
        raise ValueError('bmatrix can at most display two dimensions')
    lines = str(a).replace('[', '').replace(']', '').splitlines()
    rv = [r'\begin{bmatrix}']
    rv += ['  ' + ' & '.join(l.split()) + r'\\' for l in lines]
    rv +=  [r'\end{bmatrix}']
    return '\n'.join(rv)

## Figure 6.1 in the Book

This is to illustrate random walks on (directed) graphs.


In [None]:
g = ig.Graph.Erdos_Renyi(n=4,p=0,directed=True)
g.vs['label'] = ['A','B','C','D']
g.vs['color'] = 'white'
g.add_edges([(0,1),(1,2),(1,3),(2,1),(3,2)])
#ig.plot(g,'tiny.eps',bbox=(0,0,300,200),vertex_label_size=10)
ig.plot(g,bbox=(0,0,300,200),vertex_label_size=10)

# Load and prepare some datasets

* $abcd$: is a small ABCD graph (100 nodes), mainly for visualization and quick exampes
* $ABCD$: is a larger ABCD graph (1000 nodes), for experiments
* $zac$: Zachary (karate club) graph, for visualzation

The small ABCD graph was generated with the following parameters:

```
n = "100"                     # number of vertices in graph
t1 = "3"                      # power-law exponent for degree distribution
d_min = "5"                   # minimum degree
d_max = "15"                  # maximum degree
d_max_iter = "1000"           # maximum number of iterations for sampling degrees
t2 = "2"                      # power-law exponent for cluster size distribution
c_min = "25"                  # minimum cluster size
c_max = "50"                  # maximum cluster size
c_max_iter = "1000"           # maximum number of iterations for sampling cluster sizes
xi = "0.2"                    # fraction of edges to fall in background graph
isCL = "false"                # if "false" use configuration model, if "true" use Chung-Lu
```

The larger ABCD graph was generated with the following parameters:

```
n = "1000"                     # number of vertices in graph
t1 = "3"                       # power-law exponent for degree distribution
d_min = "10"                   # minimum degree
d_max = "100"                  # maximum degree
d_max_iter = "1000"            # maximum number of iterations for sampling degrees
t2 = "2"                       # power-law exponent for cluster size distribution
c_min = "50"                   # minimum cluster size
c_max = "150"                  # maximum cluster size
c_max_iter = "1000"            # maximum number of iterations for sampling cluster sizes
xi = "0.6"                     # fraction of edges to fall in background graph
isCL = "false"                 # if "false" use configuration model, if "true" use Chung-Lu
```


### Load the small ABCD graph and visualize

Node names are integers here, and this should not be confused with the key used in igraph to enumerate the nodes.
In order to avoid such issues, we define a dictionary, *n2k*, to map between node name and its key value.


In [None]:
## read graph and communities
abcd = ig.Graph.Read_Ncol(datadir+'ABCD/abcd_100.dat',directed=False)
c = np.loadtxt(datadir+'ABCD/abcd_100_comms.dat',dtype='uint16',usecols=(1))
abcd.vs['comm'] = [c[int(x['name'])-1] for x in abcd.vs]

## print a few stats
print(abcd.vcount(),'vertices,',abcd.ecount(),'edges,','avg degreee',np.mean(abcd.degree()),
      'communities',max(abcd.vs['comm']))

## ground truth communities
gt = {k:(v-1) for k,v in enumerate(abcd.vs['comm'])}

## map between int(name) to key
n2k = {int(v):k for k,v in enumerate(abcd.vs['name'])}

## define the colors and node sizes here
abcd.vs['size'] = 7
abcd.es['color'] = cls_edges
abcd.vs['color'] = [cls[i-1] for i in abcd.vs['comm']]

#ig.plot(abcd, 'abcd.eps', bbox=(0,0,300,200))
ig.plot(abcd, bbox=(0,0,300,200))

### Load the larger ABCD graph and visualize

This is a larger graph with lots of noise edges ($\xi$=0.6). Nore colours refer to the communities.
With this amount of noise, the communities are far from obvious on a 2-dim layout.

We'll use a version with stronger communities ($\xi$=0.2) for link prediction.


In [None]:
## read graph and communities
ABCD = ig.Graph.Read_Ncol(datadir+'ABCD/abcd_1000.dat',directed=False)
c = np.loadtxt(datadir+'ABCD/abcd_1000_comms.dat',dtype='uint16',usecols=(1))
ABCD.vs['comm'] = [c[int(x['name'])-1] for x in ABCD.vs]

## print a few stats
print(ABCD.vcount(),'vertices,',ABCD.ecount(),'edges,','avg degreee',np.mean(ABCD.degree()),
      'communities',max(ABCD.vs['comm']))

## ground truth communities
GT = {k:(v-1) for k,v in enumerate(ABCD.vs['comm'])}

## map between int(name) to key
N2K = {int(v):k for k,v in enumerate(ABCD.vs['name'])}

## define the colors and node sizes here
## node colors refer to communities
cls_edges = 'gainsboro'
ABCD.vs['size'] = 5
ABCD.es['color'] = cls_edges
pal = ig.RainbowPalette(n=max(ABCD.vs['comm'])+1) 
ABCD.vs['color'] = [pal.get(int(i)) for i in ABCD.vs['comm']]
#ABCD.vs['color'] = 'black'

ig.plot(ABCD, bbox=(0,0,400,300)) ## communities are far from obvious in 2d layout!

### Zachary (karate club) graph

This graph is already included with igraph.


In [None]:
zac = ig.Graph.Famous('zachary')
zac.vs['size'] = 7
zac.vs['name'] = [str(i) for i in range(zac.vcount())]
zac.es['color'] = cls_edges
zac.vs['comm'] = [0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1]
zac.vs['color'] = [cls[i*2] for i in zac.vs['comm']]
#ig.plot(zac, 'zachary.eps', bbox=(0,0,300,200))
ig.plot(zac, bbox=(0,0,300,200))

## Graph layouts 

We show a variety of graph layout functions available in igraph on the Zachary graph.


In [None]:
## Kamada-Kawai layout
ly = zac.layout('kk')

#ig.plot(zac, 'layout_kk.eps', layout=ly, bbox=(0,0,300,200))
ig.plot(zac, layout=ly, bbox=(0,0,300,200))


In [None]:
## Fruchterman-Reingold layout
ly = zac.layout('fr')

#ig.plot(zac, 'layout_fr.eps', layout=ly, bbox=(0,0,300,200))
ig.plot(zac, layout=ly, bbox=(0,0,300,200))

In [None]:
## Multidimensional scaling layout
ly = zac.layout('mds')

#ig.plot(zac, 'layout_mds.eps', layout=ly, bbox=(0,0,300,200))
ig.plot(zac, layout=ly, bbox=(0,0,300,200))

In [None]:
## Circular layout
ly = zac.layout('circle')

#ig.plot(zac, 'layout_circle.eps', layout=ly, bbox=(0,0,300,200))
ig.plot(zac, layout=ly, bbox=(0,0,300,200))

In [None]:
## Grid layout
ly = zac.layout('grid')

#ig.plot(zac, 'layout_grid.eps', layout=ly, bbox=(0,0,300,200))
ig.plot(zac, layout=ly, bbox=(0,0,300,200))

In [None]:
## Sugiyama layout
ly = zac.layout('sugiyama')

#ig.plot(zac, 'layout_tree.eps', layout=ly, bbox=(0,0,300,200))
ig.plot(zac, layout=ly, bbox=(0,0,300,200))

# Generate several embeddings -- Zachary graph

We try a few graph embedding algorithms on the Zachary graph with
different parameters. For example, we try different embedding dimensions.

We run the following:
* node2vec from source code
* HOPE with different similarities
* Laplacian Eigenmaps

For each embedding, we use the ground truth communities along with the framework to compute the "graph embedding divergence" (GED). We visualize some good and bad results.

For embeddings with low divergence, we see good separation of the communities (even in 2-dim projection, using UMAP), while this is not the case for embeddings with high divergence.


In [None]:
L = [] ## to store results
DIM = [2, 5, 10, 15]  ## try embedding in different dimensions
best_jsd = 1    ## keep track of best JS-divergence
worst_jsd = 0   ## and worst one.

## Hope with different choices for the similarity
for dim in DIM:
    for sim in ['katz','ppr','cn','aa']:
        X = Hope(zac,sim=sim,dim=dim) 
        saveEmbedding(X,zac)
        jsd = JS(datadir+'Zachary/zachary.edgelist',datadir+'Zachary/zachary.ecg','_embed')        
        ## keep track of best and worst embeddings
        if jsd < best_jsd:
            os.system('cp _embed _embed_best')
            best_jsd = jsd
        if jsd > worst_jsd:
            os.system('cp _embed _embed_worst')
            worst_jsd = jsd
        L.append([dim,'hope',sim,jsd])

## Laplacian Eigenmap
for dim in DIM:
    X = LE(zac,dim=dim)
    saveEmbedding(X,zac)
    jsd = JS(datadir+'Zachary/zachary.edgelist',datadir+'Zachary/zachary.ecg','_embed')
    ## keep track of best and worst embeddings
    if jsd < best_jsd:
        os.system('cp _embed _embed_best')
        best_jsd = jsd
    if jsd > worst_jsd:
        os.system('cp _embed _embed_worst')
        worst_jsd = jsd
    L.append([dim,'le',' ',jsd])
    
## node2vec 
## we try a few choices for p and q, parameters for the random walks
## on some platforms, we got better results with longer random walks (code commented out below)
for dim in DIM:
    for (p,q) in [(1,0.5),(0.5,1),(1,1)]:
        ## long walks:
        #x = n2v + ' -i:'+datadir+'Zachary/zachary.edgelist -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
        ## short walks (10-long):
        x = n2v + ' -l:10 -i:'+datadir+'Zachary/zachary.edgelist -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
        r = os.system(x+' >/dev/null 2>&1')
        jsd = JS(datadir+'Zachary/zachary.edgelist',datadir+'Zachary/zachary.ecg','_embed')

        ## keep track of best and worst embeddings
        if jsd < best_jsd:
            os.system('cp _embed _embed_best')
            best_jsd = jsd
        if jsd > worst_jsd:
            os.system('cp _embed _embed_worst')
            worst_jsd = jsd
        
        ## store results
        L.append([dim,'n2v',str(p)+' '+str(q),jsd])

## store results in dataframe, show top results w.r.t. JS divergence (lower is better)        
D = pd.DataFrame(L,columns=['dim','algo','param','jsd'])
D = D.sort_values(by='jsd',axis=0)
D.head()


In [None]:
## plot top results
os.system('cp _embed_best _embed')
l = embed2layout()
zac.vs['ly'] = [l[int(v['name'])] for v in zac.vs]
#ig.plot(z, 'zac_high.eps', layout=z.vs['ly'], bbox=(0,0,300,200))
ig.plot(zac,layout=zac.vs['ly'], bbox=(0,0,300,200))

In [None]:
## results with largest JS divergence
D.tail()

In [None]:
## plot result with largest divergence
os.system('cp _embed_worst _embed')
l = embed2layout()
zac.vs['ly'] = [l[int(v['name'])] for v in zac.vs]
#ig.plot(zac, 'zac_high.eps', layout=z.vs['ly'], bbox=(0,0,300,200))
ig.plot(zac,layout=zac.vs['ly'], bbox=(0,0,300,200))

# Generate several embeddings -- small ABCD  graph

This is the same exercise as what we did above, this time for the 100-nodes ABCD graph.
We look at slightly higher embedding dimensions as there are more nodes than the Zachary graph.

In [None]:
L = []
DIM = [2,4,8,16,24,32] ## embedding dimensions
best_jsd = 1           ## keep track of best result
worst_jsd = 0          ## and worst

## Hope with different choices for the similarity
for dim in DIM:
    for sim in ['katz','aa','cn','ppr']:
        X = Hope(abcd,sim=sim,dim=dim) 
        saveEmbedding(X,abcd)
        jsd = JS(datadir+'ABCD/abcd_100.dat',datadir+'ABCD/abcd_100.ecg','_embed')
        ## keep track of best and worst
        if jsd < best_jsd:
            os.system('cp _embed _embed_best')
            best_jsd = jsd
        if jsd > worst_jsd:
            os.system('cp _embed _embed_worst')
            worst_jsd = jsd
        L.append([dim,'hope',sim,jsd])

## Laplacian Eigenmap
for dim in DIM:
    X = LE(abcd,dim=dim)
    saveEmbedding(X,abcd)
    jsd = JS(datadir+'ABCD/abcd_100.dat',datadir+'ABCD/abcd_100.ecg','_embed')
    ## keep track of best and worst
    if jsd < best_jsd:
        os.system('cp _embed _embed_best')
        best_jsd = jsd
    if jsd > worst_jsd:
        os.system('cp _embed _embed_worst')
        worst_jsd = jsd
    L.append([dim,'le',' ',jsd])
    
## node2vec 
## we try a few choices for p and q, parameters for the random walks
## on some platforms, we got better results with longer random walks (code commented out below)for dim in DIM:
    for (p,q) in [(1,0.1),(1,.5),(0.1,1),(.5,1),(1,1)]:
        ## long walks:
        #x = n2v + ' -i:'+datadir+'ABCD/abcd_100.dat -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
        ## short walks:
        x = n2v + ' -l:15 -i:'+datadir+'ABCD/abcd_100.dat -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
        r = os.system(x+' >/dev/null 2>&1')
        jsd = JS(datadir+'ABCD/abcd_100.dat',datadir+'ABCD/abcd_100.ecg','_embed')
        ## keep track of best and worst
        if jsd < best_jsd:
            os.system('cp _embed _embed_best')
            best_jsd = jsd
        if jsd > worst_jsd:
            os.system('cp _embed _embed_worst')
            worst_jsd = jsd
        L.append([dim,'n2v',str(p)+' '+str(q),jsd])

## store in dataframe and show best results        
D = pd.DataFrame(L,columns=['dim','algo','param','jsd'])
D = D.sort_values(by='jsd',axis=0)
D.head()


In [None]:
## plot result with lowest JS divergence
os.system('cp _embed_best _embed')
l = embed2layout()
abcd.vs['ly'] = [l[int(v['name'])-1] for v in abcd.vs]
ig.plot(abcd, layout=abcd.vs['ly'], bbox=(0,0,300,200))


In [None]:
## results with high divergence
D.tail()

In [None]:
## plot result with high divergence
os.system('cp _embed_worst _embed')
l = embed2layout()
abcd.vs['ly'] = [l[int(v['name'])-1] for v in abcd.vs]
ig.plot(abcd, layout=abcd.vs['ly'], bbox=(0,0,300,200))


# Classification on larger ABCD graph

We saw that embedding can be used to visualize graphs. Below we use graph embedding as a way to define a feature vector (point in vector space) for each node, and we use this representation to train a classifier.
We use a saved embedding (48-dimension running HOPE with 'ppr' similarity).

We split the data (the nodes) into a training and testing set. Using the training set, we build a random forest classification model where the classes are the communities for each node.

We then apply this model to the test set.

The graph has 1000 nodes; we use 250 for training and the rest for testing; we obtain good accuracy (around 90%).
What do you think will happen if we increase/decrease the size of the training set?

WE also report the confusion matrix (details in section 6.5 of the book).

Finally, we compare with results obtained via a random classifier where we supply the correct number of classes only, or the number and relative sizes for the classes.

We see that our random forest model gives much better results that with a random classifier.


In [None]:
## load a saved embedding for ABCD graph
X = readEmbedding(fn=datadir+"ABCD/abcd_1000_embed_best")
y = ABCD.vs['comm']

## train/test split
np.random.seed(1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=0)


In [None]:
# Create the model with 100 trees
model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
model.fit(X_train, y_train)

# Class predictions on test data
y_pred = model.predict(X_test)

In [None]:
## Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

## percent correct -- this can vary slightly as we split train/test randomly
print('\naccuracy:',sum(cm.diagonal())/sum(sum(cm)),'\n')
#print(bmatrix(cm))

In [None]:
## compare with random classifier -- assuming we know only the number of classes (12)
acc = []
for rep in range(30): ## repeat 30 times, we'll take average
    y_pred = [x+1 for x in np.random.choice(12,size=len(y_test),replace=True)]
    cm = confusion_matrix(y_test, y_pred)
    acc.append(sum(cm.diagonal())/sum(sum(cm)))
## accuracy
print('\nAverage accuracy:',np.mean(acc))

In [None]:
## compare with random classifier -- using class proportions in training data
ctr = Counter(y_train)
x = [ctr[i+1] for i in range(12)]
s = np.sum(x)
p = [i/s for i in x]
acc = []
for rep in range(30): ## repeat 30 times, we'll take average
    y_pred = [x+1 for x in np.random.choice(12,size=len(y_test),replace=True,p=p)]
    cm = confusion_matrix(y_test, y_pred)
    acc.append(sum(cm.diagonal())/sum(sum(cm)))
## accuracy
print('\nAverage accuracy:',np.mean(acc))


# Clustering in embedded space

Again using the larger ABCD graph, we run some graph clustering algorithms (Louvain and ECG).
We run each algorithm several times are report two statistics:
* the modularity score of the clustering, and
* the adjusted mutual information (AMI) score when comparing with ground-truth (GT) communities.

We do the same this also with the clusters obtained when running k-means (with 5 choices for k) in embedded vector space. 
We use the same saved embedding than in the previous experiment. 
This time, we report:
* the CHS score (Calinski and Harabasz score, or Variance Ratio Criterion)
* the adjusted mutual information (AMI) score when comparing with ground-truth (GT) communities.

In practical applications where we do not have access to the ground-truth, we need some other measure to quantify the quality of the clusters we obtain, such as modularity or CHS. We report AMI for runs with highest score (modularity or CHS) for the 3 clustering algorithms.

The cell below can take a few minutes to run. You can decrease the number of repeats (REP) for faster results.

In [None]:
## load the saved embedding
X = readEmbedding(fn=datadir+"ABCD/abcd_1000_embed_best")

L = [] ## to store results
K = [6,9,12,15,24] ## for k-means (real number of clusters is 12)
REP = 30 ## number of repeats; decrease for faster run

for i in range(REP):
    
    ## run kmeans
    for k in K:
        cl = KMeans(n_clusters=k, n_init=10).fit(X)
        d = {k:v for k,v in enumerate(cl.labels_)}
        scr = CHS(X,cl.labels_) ## CHS
        ami = AMI(list(GT.values()),list(d.values())) ## AMI vs ground truth
        L.append(['km'+str(k),scr,ami])

    ## ECG
    ec = ABCD.community_ecg().membership
    scr = ABCD.modularity(ec) ## modularity
    ami = AMI(list(GT.values()),ec) ## AMI vs ground truth
    L.append(['ecg',scr,ami])
    
    ## Louvain -- permute as this is not done in igraph, so we get different results for each repeat
    p = np.random.permutation(ABCD.vcount()).tolist()
    GG = ABCD.permute_vertices(p)
    l = GG.community_multilevel().membership
    ll = [-1]*len(l)
    for i in range(len(l)):
        ll[i] = l[p[i]]
    scr = ABCD.modularity(ll) ## modularity
    ami = AMI(list(GT.values()),ll) ## AMI vs ground truth
    L.append(['ml',scr,ami])

## store in dataframe
D = pd.DataFrame(L,columns=['algo','scr','ami'])


In [None]:
## AMI results with best scoring clustering for the 3 algorithms
x = list(D[[x.startswith('km') for x in D['algo']]].sort_values(by='scr',ascending=False)['ami'])[0]
print('K-Means AMI:',x)

x = list(D[D['algo']=='ml'].sort_values(by='scr',ascending=False)['ami'])[0]
print('Louvain AMI:',x)

x = list(D[D['algo']=='ecg'].sort_values(by='scr',ascending=False)['ami'])[0]
print('ECG AMI:',x)


Next, we summarize the results for all runs in a boxplot. 
Results with k-means are best when we supply the correct number of clusters (12). 
We also see the high variability when using Louvain instead of ECG.


In [None]:
## boxplot the AMI results 
A = []
algo = ['km6','km9','km12','km15','km24','ml','ecg']
for a in algo:
    A.append(D[D['algo']==a]['ami'])

B = pd.DataFrame(np.transpose(A), 
                 columns=['k-means(6)','k-means(9)','k-means(12)','k-means(15)',
                          'k-means(24)','Louvain','ECG'])
B.boxplot(rot=30,figsize=(7,5))
plt.ylabel('Adjusted Mutual Information (AMI)');
#plt.savefig('embed_cluster.eps')

Below we cluster using the DBSCAN algorithm after reducing the dimension via UMAP.
We found that running a good dimension reduction algorithm before clustering often gives better results.
This is for illustration and
you can experiment with different choices of parameter below as well as diffferent clustering algorithms.

DBSCAN does not always cluster all the points, which can be quite useful in practice. Some points can be tagged as "outliers". Below, we compute AMI with and without the outlying points. 
Result without outliers is quite good (recall that unlike k-mens, we do not supply the number of communities here).


In [None]:
## DBSCAN -- we tried a few 'min_sample' and 'dim' below
## with good results using 8 and 16 resp.
## we try various 'eps' and pick the best via calinski_harabasz_score (CHS)
top = 0
for dim in [16]: ## reduce to this dimension
    for ms in [8]: ## min-sample in DBSCAN
        U = umap.UMAP(n_components=dim).fit_transform(X)
        for e in np.arange(.4,.5,.0025): ## try different values for epsilon
            cl = DBSCAN(eps=e, min_samples=ms ).fit(U)
            labels = cl.labels_
            s = CHS(U,labels) ## CHS score
            if s > top:
                top=s
                e_top=e
                d_top=dim
                m_top=ms

## result with best CHS score
U = umap.UMAP(n_components=d_top).fit_transform(X) 
cl = DBSCAN(eps=e_top, min_samples=m_top).fit(U)

b = [x>-1 for x in cl.labels_]
l = list(GT.values())
v = [l[i] for i in range(len(l)) if b[i]]
print('AMI without outliers:',AMI(v,cl.labels_[b]))
print('AMI with outliers:',AMI(list(GT.values()),cl.labels_))


# Link prediction

Given a graph, link prediction aims at finding pairs of nodes not linked by an edge that are the most likely to actually have an edge between them. This could happen if we have a partial view of a graph, for example if edges 
are observed over some period of time, which new edges are we most likely to observe next?

In order to simulate this situation, we take the ABCD graph with 1,000 nodes and drop 10% of the edges.
We re-compute the embedding (since the graph has changed), train a logistic regression model using pairs
of nodes with and without an edge, and apply the model to a test set consisting of the dropped edges, and other 
pairs of nodes not linked by an edge.

First we try with the current ABCD graph with noise parameter $\xi=0.6$.
Given the large number of "noise" edges, results are not very good, as expected.

We do another test this time with another ABCD graph with $\xi=0.2$, with much better results.


### Link prediction with noisy ABCD graph


In [None]:
## pick 10% edges at random, save new graph as Gp
test_size = int(np.round(.1*ABCD.ecount()))
np.random.seed(123456)
test_eid = np.random.choice(ABCD.ecount(),size=test_size,replace=False)
Gp = ABCD.copy()
Gp.delete_edges(test_eid)

## are there zero-degree nodes in this subgraph?
print('min degree:',np.min(Gp.degree()))

## compute embedding on Gp with parameters that yielded a good embedding for G
X = Hope(Gp, sim='ppr', dim=48)


To build a classifier, we take pairs of nodes (some with edge, some without) and we merge the embedding vectors for those 2 nodes using some binary operator. This generates a feature vector for each pair of nodes we consider.

We build the training data by considering all edges in the subgraph, and an equal number of node pairs without an edge.

From this data, we build a logistic regression model to predict edges vs non-edges.

We then apply the model to the test set which includes the dropped edges, and the same number of non-edges.

We report the accuracy and AUC (area under the ROC curve). 
Results are better than random, but not great; recall that $\xi$=0.6, so the majority of edges are noise to start with, so link prediction is very hard in this case. We try with less noisy graph next.


In [None]:
## Model with Hadamard binary operator (other choices are 'l1', 'l2 and 'avg')
op = 'had'

## Build training data, first the edges
F = []
for e in Gp.es:
    F.append(binary_operator(X[e.tuple[0]],X[e.tuple[1]],op=op))
size = len(F)
f = [1]*size

## then for equal number of non-edges (we over-sample to drop edges or collisions from the list)
## nb: those could include some of the dropped edges, but avoiding those would not be realistic 
e = [tuple(np.random.choice(Gp.vcount(),size=2,replace=False)) for i in range(2*size)]
e = [(min(x),max(x)) for x in e if Gp.get_eid(x[0],x[1],directed=False,error=False) == -1]
non_edges = list(set(e))[:size]
for e in non_edges:
    F.append(binary_operator(X[e[0]],X[e[1]],op=op))
F = np.array(F)
f.extend([0]*size)

## train the model, here a logistic regression
logreg = LogisticRegression()
logreg.fit(F,f)

## prepare test set, first with all dropped edges from G 
X_test = []
for i in test_eid:
    e = ABCD.es[i]
    X_test.append(binary_operator(X[e.tuple[0]],X[e.tuple[1]],op=op))
size = len(X_test)
y_test = [1]*size

## then for equal number of non-edges (we over-sample to drop edges and collisions from the list)
e = [tuple(np.random.choice(ABCD.vcount(),size=2,replace=False)) for i in range(2*size)]
e = [(min(x),max(x)) for x in e if ABCD.get_eid(x[0],x[1],directed=False,error=False) == -1]
non_edges = list(set(e))[:size]
for e in non_edges:
    X_test.append(binary_operator(X[e[0]],X[e[1]],op=op))
X_test = np.array(X_test)
y_test.extend([0]*size)

## apply the model to test data
print('Accuracy of logistic regression classifier with',op,'on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
print('AUC:',roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1]))

### Link prediction with less noisy ABCD graph

Same as above, but with ABCD graph with $\xi=0.2$.


In [None]:
## read graph and communities - graph with xi=0.2
ABCD2 = ig.Graph.Read_Ncol(datadir+'ABCD/abcd_1000_xi2.dat',directed=False)
c = np.loadtxt(datadir+'ABCD/abcd_1000_xi2_comms.dat',dtype='uint16',usecols=(1))
ABCD2.vs['comm'] = [c[int(x['name'])-1] for x in ABCD2.vs]

## pick 10% edges at random, save new graph as Gp
test_size = int(np.round(.1*ABCD2.ecount()))
np.random.seed(123456) ## for reproducibility
test_eid = np.random.choice(ABCD2.ecount(),size=test_size,replace=False)
Gp = ABCD2.copy()
Gp.delete_edges(test_eid)

## are there zero-degree nodes in this subgraph?
print('min degree:',np.min(Gp.degree()))

## compute embedding on Gp with same parameters as above
X = Hope(Gp,sim='ppr', dim=48)


In [None]:
## Train model with Hadamard binary operator (other choices are 'l1', 'l2 and 'avg')
op = 'had'

## Build training data, first the edges
F = []
for e in Gp.es:
    F.append(binary_operator(X[e.tuple[0]],X[e.tuple[1]],op=op))
size = len(F)
f = [1]*size

## then for equal number of non-edges (we over-sample to drop edges and collisions from the list)
e = [tuple(np.random.choice(Gp.vcount(),size=2,replace=False)) for i in range(2*size)]
e = [(min(x),max(x)) for x in e if Gp.get_eid(x[0],x[1],directed=False,error=False) == -1]
non_edges = list(set(e))[:size]
for e in non_edges:
    F.append(binary_operator(X[e[0]],X[e[1]],op=op))
F = np.array(F)
f.extend([0]*size)

## train model
logreg = LogisticRegression()
logreg.fit(F,f)

## prepare test set, first with all dropped edges from G 
X_test = []
for i in test_eid:
    e = ABCD2.es[i]
    X_test.append(binary_operator(X[e.tuple[0]],X[e.tuple[1]],op=op))
size = len(X_test)
y_test = [1]*size

## then for equal number of non-edges (we over-sample to drop edges and collisions from the list)
e = [tuple(np.random.choice(ABCD2.vcount(),size=2,replace=False)) for i in range(2*size)]
e = [(min(x),max(x)) for x in e if ABCD2.get_eid(x[0],x[1],directed=False,error=False) == -1]
non_edges = list(set(e))[:size]
for e in non_edges:
    X_test.append(binary_operator(X[e[0]],X[e[1]],op=op))
X_test = np.array(X_test)
y_test.extend([0]*size)

## apply the model to test data
print('Accuracy of logistic regression classifier with',op,'on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
print('AUC:',roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1]))

Results are much better in this case. Below we plot the ROC curve; the dashed line is the expected random case, which yields AUC = 0.5.

In [None]:
logit_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1])
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, color='gray',label='Logistic Regression (AUC = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('')
plt.legend(loc="lower right")
#plt.savefig('embed_link.eps')
plt.show();

## Larger study -- use classification accuracy for comparing embeddings

We saw earlier an **unsupervised** method for selecting good graph embeddings where we computed some divergence score. In **supervised** case, it is usually better to take advantage of the known labels to compare embeddings.
With this larger experiment, we do the following using the 1,000 nodes ABCD graph. Recall that in this case, the class is the ground-truth community for each node. 

* we partition the nodes into training, validation and test sets in proportion 25%/25%/50%
* we generate 70 different embeddings (3 algorithms, different parameters)
* from each embedding, 
 * we compute the JS divergence (unsupervised score)
 * we use the training data to build a classification model (random forest)
 * we apply this model to the validation set 
 * we compute the accuracy score (supervised score) 

The code to do this is commented out in the cell below as this can take several minutes to run. 
A pickle file with the results is included in data directory and can be read directly.
If you re-run from scratch, the results can differ slightly due to non-deterministic algorithms like node2vec.


In [None]:
## load results from pickle file
with open(datadir+"ABCD/abcd_1000_embeddings.pkl","rb") as f:
    id_train,id_val,id_trainval,id_test,L = pickle.load(f)

## labels for train/validation/test sets
y_all = ABCD.vs['comm']
y_train = [y_all[i] for i in id_train]
y_trainval = [y_all[i] for i in id_trainval] ## training+validation sets
y_val = [y_all[i] for i in id_val]
y_test = [y_all[i] for i in id_test]


Below we compute the rank-based  Kendall-tau correlation between the divergence score (unsupervised) and the accuracy score (supervised). We see negative correlation which is to be expected since respectively low divergence and high accuracy are better.

In [None]:
## correlation: divergence and accuracy
R = pd.DataFrame(L,columns=['dim','algo','param','div','acc'])
from scipy.stats import kendalltau as tau
print(tau(R['div'],R['acc']))


In the next 2 cells, we show the top results on the validation set respectively for the divergence and accuracy scores. We also add two columns with the respective ranks.


In [None]:
## sort by JS-divergence on validation set
R = R.sort_values(by='div',axis=0,ascending=True)
size = R.shape[0]
R['rank_div'] = np.arange(1,size+1,1)
R.head()


In [None]:
## sort by Accuracy on validation set
R = R.sort_values(by='acc',axis=0,ascending=False)
size = R.shape[0]
R['rank_acc'] = np.arange(1,size+1,1)
R.head()


Below we show the lowest accuracy results. We see that there is quite a range of accuracy on the validation set!


In [None]:
R.tail()

###  Apply the models to the test set. 

In the previous cells, we built a table ranking the different algorithms w.r.t. accuracy and divergence using the training and validation sets. Here, we go through the same algorithms in (decreasing) order of accuracy, re-train with each model using the training and validation sets, and apply to the test set.

This takes several minutes to run so a pickle file is provided with the results.

Uncomment the cell below to re-run; results can differ in that case due to non-deterministic algorithms like node2vec

In [None]:
## load test results
with open(datadir+"ABCD/abcd_1000_embeddings_test.pkl","rb") as f:
    top_acc = pickle.load(f)
R['test'] = top_acc
print('mean accuracy over all models on the test set:',np.mean(R['test']))


In [None]:
## top results w.r.t. accuracy on the test set
R = R.sort_values(by='test',axis=0,ascending=False)
R['rank_test'] = np.arange(1,size+1,1)
R.head()

Next, we take the top-10 algorithms w.r.t. divergence on the validation set, and the top-10 algorithms w.r.t. accuracy on the valudation set. We then plot the distribution of results (accuracy) over the test set via box-plots.

As expected, using accuracy (supervised score) yields better results, but the results obtained with the (unsupervised) divergence score are also quite good.


In [None]:
R['test'].loc[:10]

In [None]:
## top results on test set w.r.t. divergence on validation set
R = R.sort_values(by='div',axis=0,ascending=True)
top_div = R['test'].iloc[:10]

## top results on test set w.r.t. accuracy on validation set
R = R.sort_values(by='acc',axis=0,ascending=False)
top_acc = R['test'].iloc[:10]

## pd with mu
B = pd.DataFrame(np.transpose(np.array([top_acc,top_div])), 
                 columns=['Top-10 validation set accuracy','Top-10 divergence score'])
B.boxplot(rot=0,figsize=(7,5), widths=.33)
plt.ylabel('Test set accuracy',fontsize=14);
#plt.savefig('embed_classify.eps')


Another way to compare the results is to plot the accuracy results on the test set as a function of the rank of the algorithms w.r.t. the accuracy score on the validation set (next cell) or the divergence score on the validation set (second next cell).

The correlation is very clear in the first case, and is still quite strong in the second case.

In [None]:
plt.plot(R['rank_acc'],R['test'],'.',color='black')
plt.xlabel('Rank',fontsize=14)
plt.ylabel('Test set accuracy',fontsize=14);
#plt.savefig('rank_accuracy.eps');
print('correlation:',np.corrcoef(R['rank_acc'],R['test'])[0,1])

In [None]:
plt.plot(R['rank_div'],R['test'],'.',color='black')
plt.xlabel('Rank',fontsize=14)
plt.ylabel('Test set accuracy',fontsize=14);
#plt.savefig('rank_divergence.eps');
print('correlation:',np.corrcoef(R['rank_div'],R['test'])[0,1])

Finally, we compare with accuracy obtained with a random classifier, averaging over several runs.

In [None]:
## random classification
ctr = Counter(y_trainval)
x = [ctr[i+1] for i in range(12)]
s = np.sum(x)
p = [i/s for i in x]
acc = []
for rep in range(30):
    y_pred = [x+1 for x in np.random.choice(12,size=len(y_test),replace=True,p=p)]
    cm = confusion_matrix(y_test, y_pred)
    acc.append(sum(cm.diagonal())/sum(sum(cm)))
print('\nRandom classifier average accuracy on test set:',np.mean(acc))


## ReFex: illustrate roles on Zachary graph

We use the 'graphrole' package here. There are two steps (details in section 6.7 of the book):
* extract node features recursively (ReFeX)
* apply non-neg. matrix factorization to recover different roles in the graph (RolX)
We use 3 dimensions for the RolX step. 

Results shos that the 3 roles correspond roughly to: hub nodes, peripherial nodes and nodes in-between those.


In [None]:
# extract features
feature_extractor = RecursiveFeatureExtractor(zac, max_generations=4)
features = feature_extractor.extract_features()
print(f'\nFeatures extracted from {feature_extractor.generation_count} recursive generations:')
features.head(10)

In [None]:
# assign node roles in a dictionary
role_extractor = RoleExtractor(n_roles=3)
role_extractor.extract_role_factors(features)
node_roles = role_extractor.roles
role_extractor.role_percentage.head()

In [None]:
#import seaborn as sns
unique_roles = sorted(set(node_roles.values()))
# uncomment for color plot
# cls = ['red','blue','green']
# map roles to colors
role_colors = {role: cls[i] for i, role in enumerate(unique_roles)}

# store colors for all nodes in G
zac.vs()['color'] = [role_colors[node_roles[node]] for node in range(zac.vcount())]

## Plot with node labels
zac.vs()['size'] = 10
#z.vs()['label'] = [v.index for v in z.vs()]
zac.vs()['label_size'] = 0
#ig.plot(z, 'refex.eps', bbox=(0,0,300,300)) 
ig.plot(zac, bbox=(0,0,300,300)) 



# Anomaly detection

### New dataset -- American College Football Graph

This is a nice, small graph for illustrating anomaly detection methods.
The graph consists of 115 US college football teams (nodes) playing games (edges).

Teams are part of 12 conferences (the 'communities'):
*   0 = Atlantic Coast
*   1 = Big East
*   2 = Big Ten
*   3 = Big Twelve
*   4 = Conference USA
*   5 = Independents
*   6 = Mid-American
*   7 = Mountain West
*   8 = Pacific Ten
*   9 = Southeastern
*  10 = Sun Belt
*  11 = Western Athletic

14 teams out of 115 appear as "anomalies" as can be seen in Figure 5 of [REF], namely:
- 5 teams in #5 conference (Independent) play teams in other conferences (green triangles in plot below)
- 7 teams in #10 conference (Sun Belt) are broken in 2 clumps (pink triangles in plot below) 
- 2 teams from #11 conference play mainly with #10 conference (red triangles below)

[REF]: "Community structure in social and biological networks", M. Girvan and M. E. J. Newman
PNAS June 11, 2002 99 (12) 7821-7826; https://doi.org/10.1073/pnas.122653799



In [None]:
## read graph and communities
cfg = ig.Graph.Read_Ncol(datadir+'Football/football.edgelist',directed=False)
c = np.loadtxt(datadir+'Football/football.community',dtype='uint16',usecols=(0))
cfg.vs['community'] = [c[int(x['name'])] for x in cfg.vs]

## plot the College Football Graph
## show communities in dfferent colors
## show known anomalies as triangles
cfg.vs['shape'] = 'circle'
cfg.vs['anomaly'] = 0
pal = ig.RainbowPalette(n=max(cfg.vs['community'])+1) 
cfg.vs['color'] = [pal.get(int(i)) for i in cfg.vs['community']]
for v in cfg.vs:
    if v['community'] in [5,10] or v['name'] in ['28','58']:
        v['shape']='triangle'
        v['anomaly']=1
ly = cfg.layout_fruchterman_reingold()
ig.plot(cfg, layout=ly, bbox=(0,0,500,300), vertex_size=8, edge_color='lightgray')
#ig.plot(cfg, target="anomaly_0.eps", layout=ly, bbox=(0,0,500,300), vertex_size=8, edge_color='lightgray')


In [None]:
## greyscale version (for the book)
pal = ig.GradientPalette("white","black",max(cfg.vs['community'])+1)
cfg.vs['color'] = [pal.get(int(i)) for i in cfg.vs['community']]
ig.plot(cfg, layout=ly, bbox=(0,0,500,300), vertex_size=8, edge_color='lightgray')
#ig.plot(cfg, target="anomaly_1.eps", layout=ly, bbox=(0,0,500,300), vertex_size=8, edge_color='lightgray')


Here, we try to recover those anomalous teams by running several node2vec embeddings with different parameters.
For each embedding:
* compute JS-divergence using the framework
* compute the entropy of the b-vector for each node (i.e. the probability distribution of edges w.r.t. every community in the geometric Chung-Lu model)
* since we have the ground truth (anomalous nodes), we also compute the area under the ROC curve (AUC)

From those results:
* plot entropy vs divergence
* for some good/bad embedding, we show boxplot for the entropy of anomalous vs other nodes

There are several other methods to find anomalous nodes, but this simple approach yields good results. The rationale is that an "anomalous" node will be difficult to place in a cluster, so the geometric Chung-Lu model will predict edges to several different clusters.


In [None]:
## keep track of best/worst results
best_jsd = 1
worst_jsd = 0
L = []

## node2vec with varying parameters (60 embeddings)
## on some platforms, we got better results with longer random walks (code commented out below)
for dim in np.arange(2,25,2):
    for (p,q) in [(1,0.5),(0.5,1),(1,0.1),(0.1,1),(1,1)]:
        ## long walks:
        #x = n2v + ' -i:'+datadir+'Football/football.edgelist -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
        ## short walks:
        x = n2v + ' -l:15 -i:'+datadir+'Football/football.edgelist -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
        r = os.system(x+' >/dev/null 2>&1') ## to avoid long output
        jsd = JS(datadir+'Football/football.edgelist',datadir+'Football/football.ecg','_embed',entropy=True)
        ## keep track of best and worst
        if jsd < best_jsd:
            os.system('cp _entropy _entropy_best')
            best_jsd = jsd
        if jsd > worst_jsd:
            os.system('cp _entropy _entropy_worst')
            worst_jsd = jsd

        ent = list(pd.read_csv('_entropy',header=None)[1])
        cfg.vs['ent'] = ent
        roc = roc_auc_score(cfg.vs['anomaly'], ent)
        L.append([dim,'n2v',str(p)+' '+str(q),jsd,roc])        

## store results in dataframe and show best ones w.r.t. divergence
D = pd.DataFrame(L,columns=['dim','algo','param','jsd','auc'])
D = D.sort_values(by='jsd',axis=0)
D.head()


In [None]:
## show worst results (high divergence) 
D.tail()

Below, we plot the AUC (w.r.t. ground truth) as a function of the divergence. We see the (negative) correltion between those quantities.

In [None]:
## auc vs divergence (jsd)
plt.plot(D['jsd'],D['auc'],'o',color='black')
plt.xlabel('JS Divergence',fontsize=14)
plt.ylabel('AUC',fontsize=14);
#plt.savefig('anomaly_2.eps')

In the next two plots, we compare the distributions of entropy for the anomalous and "regular" nodes, respectively with a good (low divergence) and bad embedding. Separation is clearer in the first case, as expected.

In [None]:
## Entropy scores - some good embedding
cfg.vs['ent'] = list(pd.read_csv('_entropy_best',header=None)[1])
X = [v['ent'] for v in cfg.vs if v['anomaly']==0]
Y = [v['ent'] for v in cfg.vs if v['anomaly']==1]
plt.boxplot([X,Y],labels=['Regular','Anomalous'],sym='.',whis=(0,100), widths=.5)
plt.title("Low divergence embedding",fontsize=14)
plt.ylabel('Entropy',fontsize=14);
#plt.savefig('anomaly_3.eps')

In [None]:
## Entropy scores - some not so good embedding
cfg.vs['ent'] = list(pd.read_csv('_entropy_worst',header=None)[1])
X = [v['ent'] for v in cfg.vs if v['anomaly']==0]
Y = [v['ent'] for v in cfg.vs if v['anomaly']==1]
plt.boxplot([X,Y],labels=['Regular','Anomalous'],sym='.',whis=(0,100), widths=.5)
plt.title("High divergence embedding",fontsize=14)
plt.ylabel('Entropy',fontsize=14);
#plt.savefig('anomaly_4.eps')

### Variation:  combining several good embeddings

We consider the top-$k$ embeddings w.r.t. divergence and compare sum ranks w.r.t. entropy score over those.
The hope is to add stability by using a ensemble of models.
You can try other ideas for combining results from different models.


In [None]:
from scipy.stats import rankdata

## try with top-k embeddings together - hopefully this consistently yields high AUC
k = 7
cfg.vs['rank'] = 0
for i in range(k):
    dim = D.iloc[i]['dim']
    p = float(D.iloc[i]['param'].split()[0])
    q = float(D.iloc[i]['param'].split()[1])
    x = 'node2vec -i:'+datadir+'Football/football.edgelist -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
    ## if you get unstable results, you can try with shorter random walks, for example:
    ## x = 'node2vec -l:15 -i:'+datadir+'Football/football.edgelist -o:_embed -d:'+str(dim)+' -p:'+str(p)+' -q:'+str(q)
    r = os.system(x+' >/dev/null 2>&1') ## to avoid long output
    jsd = JS(datadir+'Football/football.edgelist',datadir+'Football/football.ecg','_embed',entropy=True)
    cfg.vs['ent'] = list(pd.read_csv('_entropy',header=None)[1])
    rk = rankdata(cfg.vs['ent'])
    for i in range(len(rk)):
        cfg.vs[i]['rank'] += rk[i] ## add ranks
print('AUC: ',roc_auc_score(cfg.vs['anomaly'], cfg.vs['rank']))