The following code will install required Python packages. It needs to be run only once:
```
using PyCall
# install into the same Python interpreter that PyCall is bound to
run(`$(PyCall.python) -m pip install python-igraph`)
run(`$(PyCall.python) -m pip install umap-learn`)
# the PyPI distribution is named `scikit-learn`; the `sklearn` alias is deprecated
# and pip refuses to install it (the import name remains `sklearn`)
run(`$(PyCall.python) -m pip install scikit-learn`)
run(`$(PyCall.python) -m pip install partition_igraph`)
```

In this notebook we also show how to use both LightGraphs.jl and igraph from Julia.

## Requirements

* set the directories in the next cell

In [None]:
## set those accordingly
## datadir: folder holding the datasets read by this notebook
datadir = "../Datasets/"
## Julia does not expand "~" in paths (only a shell does), so resolve it explicitly
abcd_path = expanduser("~/ABCD/utils/")

In [None]:
## report a very wide display (presumably so that wide tables are not truncated)
ENV["COLUMNS"] = "1000"

In [None]:
using PyCall

In [None]:
using LightGraphs

In [None]:
using GraphPlot

In [None]:
using Random

In [None]:
using DataFrames

In [None]:
using Statistics

In [None]:
using StatsBase

In [None]:
using PyPlot

In [None]:
using FreqTables

In [None]:
using ABCDGraphGenerator

In [None]:
ig = pyimport("igraph")

In [None]:
umap = pyimport("umap")

In [None]:
partition_igraph = pyimport("partition_igraph")

In [None]:
AMI = pyimport("sklearn.metrics").adjusted_mutual_info_score
MI = pyimport("sklearn.metrics").mutual_info_score
ARI = pyimport("sklearn.metrics").adjusted_rand_score
NMI = pyimport("sklearn.metrics").normalized_mutual_info_score

## Zachary (karate) graph

A small graph with 34 nodes and two "ground-truth" communities;
modularity-based algorithms will typically find 4 or 5 communities.

In [None]:
## karate-club graph with its two ground-truth communities
Random.seed!(2)
z_lg = smallgraph(:karate)
## 0-based class labels (as in Python), shifted by one so they can index `col`
comm = 1 .+ [0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1]
col = ["red", "green"]
## nodes are labelled with 0-based ids
gplot(z_lg;
      nodelabel=0:nv(z_lg)-1,
      nodefillc=col[comm], NODESIZE=0.06,
      edgestrokec="gray", EDGELINEWIDTH=0.2)

## Node roles --
 
We compute z(v) (normalized within module degree) and p(v) (participation coefficients) as defined in section 5.2 of the book. 

We identify 3 types of nodes

* provincial hubs
* peripheral nodes (non-hubs)
* ultra peripheral nodes (non-hubs)
    

In [None]:
## one row per node: 0-based id, community, degree and in-community degree
z_df = DataFrame(id=0:nv(z_lg)-1, comm=comm, deg=degree(z_lg), in_deg=0)
for e in edges(z_lg)
    tail, head = src(e), dst(e)
    ## only edges whose endpoints share a community count as internal
    z_df.comm[tail] == z_df.comm[head] || continue
    z_df.in_deg[tail] += 1
    z_df.in_deg[head] += 1
end
z_df.out_deg = z_df.deg .- z_df.in_deg
## z: in-community degree standardized within each community (uncorrected std)
transform!(groupby(z_df, :comm),
           :in_deg => (d -> (d .- mean(d)) ./ std(d, corrected=false)) => :z);
## p: one minus the squared shares of internal and external edges
z_df.p = 1 .- (z_df.in_deg ./ z_df.deg) .^ 2 .- (z_df.out_deg ./ z_df.deg) .^ 2
## NOTE: sort! reorders z_df in place, so rows no longer follow node order afterwards
first(sort!(z_df, :z, rev=true), 10)

In [None]:
Random.seed!(2)
## one colour per ROW of z_df, chosen from the (z, p) role thresholds
col = map(z_df.z, z_df.p) do zv, pv
    if zv < 2.5                 # non-hubs
        if pv < 0.05
            "red"               # ultra peripheral
        elseif pv < 0.62
            "blue"              # peripheral
        else
            "black"             # should not happen
        end
    else                        # hubs
        pv < 0.3 ? "green" :    # provincial hub
                   "black"      # should not happen
    end
end
## z_df has been sorted by z, so its rows are no longer in node order:
## map the colours back to node order (ids are 0-based) before colouring the graph
gplot(z_lg,
      NODESIZE=0.06, nodefillc=col[sortperm(z_df.id)],
      EDGELINEWIDTH=0.2, edgestrokec="gray",
      nodelabel=0:nv(z_lg)-1)

In [None]:
## (p, z) scatter plot of the node roles, one point per row of z_df
subplots(figsize=(12, 9))
scatter(z_df.p, z_df.z, marker="o", s=75, color=col)

## horizontal line at z = 2.5 and vertical line at p = 0.05
plot([0, .5], [2.5, 2.5], color="k", linestyle="-", linewidth=2)
plot([.05, .05], [-1.0, 2.4], color="k", linestyle="-", linewidth=2)

## write each node id slightly above-left of its point
for (node, px, zy) in zip(z_df.id, z_df.p, z_df.z)
    annotate(string(node), (px - 0.003, zy + 0.07))
end

xlabel("participation coefficient (p)", fontsize=16)
ylabel("normalized within module degree (z)", fontsize=16);

### Checking the communities w.r.t. strong/weak definitions


In [None]:
## strong criterion: every node must have more internal than external edges;
## list the nodes that violate it (only two do)
filter(row -> row.in_deg <= row.out_deg, z_df)

In [None]:
## weak criterion: per community, total internal degree > total external degree
## (both communities satisfy it)
combine(groupby(z_df, :comm, sort=true), :in_deg => sum, :out_deg => sum)

### Hierarchical clustering and dendrogram


In [None]:
# switch to igraph (through PyCall), loading the same graph by name
z = ig.Graph.Famous("zachary")

In [None]:
## Girvan-Newman algorithm (edge betweenness); the result is cut into a chosen
## number of clusters with `gn.as_clustering(k)` in the cells that follow
gn = z.community_edge_betweenness()

In [None]:
## allow more rows in the displayed output for the table below
ENV["LINES"] = "40"

# data frame showing assignment of vertices to clusters as a function of number of clusters
# (one column per cut, from 34 clusters down to 1)
DataFrame(map(k -> gn.as_clustering(k).membership, 34:-1:1), Symbol.(34:-1:1))

In [None]:
## back to a shorter display
ENV["LINES"] = "20"

In [None]:
## modularity of the cut into k clusters, for every k from 1 to 34
q = map(k -> z.modularity(gn.as_clustering(k)), 1:34)
plot(1:34, q, "o-", color="black")
xlabel("number of clusters", fontsize=14)
ylabel("modularity", fontsize=14);

In [None]:
## show result with 2 clusters -- only 1 node is misclassified
## (z_df is sorted by z, so ground truth is put back in node order first)
let truth = z_df.comm[sortperm(z_df.id)], found = gn.as_clustering(2).membership
    println("AMI: ", AMI(truth, found))
    println("q: ", z.modularity(found))
    freqtable(truth, found)
end

In [None]:
## show result with optimal modularity (5 clusters)
## (z_df is sorted by z, so ground truth is put back in node order first)
let truth = z_df.comm[sortperm(z_df.id)], found = gn.as_clustering(5).membership
    println("AMI: ", AMI(truth, found))
    println("q: ", z.modularity(found))
    freqtable(truth, found)
end

## ABCD graph with 100 nodes

This graph has 3 communities; with hierarchical clustering, we compare modularity and AMI for each possible cut.

Parameters: gamma=3, tau=2, degree range [5,15], comm size range [25,50], xi=.2.

In [None]:
## read graph and communities
g = ig.Graph.Read_Ncol(datadir * "ABCD/abcd_100.dat", directed=false)
## second whitespace-separated field of every line of the communities file
c_raw = [parse(Int, split(row)[2])
         for row in readlines(datadir * "ABCD/abcd_100_comms.dat")]
## community of each igraph vertex, looked up through the vertex "name" attribute
c = [c_raw[parse(Int, vtx.attributes()["name"])] for vtx in g.vs]

## the same edge list as a LightGraphs graph on 100 nodes
g_lg = SimpleGraph(100)
for row in eachline(datadir * "ABCD/abcd_100.dat")
    add_edge!(g_lg, parse.(Int, split(row))...)
end

In [None]:
## draw the graph, one colour per ground-truth community
Random.seed!(2)
gplot(g_lg;
      nodefillc=["red", "green", "blue"][c_raw], NODESIZE=0.03,
      edgestrokec="gray", EDGELINEWIDTH=0.2)

### Girvan-Newman algorithm -- modularity and AMI for each cut

In [None]:
## Girvan-Newman dendrogram, scored at every cut from 1 to vcount() clusters
gn = g.community_edge_betweenness()
q = map(k -> g.modularity(gn.as_clustering(k)), 1:g.vcount())
a = map(k -> AMI(c, gn.as_clustering(k).membership), 1:g.vcount())
plot(1:g.vcount(), q, ".-", color="black", label="modularity")
plot(1:g.vcount(), a, ".-", color="grey", label="AMI")
xlabel("number of clusters", fontsize=14)
ylabel("modularity or AMI", fontsize=14)
legend();

In [None]:
## modularity and AMI side by side; row i corresponds to the cut into i clusters
DataFrame(q=q, AMI=a)

In [None]:
## AMI and modularity of the 3-cluster cut
g_gn = gn.as_clustering(n=3).membership
println("AMI: $(AMI(c, g_gn))")
println("q: $(g.modularity(g_gn))")

In [None]:
## what would we get with 4 clusters?
## we see that a few nodes get split off from one community
freqtable(c, gn.as_clustering(n=4).membership)

In [None]:
## those form a triangle
## names of the vertices given cluster label 3 in the 4-cluster cut
cluster3 = [parse(Int, vtx.attributes()["name"])
            for (label, vtx) in zip(gn.as_clustering(n=4).membership, g.vs)
            if label == 3]
## induced_subgraph returns (subgraph, vertex map); only the subgraph is drawn
gplot(first(induced_subgraph(g_lg, cluster3)))

## ABCD with varying xi

Here we show a typical way to compare graph clustering using benchmark graphs. 

We pick some model, here ABCD, and we vary the noise parameter (0 <= xi <= 1). 

With ABCD, the larger xi is, the closer we are to a random Chung-Lu or configuration model graph (i.e. where only the degree distribution matters). 

For xi=0, we get pure communities (all edges are internal).

We show how to load a pickle file created in Python (the detailed code for generating this file is given in the Python notebooks section).

In [None]:
pickle = pyimport("pickle")

In [None]:
## load the study results; NOTE: unpickling can execute arbitrary code, so only
## open files that come from a trusted source
fh = py"open"(datadir * "ABCD/abcd_study.pkl", "rb")
L = try
    pickle.load(fh)
finally
    ## close the handle even when unpickling fails
    fh.close()
end
## one row per run, then the average AMI per (algorithm, xi) pair
D = DataFrame(L, [:algo, :xi, :AMI])
X = combine(groupby(D, [:algo, :xi], sort=true), :AMI => mean)

In [None]:
## mean AMI per algorithm as a function of the noise level
a = ["ECG","Louvain","Infomap","Label Prop."]
lt = ["-","--",":","-.","--",":"]
cl = ["blue","green","purple","red","red","blue"]
for i in eachindex(a)
    sel = X.algo .== a[i]
    ## plot against the actual xi values so the x axis matches its label
    plot(X.xi[sel], X.AMI_mean[sel], lt[i], label=a[i], color=cl[i])
end
xlabel("ABCD noise (xi)",fontsize=14)
ylabel("AMI",fontsize=14)
legend();

###  Look at standard deviations

In [None]:
## standard deviation of the AMI per (algorithm, xi) pair
S = combine(groupby(D, [:algo, :xi], sort=true), :AMI => std)
a = ["ECG","Louvain","Infomap","Label Prop."]
lt = ["-","--",":","-.","--",":"]
cl = ["blue","green","purple","red","red","blue"]
for i in eachindex(a)
    sel = S.algo .== a[i]
    ## plot against the actual xi values so the x axis matches its label
    plot(S.xi[sel], S.AMI_std[sel], lt[i], label=a[i], color=cl[i])
end
xlabel("ABCD noise (xi)",fontsize=14)
ylabel("Standard Deviation (AMI)",fontsize=14)
legend();

### Compare stability 

This study is similar to the previous one, but we compare successive partitions for each algorithm instead of comparing with the ground truth.

We show how to load a pickle file created in Python (the detailed code for generating this file is given in the Python notebooks section).

In [None]:
## load the stability study results; NOTE: unpickling can execute arbitrary code,
## so only open files that come from a trusted source
fh = py"open"(datadir * "ABCD/abcd_study_stability.pkl", "rb")
Ls = try
    pickle.load(fh)
finally
    ## close the handle even when unpickling fails
    fh.close()
end

## store in dataframe and take averages
D = DataFrame(Ls, [:algo,:xi,:AMI])
X = combine(groupby(D, [:algo, :xi], sort=true), :AMI => mean)

In [None]:
## mean AMI between successive runs, per algorithm, as a function of the noise level
a = ["ECG","Louvain","Infomap","Label Prop."]
lt = ["-","--",":","-."]
cl = ["blue","green","purple","red","red","blue"]
for i in eachindex(a)
    sel = X.algo .== a[i]
    ## plot against the actual xi values so the x axis matches its label
    plot(X.xi[sel], X.AMI_mean[sel], lt[i], label=a[i], color=cl[i])
end
xlabel("ABCD noise (xi)",fontsize=14)
ylabel("AMI between successive runs",fontsize=14)
legend();

## Modularity, resolution limit and rings of cliques

In [None]:
## n cliques of size s
"""
    ringOfCliques(n, s)

Return a `SimpleGraph` on `n*s` vertices made of `n` cliques of `s` consecutive
vertices each. The last vertex of every clique is joined to the first vertex of the
next clique, and the last vertex of the graph is joined back to vertex 1.
"""
function ringOfCliques(n, s)
    roc = SimpleGraph(n * s)
    ## cliques: vertices s*(i-1)+1 : s*i are pairwise connected
    for i in 1:n
        members = (s * (i - 1) + 1):(s * i)
        for j in members, k in (j + 1):last(members)
            add_edge!(roc, j, k)
        end
    end
    ## ring: start at i = 1 -- vertices are 1-based, so the i = 0 step of the
    ## previous version asked for an edge from the non-existent vertex 0
    for i in 1:n-1
        add_edge!(roc, s * i, s * i + 1)
    end
    ## close the ring
    add_edge!(roc, n * s, 1)
    return roc
end

## Ex: 10 3-cliques, drawn with a spectral layout
roc = ringOfCliques(10, 3)
gplot(roc;
      layout=spectral_layout,
      edgestrokec="gray", EDGELINEWIDTH=0.1,
      NODESIZE=0.03)

In [None]:
## igraph (Python) version of ringOfCliques, defined in the embedded interpreter and
## called from Julia as py"ringOfCliques". Vertices are 0-based here: it starts from
## n*s isolated vertices (Erdos_Renyi with p=0), adds the edges of every s-clique,
## then links the last vertex of each clique to the first vertex of the next one,
## with (n*s-1, 0) closing the ring.
py"""
import numpy as np
import igraph as ig

def ringOfCliques(n,s):
    roc = ig.Graph.Erdos_Renyi(n=n*s,p=0)
    ## cliques
    for i in range(n):
        for j in np.arange(s*i,s*(i+1)):
            for k in np.arange(j+1,s*(i+1)):
                roc.add_edge(j,k)
    ## ring
    for i in range(n):
        if i>0:
            roc.add_edge(s*i-1,s*i)
        else:
            roc.add_edge(n*s-1,0)
    return roc
"""

In [None]:
## Compare number of cliques and number of clusters found
D = DataFrame(n=Int[], Louvain=Int[], ECG=Int[], CNM=Int[])
s = 3
for n in 3:3:48
    roc = py"ringOfCliques"(n, s)
    ## memberships are 0-based, so the largest label + 1 is the number of clusters
    push!(D, (n,
              maximum(roc.community_multilevel().membership) + 1,
              maximum(roc.community_ecg().membership) + 1,
              maximum(roc.community_fastgreedy().as_clustering().membership) + 1))
end

## one black curve per algorithm, told apart by line style
for (algo, style) in (("Louvain", "--o"), ("ECG", "-o"), ("CNM", ":o"))
    plot(D.n, D[!, algo], style, color="black", label=algo)
end

xlabel("number of $s-cliques",fontsize=14)
ylabel("number of clusters found",fontsize=14)
legend(fontsize=14);

In [None]:
## Louvain communities with 10 3-cliques
roc = py"ringOfCliques"(n=10, s=3)
## shift igraph's 0-based labels so they can index the colour list
membership = 1 .+ roc.community_multilevel().membership

roc = ringOfCliques(10, 3) # this time igraph and LightGraphs node indices match
gplot(roc;
      layout=spectral_layout,
      nodefillc=["red", "green", "blue", "orange", "purple"][membership],
      edgestrokec="gray", EDGELINEWIDTH=0.1,
      NODESIZE=0.03)

In [None]:
## ECG weights in this case: all 30 clique edges have max score
## (frequency table of the `W` values returned by community_ecg)
freqtable(py"ringOfCliques"(n=10,s=3).community_ecg().W)

# Ego nets and more

* we consider the airport graph we already saw
* we consider a simple, undirected version (no loops, directions or edge weights)
* we compare ego-nets (1 and 2-hops subgraphs from a given node) with clusters obtained via graph clustering

As above, since some algorithms are available only in Python, we show how to use igraph from Julia.

In [None]:
## Build the airport graph inside the embedded Python interpreter. The names defined
## here (D, g, A, lookup, l) live in Python, not in Julia, and `g` is reused by the
## py"..." cells that follow. Note that `datadir` is set again on the Python side.
## Steps: read the connections into a directed weighted graph, make it undirected and
## simple, then attach 'layout', 'state' and 'city' vertex attributes taken from
## airports_loc.csv, matched through the airport code stored as the vertex name.
py"""
import pandas as pd
import igraph as ig

datadir = "../Datasets/"

D = pd.read_csv(datadir+'Airports/connections.csv')
g = ig.Graph.TupleList([tuple(x) for x in D.values], directed=True, edge_attrs=['weight'])
g = g.as_undirected()
g = g.simplify()

## read vertex attributes and add to graph
A = pd.read_csv(datadir+'Airports/airports_loc.csv')
lookup = {k:v for v,k in enumerate(A['airport'])}
l = [lookup[x] for x in g.vs()['name']]
g.vs()['layout'] = [(A['lon'][i],A['lat'][i]) for i in l]
g.vs()['state'] = [A['state'][i] for i in l]
g.vs()['city'] = [A['city'][i] for i in l]
"""

In [None]:
## pick a vertex
## `v` is a 0-based igraph vertex index defined on the Python side; the plotting
## cells further down highlight the vertex named "MQT"
py"""
v = 207
"""

## degree of the chosen vertex and the vertex itself
py"g.degree()[v],g.vs[v]"

In [None]:
## show its ego-net: the subgraph induced by the order-1 neighbourhood of v
## (the result is bound to the Julia variable `sg`)
sg = py"g.subgraph([i for i in g.neighborhood(v,order=1)])"
println(sg.vcount(), " nodes")
#ig.plot(sg,bbox=(0,0,300,300))

In [None]:
## copy the igraph subgraph into LightGraphs (0-based -> 1-based vertex ids)
g = SimpleGraph(sg.vcount())
foreach(e -> add_edge!(g, e.source + 1, e.target + 1), sg.es())
Random.seed!(1)
## the vertex named "MQT" is drawn in black, all others in red
gplot(g; nodefillc=[vtx.attributes()["name"] == "MQT" ? "black" : "red"
                    for vtx in sg.vs()])

In [None]:
## show its 2-hops ego-net, this is already quite large!
## On the Python side: take the order-2 neighbourhood subgraph, store each vertex's
## coreness in 'core', and drop the vertices whose coreness is below 2.
py"""
sg = g.subgraph([i for i in g.neighborhood(v,order=2)])
sg.vs()['core'] = sg.coreness()
sg.delete_vertices([v for v in sg.vs if v['core']<2])
"""
## bring the Python object `sg` into Julia
sg = py"sg"
println(sg.vcount(), " nodes")

In [None]:
## copy the igraph subgraph into LightGraphs (0-based -> 1-based vertex ids)
g = SimpleGraph(sg.vcount())
foreach(e -> add_edge!(g, e.source + 1, e.target + 1), sg.es())
# selected node is larger
Random.seed!(3)
gplot(g; NODESIZE=[vtx.attributes()["name"] == "MQT" ? 0.05 : 0.01
                   for vtx in sg.vs()])

In [None]:
## apply clustering, show cluster containing the selected vertex
## recall that we ignore edge weights
## On the Python side: run ECG with an ensemble of 32, store its edge scores in the
## edge attribute 'W', keep the vertices that share the cluster of v, and reduce that
## subgraph to the vertices with coreness of at least 2.
py"""
ec = g.community_ecg(ens_size=32)
g.es['W'] = ec.W
m = ec.membership[v]
sg = g.subgraph([i for i in range(g.vcount()) if ec.membership[i]==m])
sg.vs()['core'] = sg.coreness()
## display the 2-core
sg.delete_vertices([v for v in sg.vs if v['core']<2])
"""
## bring the Python object `sg` into Julia
sg = py"sg"
println(sg.vcount()," nodes")

In [None]:
## copy the igraph subgraph into LightGraphs (0-based -> 1-based vertex ids)
g = SimpleGraph(sg.vcount())
foreach(e -> add_edge!(g, e.source + 1, e.target + 1), sg.es())
# selected node is larger
Random.seed!(3)
gplot(g; NODESIZE=[vtx.attributes()["name"] == "MQT" ? 0.05 : 0.01
                   for vtx in sg.vs()])

In [None]:
## On the Python side: keep only the edges of `sg` whose ECG score 'W' exceeds
## `thresh`, find the connected component that contains the vertex named 'MQT',
## and reduce it to the vertices with coreness of at least 2 (stored in `ssg`).
py"""
## filter edges w.r.t. ECG votes (weights)
## you can adjust the threshold to get different zooming
thresh = .9
tmp = sg.subgraph_edges([e for e in sg.es if e['W'] > thresh])
n = [i for i in range(tmp.vcount()) if tmp.vs[i]['name']=='MQT'][0]
tmp.vs['cl'] = tmp.clusters().membership
cl = tmp.vs[n]['cl']
ssg = tmp.subgraph([i for i in tmp.vs if i['cl']==cl])
ssg.vs()['core'] = ssg.coreness()
ssg.delete_vertices([v for v in ssg.vs if v['core']<2])
"""

## bring the Python object `ssg` into Julia
ssg = py"ssg"
println(ssg.vcount(), " nodes")

In [None]:
## copy the igraph subgraph into LightGraphs (0-based -> 1-based vertex ids)
g = SimpleGraph(ssg.vcount())
foreach(e -> add_edge!(g, e.source + 1, e.target + 1), ssg.es())
Random.seed!(1)
## the vertex named "MQT" is drawn in black, all others in red
gplot(g; nodefillc=[vtx.attributes()["name"] == "MQT" ? "black" : "red"
                    for vtx in ssg.vs()])

In [None]:
## states in the above subgraph
## (frequency table of the 'state' vertex attribute over the vertices of ssg)
freqtable([n.attributes()["state"] for n in ssg.vs])

# ABCD Properties

We show ABCD graphs with different xi (noise) parameters;

This is for illustration purposes only:

* notice the density of edges between communities as xi increases.
* most runs should yield 3 communities, but this can vary when we re-run ABCD samplers

In [None]:
## degree sequence for the small ABCD graphs drawn below
## NOTE(review): arguments look like (exponent, min degree, max degree, number of
## nodes, max iterations) -- confirm against the ABCDGraphGenerator documentation
degs = ABCDGraphGenerator.sample_degrees(2.5, 5, 15, 100, 1000)

In [None]:
## community sizes for the small ABCD graphs drawn below
## NOTE(review): arguments look like (exponent, min size, max size, number of
## nodes, max iterations) -- confirm against the ABCDGraphGenerator documentation
coms = ABCDGraphGenerator.sample_communities(1.5, 30, 50, 100, 1000)

In [None]:
## ABCD graph with noise xi = 0.05, drawn with one colour per community
xi = 0.05
g_src = ABCDGraphGenerator.gen_graph(
    ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false))
## copy the generated edge list into a LightGraphs graph on 100 nodes
g = SimpleGraph(100)
foreach(e -> add_edge!(g, e...), g_src.edges)
gplot(g; nodefillc=["red", "green", "blue"][g_src.clusters])

In [None]:
## ABCD graph with noise xi = 0.15, drawn with one colour per community
xi = 0.15
g_src = ABCDGraphGenerator.gen_graph(
    ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false))
## copy the generated edge list into a LightGraphs graph on 100 nodes
g = SimpleGraph(100)
foreach(e -> add_edge!(g, e...), g_src.edges)
gplot(g; nodefillc=["red", "green", "blue"][g_src.clusters])

In [None]:
## ABCD graph with noise xi = 0.33, drawn with one colour per community
xi = 0.33
g_src = ABCDGraphGenerator.gen_graph(
    ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false))
## copy the generated edge list into a LightGraphs graph on 100 nodes
g = SimpleGraph(100)
foreach(e -> add_edge!(g, e...), g_src.edges)
gplot(g; nodefillc=["red", "green", "blue"][g_src.clusters])

In [None]:
## ABCD graph with noise xi = 0.5, drawn with one colour per community
xi = 0.5
g_src = ABCDGraphGenerator.gen_graph(
    ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false))
## copy the generated edge list into a LightGraphs graph on 100 nodes
g = SimpleGraph(100)
foreach(e -> add_edge!(g, e...), g_src.edges)
gplot(g; nodefillc=["red", "green", "blue"][g_src.clusters])

## Measures

* We illustrate the importance of using proper adjusted measures when comparing partitions
* We generate some ABCD graph and compare ground truth with random partitions of different sizes

In [None]:
## degree sequence for the larger ABCD graph used to compare partition measures
## NOTE(review): arguments look like (exponent, min degree, max degree, number of
## nodes, max iterations) -- confirm against the ABCDGraphGenerator documentation
degs = ABCDGraphGenerator.sample_degrees(2.5, 5, 50, 1000, 1000)

In [None]:
## community sizes for the larger ABCD graph used to compare partition measures
## NOTE(review): arguments look like (exponent, min size, max size, number of
## nodes, max iterations) -- confirm against the ABCDGraphGenerator documentation
coms = ABCDGraphGenerator.sample_communities(1.5, 75, 150, 1000, 1000)

In [None]:
## generate the ABCD graph with noise level xi = 0.1
xi = 0.1
g_src = ABCDGraphGenerator.gen_graph(
    ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false))

In [None]:
## igraph copy of the generated graph: one vertex per entry of g_src.clusters and
## no edges to start with
gp = ig.Graph(length(g_src.clusters))
## add all edges in one call (igraph vertex ids are 0-based); inserting them one
## at a time is much slower on the igraph side
gp.add_edges([(from - 1, to - 1) for (from, to) in g_src.edges])

In [None]:
## ground-truth communities of the generated graph
gt = g_src.clusters

In [None]:
## RAND Index: given two clusterings u and v
"""
    RI(u, v)

Return the Rand index between the clusterings `u` and `v`, given as equally long
collections of cluster labels (position `i` holds the label of item `i`).

The result is the fraction of item pairs on which the two clusterings agree, i.e.
pairs put together by both or separated by both. Labels may be of any hashable
type. The result is `NaN` when there are fewer than two items (no pairs).

Throws `DimensionMismatch` if `u` and `v` have different lengths.
"""
function RI(u, v)
    length(u) == length(v) || throw(DimensionMismatch(
        "clusterings must have the same length, got $(length(u)) and $(length(v))"))

    n = length(u)
    ## contingency counts gathered in a single pass over the items
    joint = Dict{Tuple{eltype(u),eltype(v)},Int}()
    size_u = Dict{eltype(u),Int}()
    size_v = Dict{eltype(v),Int}()
    for (a, b) in zip(u, v)
        joint[(a, b)] = get(joint, (a, b), 0) + 1
        size_u[a] = get(size_u, a, 0) + 1
        size_v[b] = get(size_v, b, 0) + 1
    end

    ## number of unordered pairs inside a group of size k
    npairs(k) = k * (k - 1) ÷ 2
    together_both = 0
    for k in values(joint)
        together_both += npairs(k)
    end
    together_u = 0
    for k in values(size_u)
        together_u += npairs(k)
    end
    together_v = 0
    for k in values(size_v)
        together_v += npairs(k)
    end

    ## agreements = pairs together in both + pairs separated in both
    total = npairs(n)
    return (total + 2 * together_both - together_u - together_v) / total
end

In [None]:
## score 100 uniformly random partitions against the ground truth, for every
## number of parts from 2 to 20
D = DataFrame(size=Int[], MI=Float64[], NMI=Float64[], AMI=Float64[],
              RI=Float64[], ARI=Float64[], GRI=Float64[], AGRI=Float64[])
n = length(gt)
## gam takes partitions as dictionaries keyed by 0-based vertex id
tc = Dict(zip(0:n-1, gt))
ar = 2:20
for s in ar, _ in 1:100
    r = rand(1:s, n)
    rc = Dict(zip(0:n-1, r))
    push!(D, (s,
              MI(gt, r), NMI(gt, r), AMI(gt, r),
              RI(gt, r), ARI(gt, r),
              gp.gam(tc, rc, adjusted=false), gp.gam(tc, rc)))
end

In [None]:
## average of every measure per number of random clusters, keeping the column names
R = combine(groupby(D, :size), names(D, Not(:size)) .=> mean, renamecols=false)

In [None]:
## information-based measures against the number of random clusters
for (measure, style) in (("MI", ":"), ("NMI", "--"), ("AMI", "-"))
    plot(ar, R[!, measure], style, color="black", label=measure)
end
xlabel("number of random clusters", fontsize=14)
legend();

In [None]:
## pair-counting measures against the number of random clusters
plot(ar, R.RI,":",color="black",label="RI")
plot(ar, R.GRI,"--",color="black",label="GRI")
## ARI and AGRI share one line style and one legend entry; draw both of them
## (previously ARI was drawn twice and AGRI not at all)
plot(ar, R.ARI,"-",color="black",label="ARI/AGRI")
plot(ar, R.AGRI,"-",color="black")
xlabel("number of random clusters",fontsize=14)
legend();