# Chapter 5 - Community Detection

In this notebook, we explore several algorithms to find communities in graphs.

In some cells, we use the ABCD benchmark to generate synthetic graphs with communities; the generator is installed from https://github.com/bkamins/ABCDGraphGenerator.jl

In [None]:
## root directory of the datasets read by this notebook (paths are built as datadir * "...")
datadir = "../Datasets/"

In [None]:
using ABCDGraphGenerator
using CSV, DataFrames
using DelimitedFiles
using Graphs
using GraphMakie, GLMakie
using NetworkLayout
using PyPlot
using PyCall
using Random
using Serialization
using StatsBase
using StatsPlots

In [None]:
## install the required Python packages with the Python interpreter that PyCall uses
run(`$(PyCall.python) -m pip install python-igraph pycairo scikit-learn partition-igraph`)

In [None]:
## Python modules accessed through PyCall
ig = pyimport("igraph")
## NOTE(review): presumably imported for its side effect of adding `community_ecg`
## to igraph graphs (used further down) -- confirm
ig_part = pyimport("partition_igraph")
random = pyimport("random")
np = pyimport("numpy")

In [None]:
## scikit-learn metrics: AMI to compare partitions, ROC/AUC for the anomaly-detection scores
AMI = pyimport("sklearn.metrics").adjusted_mutual_info_score
roc_curve = pyimport("sklearn.metrics").roc_curve
roc_auc_score = pyimport("sklearn.metrics").roc_auc_score

# Zachary (karate) graph

This is a small graph with 34 nodes and two ground-truth communities.
Modularity-based algorithms will typically find 4 or 5 communities.
In the next cells, we look at this small graph from several different angles.


In [None]:
## Zachary karate club graph and its ground-truth communities (one label, 1 or 2, per vertex)
g_karate = smallgraph(:karate)
karate_comms = [1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2];

In [None]:
"""
    clean_graphplot(G::AbstractGraph; kwargs...)

Draw `G` with `graphplot`, forwarding all keyword arguments, then hide the axis
decorations and spines so that only the graph itself is visible.
Return the figure.
"""
function clean_graphplot(G::AbstractGraph; kwargs...)
    figure, axis, _ = graphplot(G; kwargs...)
    ## strip ticks, labels and grid first, then the frame around the axis
    hidedecorations!(axis)
    hidespines!(axis)
    return figure
end

In [None]:
## color nodes by ground-truth community (1 -> white, 2 -> light gray), labelled by vertex id
clean_graphplot(g_karate,
    layout=Stress(),
    node_size=25,
    ilabels=repr.(1:nv(g_karate)),
    node_color=[:white, :lightgray][karate_comms],
    edge_color=:grey)

## Node Roles
 
We compute $z(v)$ (normalized within-module degree) and $p(v)$ (participation coefficient) as defined in section 5.2 of the book for the Zachary graph `g_karate`.

We identify 3 types of nodes, as described in the book.

* provincial hubs
* peripheral nodes (non-hubs)
* ultra peripheral nodes (non-hubs)

In [None]:
## normalized within-module degree (z(v))
"""
    nwmd(G::SimpleGraph, A::Vector{Int})

Return, for every vertex `v` of `G`, the z-score of its within-module degree:
the number of neighbours of `v` that share its community label `A[v]`, centred
and scaled by the mean and standard deviation of that quantity over the
vertices of the same community.

A community with a single vertex, or in which all vertices have the same
within-module degree, has an undefined or zero standard deviation, so the
corresponding entries are `NaN`.
"""
function nwmd(G::SimpleGraph, A::Vector{Int})
    # within-module degree: neighbours of v carrying the same label as v
    deg_in = [count(i -> A[i] == A[v], neighbors(G, v)) for v in 1:nv(G)]
    # per-community statistics are keyed by the label itself, so the lookup
    # below depends neither on the iteration order of a Set nor on the labels
    # being exactly 1:k
    labels = unique(A)
    deg_in_mean = Dict(c => mean(deg_in[A .== c]) for c in labels)
    deg_in_std = Dict(c => std(deg_in[A .== c]) for c in labels)
    return [(deg_in[v] - deg_in_mean[A[v]]) / deg_in_std[A[v]] for v in 1:nv(G)]
end

## participation coefficient (p(v))
"""
    pc(G::SimpleGraph, A::Vector{Int})

Return, for every vertex `v`, one minus the sum over communities of the squared
fraction of `v`'s neighbours that carry that community label in `A`.
"""
function pc(G::SimpleGraph, A::Vector{Int})
    deg = degree(G)
    return Float64[
        ## countmap gives, per community label, the number of neighbours of v in it
        1 - sum(cnt -> (cnt / deg[v])^2, values(countmap(A[neighbors(G, v)])))
        for v in 1:nv(G)
    ]
end

In [None]:
## compute z (normalized within-module degree) and p (participation coefficient)
## for the karate graph, with respect to its ground-truth communities
karate_z = nwmd(g_karate, karate_comms)
karate_p = pc(g_karate, karate_comms);

### Looking at $z(v)$ and $p(v)$

Below, we plot the Zachary graph with respect to $z(v)$ where $z(v) > 2.5$ are **hubs**, which we show as **white square** nodes.

The largest values are for node 1 (instructor), node 34 (president) and node 33 (nodes are numbered from 1 in this notebook).
Nodes 1 and 34 are the key nodes for the division of the group into factions.

The **ultra-peripheral** nodes are shown with a darker color.

In [None]:
## Node roles, stored per vertex so that colors and markers always stay aligned
## with the vertex ids (one entry per vertex, whatever the values of z and p):
##   hub              z >= 2.5            -> white square
##   ultra-peripheral z < 2.5, p < 0.05   -> light gray circle
##   anything else                        -> white circle
karate_color = fill(:white, nv(g_karate))
karate_marker = fill(:circle, nv(g_karate))
for v in vertices(g_karate)
    if karate_z[v] >= 2.5
        ## hubs (all provincial here)
        karate_marker[v] = :rect
    elseif karate_p[v] < 0.05
        ## ultra-peripheral
        karate_color[v] = :lightgray
    end
end

In [None]:
## node shape and color encode the roles stored in karate_marker / karate_color
clean_graphplot(g_karate,
    layout=Stress(),
    node_size=25,
    node_marker=karate_marker,
    ilabels=repr.(1:nv(g_karate)),
    node_color=karate_color,
    edge_color=:lightgrey
)

### Figure 5.3(b)

The code below is to generate Figure 5.3(b) in the book, again comparing node roles in the Zachary graph.


In [None]:
## Figure 5.3(b) -- comparing the roles
fig, ax = subplots(figsize=(12, 9))
ax.scatter(karate_p, karate_z, marker="o", s=75, color="k")

## role boundaries: horizontal line at z = 2.5, vertical line at p = 0.05
PyPlot.plot([0, 0.5], [2.5, 2.5], color="k", linestyle="-", linewidth=2)
PyPlot.plot([0.05, 0.05], [-0.5, 2.4], color="k", linestyle="-", linewidth=2)

## labelled nodes: (text, vertex, arrow-tip offset, text offset);
## both offsets are relative to the point (p(v), z(v))
for (text, v, tip, txt) in (
    ("node 1", 1, (0.0, -0.05), (0.01, -0.3)),
    ("node 34", 34, (0.0, -0.05), (-0.07, -0.3)),
    ("node 33", 33, (-0.005, 0.0), (-0.07, 0.0)),
    ("node 2", 2, (0.0, -0.05), (-0.07, -0.3)),
    ("node 4", 4, (0.0, -0.05), (0.07, -0.3)),
    ("node 3", 3, (0.0, -0.05), (-0.07, -0.3)),
)
    ax.annotate(text, (karate_p[v] + tip[1], karate_z[v] + tip[2]),
        xytext=(karate_p[v] + txt[1], karate_z[v] + txt[2]),
        fontsize=14,
        arrowprops=Dict("arrowstyle" => "-", "connectionstyle" => "angle3,angleA=0,angleB=-90"))
end

## region captions
ax.annotate("provincial hubs", (0.3, 3), fontsize=18)
ax.annotate("peripheral non-hubs", (0.3, 1.8), fontsize=18)
ax.annotate("ultra peripheral non-hubs", (0.025, 0.0), xytext=(0.1, 0), fontsize=18,
    arrowprops=Dict("arrowstyle" => "->", "connectionstyle" => "angle3,angleA=0,angleB=-90"))

xlabel("participation coefficient (p(v))", fontsize=16)
ylabel("normalized within module degree (z(v))", fontsize=16);

### Looking at a few other community-based features

We already saw the *normalized within-module degree* $z(v)$ and *participation coefficient* $p(v)$.

Recall that a high value for $z(v)$ is indicative of a hub. 

For $p(v)$, a value close to zero indicates homogeneity of communities amongst $v$'s neighbours, while a high value indicates heterogeneity.

Below we compute the *community distribution distance* (cdd) and the *community association strength* (cas).

In [None]:
# community distribution distance
"""
    cdd(G::Graph, A::Vector{Int})

Return, for every vertex, the Euclidean distance between the distribution of
its neighbours over the communities in `A` and the distribution of the total
degree (volume) of the graph over those communities.

Labels in `A` must be non-negative integers; they are shifted by one when used
as indices, so both 0-based and 1-based labellings are accepted.
The result is a `Vector{Float64}`; a vertex of degree zero yields `NaN`.
"""
function cdd(G::Graph, A::Vector{Int})
    deg = degree(G)
    Vol = sum(deg)
    max_comm = maximum(A)

    # share of the graph volume held by each community (slot = label + 1)
    Vol_A = zeros(Float64, max_comm + 1)
    for i in 1:nv(G)
        Vol_A[A[i]+1] += deg[i]
    end
    Vol_A ./= Vol

    # typed result vector: callers get Float64 values rather than Vector{Any}
    cdd_values = Vector{Float64}(undef, nv(G))
    deg_A = zeros(Float64, max_comm + 1)
    for i in 1:nv(G)
        # number of neighbours of i in each community
        fill!(deg_A, 0.0)
        for v in neighbors(G, i)
            deg_A[A[v]+1] += 1
        end
        cdd_values[i] = sqrt(sum((deg_A ./ deg[i] .- Vol_A) .^ 2))
    end
    return cdd_values
end

# community association strength
"""
    cas(G::Graph, A::Vector{Int})

Return, for every vertex `i`, the fraction of its neighbours lying in its own
community minus `(Vol(A_i) - deg(i)) / Vol(G)`, where `Vol(A_i)` is the total
degree of the community of `i` and `Vol(G)` the total degree of the graph.

Labels in `A` must be non-negative integers (0-based or 1-based).
A vertex of degree zero yields `NaN` instead of raising an error.
"""
function cas(G::Graph, A::Vector{Int})
    deg = degree(G)
    # `count` is well defined for a vertex without neighbours, whereas summing
    # an empty generator throws
    deg_int = [count(i -> A[i] == A[j], neighbors(G, j)) for j in 1:nv(G)]
    Vol = sum(deg)
    max_comm = maximum(A)

    # volume of each community (slot = label + 1)
    Vol_A = zeros(Float64, max_comm + 1)
    for i in 1:nv(G)
        Vol_A[A[i]+1] += deg[i]
    end

    return [deg_int[i] / deg[i] - (Vol_A[A[i]+1] - deg[i]) / Vol for i in 1:nv(G)]
end

In [None]:
## cdd and cas for the karate graph, with respect to its ground-truth communities
karate_cdd = cdd(g_karate, karate_comms)
karate_cas = cas(g_karate, karate_comms);

Below we show the nodes with low *cas* values with white color. We see that those correspond to nodes that are at the boundary between communities.

We also compute the Pearson correlation coefficient between the community-based measures we computed.


In [None]:
## nodes whose 'cas' is at or below the 15% quantile are shown in white
th = quantile(karate_cas, 0.15)
karate_plot = clean_graphplot(g_karate,
    layout=Stress(),
    node_size=25,
    node_marker=karate_marker,
    ilabels=repr.(1:nv(g_karate)),
    node_color=ifelse.(karate_cas .> th, :lightgrey, :white),
    edge_color=:lightgrey
)

In [None]:
## correlation between various community-based measures
## (one column per measure: z, p, cdd, cas; entries are converted to Float64 first)
cor(Float64.([karate_z karate_p karate_cdd karate_cas]))

## Strong and weak communities

Communities can be defined as strong or weak as per (5.1) and (5.2) in the book.

For the Zachary graph, we verify if nodes within communities satisfy the strong criterion, then we verify if the two communities satisfy the weak definition.

For the strong definition (internal degree larger than external degree for each node), only two nodes do not qualify: nodes 3 and 10.

For the weak definition (total community internal degree > total community external degree), both communities satisfy this criterion.


In [None]:
## strong criterion: report every node whose internal degree does not exceed its external degree
for i in vertices(g_karate)
    nbrs = neighbors(g_karate, i)
    internal = count(v -> karate_comms[v] == karate_comms[i], nbrs)
    external = length(nbrs) - internal
    internal <= external || continue
    println("node $(i) has internal degree $(internal) external degree $(external)")
end

In [None]:
## weak criterion: total internal (I) and external (E) degree of each community
I = zeros(Int, 2)
E = zeros(Int, 2)
for i in vertices(g_karate)
    comm = karate_comms[i]
    nbrs = neighbors(g_karate, i)
    internal = count(v -> karate_comms[v] == comm, nbrs)
    I[comm] += internal
    E[comm] += length(nbrs) - internal
end
println("community 1 internal degree $(I[1]) external degree $(E[1])")
println("community 2 internal degree $(I[2]) external degree $(E[2])")


## Hierarchical clustering and dendrogram

Girvan-Newman algorithm is described in section 5.5 of the book. We apply it to the Zachary graph and show the results of this divisive algorithm as a dendrogram.


In [None]:
## Girvan-Newman algorithm
## run on igraph's built-in copy of the Zachary graph; `gn` is the resulting dendrogram,
## drawn on a matplotlib axis
ig_karate = ig.Graph.Famous("zachary")
gn = ig_karate.community_edge_betweenness()
fig, ax = subplots(figsize=(6, 6));
ig.plot(gn, target=ax);

This is an example of a hierarchical clustering. In the next plot, we compute modularity for each possible cut of the dendrogram.

We see that we get strong modularity with 2 clusters, but maximal value is obtained with 5.


In [None]:
## modularity of the dendrogram cut into k clusters, for k = 1, ..., number of nodes
q = map(k -> ig_karate.modularity(gn.as_clustering(n=k)), vertices(g_karate))
PyPlot.plot(1:nv(g_karate), q, "o-", color=:gray)
xlabel("number of clusters", fontsize=14)
ylabel("modularity", fontsize=14);

How are the nodes partitioned if we pick only 2 communities? How does this compare to the underlying ground truth?

From the plot below, we see that only 1 node is misclassified (node 3 was "white" in the plot with ground-truth communities)

The modularity of this partition, $q = 0.35996$. 

We also compare the partition with ground truth via AMI (adjusted mutual information), as defined in section 5.3 of the book; we got a high value AMI = 0.83276 showing  strong concordance. 


In [None]:
## show result with 2 clusters
## igraph membership labels start at 0, hence the `.+1` when indexing the color palette
gn_2n = gn.as_clustering(n=2).membership
println("AMI: $(AMI(karate_comms, gn_2n))")  ## adjusted mutual information
println("q: $(ig_karate.modularity(gn_2n))") ## modularity
clean_graphplot(g_karate,
    layout=Stress(),
    node_size=25,
    node_marker=karate_marker,
    ilabels=1:nv(g_karate),
    node_color=[:white, :lightgrey][gn_2n.+1],
    edge_color=:lightgrey
)

Below we show the same plot as above, but we label the nodes with respect to the 5 communities found by modularity-based algorithms. 

We color the nodes with respect to the two ground-truth communities.

We indeed see that in that case, we get higher modularity, but weaker AMI value.

Other than breaking up each community in two, we see that node 3 is mis-labelled (as already noticed), and node 10 is isolated in its own community (community #5). 

Recall that those were the only two nodes **not** having **strong connectivity** with respect to the ground truth communities.


In [None]:
## show result with optimal modularity (5 clusters)
## node labels show the detected community (shifted to start at 1),
## node colors show the ground-truth community
gn_5n = gn.as_clustering(n=5).membership
println("AMI: $(AMI(karate_comms, gn_5n))")
println("q: $(ig_karate.modularity(gn_5n))")
clean_graphplot(g_karate,
    layout=Stress(),
    node_size=25,
    node_marker=karate_marker,
    ilabels=gn_5n .+ 1,
    node_color=[:white, :lightgrey][karate_comms],
    edge_color=:lightgrey
)

# ABCD graph with 100 nodes

Next we look at a slightly larger graph generated with the ABCD benchmark model, which is described in section 5.3 of the book. This graph has 3 communities. 
Using hierarchical clustering, we compare modularity and AMI for each possible cut.

The ABCD parameters used to generate this graph are (see the content of `abcd_config.toml`): 
* $\gamma=3$
* degree range [5,15]
* $\tau=2$
* community size range [25,50]
* $\xi=.2$.

In [None]:
## run the ABCD generator script in a separate Julia process
## NOTE(review): presumably writes the edge.dat / com.dat files read by the next cell -- confirm in abcd_config.toml
run(`julia --project abcd.jl abcd_config.toml`)

In [None]:
## build the graph from the tab-separated edge list; communities are in column 2 of com.dat
n = 100
g_abcd = SimpleGraph(n)
edgelist = collect(eachrow(readdlm("edge.dat", '\t', Int)))
foreach(row -> add_edge!(g_abcd, row...), edgelist)
abcd_comms = readdlm("com.dat", '\t', Int)[:, 2];

In [None]:
## one color per ground-truth community (labels 1..3 index the palette)
clean_graphplot(g_abcd,
    layout=Stress(),
    node_color=[:white, :gray, :black][abcd_comms],
    node_strokewidth=1,
    edge_color=:lightgray)

### Girvan-Newman algorithm

We plot the modularity and AMI for each cut from the GN algorithm.

In this case, both modularity and AMI are maximized with 3 communities.


In [None]:
## same graph as an igraph object; vertex ids are shifted from 1-based (Julia) to 0-based (igraph)
ig_abcd = ig.Graph()
ig_abcd.add_vertices(n)
ig_abcd.add_edges([(src - 1, dst - 1) for (src, dst) in edgelist])

In [None]:
## modularity (q) and AMI against the ground truth (a) for every cut of the GN dendrogram
gn = ig_abcd.community_edge_betweenness()
q = Float64[]
a = Float64[]
foreach(1:n) do k
    cut = gn.as_clustering(n=k)
    push!(q, ig_abcd.modularity(cut))
    push!(a, AMI(abcd_comms, cut.membership))
end
PyPlot.plot(1:n, q, ".-", color="black", label="modularity")
PyPlot.plot(1:n, a, ".-", color="grey", label="AMI")
xlabel("number of clusters", fontsize=14)
ylabel("modularity or AMI", fontsize=14)
legend();

We see that with 3 communities, $q=0.494$ and AMI=1, so perfect recovery.

In [None]:
## modularity and AMI for the first 5 cuts (1 to 5 clusters)
first(DataFrame(n_comm=1:n, q=q, AMI=a), 5)

What would we get with 4 clusters, for which AMI = 0.94 is also quite high?
We see below that a few nodes are split off from one community.

In [None]:
## 4 communities
g_abcd_4n = gn.as_clustering(n=4).membership .+ 1
## label of the smallest community: findmin on the label => size map returns
## (smallest size, label), so the label is the SECOND component
small_comm = findmin(countmap(g_abcd_4n))[2]
## highlight the edges that have both endpoints inside the smallest community
edges_color = fill(:lightgray, ne(g_abcd))
for (i, e) in enumerate(edges(g_abcd))
    if g_abcd_4n[src(e)] == small_comm && g_abcd_4n[dst(e)] == small_comm
        edges_color[i] = :black
    end
end
## nodes of the smallest community are drawn larger
clean_graphplot(g_abcd,
    layout=Stress(),
    node_size=ifelse.(g_abcd_4n .== small_comm, 15, 10),
    node_color=[:white, :gray, :black, :lightgray][g_abcd_4n],
    node_strokewidth=1,
    edge_color=edges_color)

# Anomaly detection

### ABCD graph with outliers

Similar to the 100-node ABCD graph, but this time, 10 nodes are "outliers" (marked as community 0)


In [None]:
## run the ABCD generator script with the outlier configuration
## NOTE(review): presumably writes the edge_outliers.dat / com_outliers.dat files read by the next cell -- confirm
run(`julia --project abcd.jl abcd_outliers_config.toml`)

In [None]:
## build the graph with outliers from its edge list; communities are in column 2 of com_outliers.dat
n = 100
g_abcd_o = SimpleGraph(n)
edgelist = collect(eachrow(readdlm("edge_outliers.dat", '\t', Int)))
foreach(row -> add_edge!(g_abcd_o, row...), edgelist)
abcd_comms_o = readdlm("com_outliers.dat", '\t', Int)[:, 2]

In [None]:
## one color per community label; the labels index the 4-color palette directly
clean_graphplot(g_abcd_o,
    layout=Spring(),
    node_color=[:white, :gray, :black, :lightgray][abcd_comms_o],
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## community-based features for outlier and other nodes
df = DataFrame(community=abcd_comms_o,
    cdd=cdd(g_abcd_o, abcd_comms_o),
    cas=cas(g_abcd_o, abcd_comms_o),
    pc=pc(g_abcd_o, abcd_comms_o))
## per community, first and third quartile of every feature; each feature column is
## paired with both quantile functions and the outputs are named <feature>_Q1 / <feature>_Q3
dfg = combine(groupby(df, :community), Not(:community) .=>
    [x -> quantile(x, 0.25) x -> quantile(x, 0.75)] .=> [x -> x * "_Q1" x -> x * "_Q3"])
## order the feature columns alphabetically, keeping "community" first
select(dfg, "community", sort(names(dfg)[2:end])...)

In [None]:
## plot - node size is 8 + 1.2 / cas, i.e. it grows as the community association
## strength (cas) of a node gets smaller
clean_graphplot(g_abcd_o,
    layout=Spring(),
    node_size=[8 + 1.2 / x for x in df.cas],
    node_color=[:white, :gray, :black, :lightgray][abcd_comms_o],
    node_strokewidth=1,
    edge_color=:lightgray)

## American College Football Graph

This is a nice, small graph for illustrating anomaly detection methods.

[Ref]: "Community structure in social and biological networks", M. Girvan and M. E. J. Newman PNAS June 11, 2002 99 (12) 7821-7826; https://doi.org/10.1073/pnas.122653799
with corrections to the labels as described in: https://arxiv.org/pdf/1009.0638

The graph has 115 nodes, 613 edges and after the corrections, there are  12 communities corresponding to Football Conferences.

One of these communities (labelled as '5') is in fact a group of **independent** teams, which we use as a surrogate for **outlier** nodes.

In [None]:
## community label of every team, then the graph itself;
## vertex ids in the edge list are shifted by one before being added to the graph
fbl_comms = vec(readdlm(datadir * "Football/football.community", Int))
fbl_n = length(fbl_comms)
g_fbl = SimpleGraph(fbl_n)
fbl_edgelist = collect(eachrow(readdlm(datadir * "Football/football.edgelist", Int)))
foreach(row -> add_edge!(g_fbl, (row .+ 1)...), fbl_edgelist)

In [None]:
## plot the College Football Graph
## show communities in different colors
## show known anomalies (community label 5) as triangles
clean_graphplot(g_fbl,
    node_strokewidth=1,
    node_size=12,
    node_color=fbl_comms .+ 1,
    node_attr=(colormap=:lightrainbow,),
    node_marker=ifelse.(fbl_comms .== 5, :utriangle, :circle),
    edge_color=:lightgray)

In [None]:
## greyscale version (for the book)
## same plot as above, with the :Greys colormap
clean_graphplot(g_fbl,
    node_strokewidth=1,
    node_size=12,
    node_color=fbl_comms .+ 1,
    node_attr=(colormap=:Greys,),
    node_marker=ifelse.(fbl_comms .== 5, :utriangle, :circle),
    edge_color=:lightgray)

### Anomaly detection

We apply some of the community-based measures we defined earlier to find the *anomalous* nodes, namely looking at:

* the participation coefficient (pc) - high values are indicative of anomalous nodes
* the community association strength (cas) - low values are indicative of anomalous nodes
* the community distribution distance (cdd) - low values are indicative of anomalous nodes

The rationale is that an *anomalous* node will be difficult to place in a single cluster.
In the plot below, we show the distribution of those scores amongst the anomalous and non-anomalous nodes respectively.
For the communities, we use the results from the ECG algorithm rather than using the ground-truth communities.

We also show the ROC curves for each measure and compute the AUC (area under the ROC curve).
A ROC curve is a plot of the true positive rate (TPR) against the false positive rate (FPR) at each threshold value.
The area under the ROC curve, the AUC, can be interpreted as the probability that a randomly chosen positive case has a higher 
score than a randomly chosen negative case.


In [None]:
## igraph version of the football graph, built from the edge list as read from file
## (no shift applied here, unlike for the Graphs.jl version)
ig_fbl = ig.Graph()
ig_fbl.add_vertices(fbl_n)
ig_fbl.add_edges(fbl_edgelist)

In [None]:
## communities found by ECG, and the known anomalies (community label 5) as a 0/1 target
cfg_comm = ig_fbl.community_ecg(final="leiden").membership
anomaly = Int.(fbl_comms .== 5)

## anomaly scores, computed once per measure; cdd and cas are flipped (1 - value)
## so that, like pc, a larger score points to an anomaly
score_cdd = 1 .- cdd(g_fbl, cfg_comm)
score_cas = 1 .- cas(g_fbl, cfg_comm)
score_pc = pc(g_fbl, cfg_comm)

ROC_cdd = roc_curve(anomaly, score_cdd)
ROC_cas = roc_curve(anomaly, score_cas)
ROC_pc = roc_curve(anomaly, score_pc)
AUC_cdd = roc_auc_score(anomaly, score_cdd)
AUC_cas = roc_auc_score(anomaly, score_cas)
AUC_pc = roc_auc_score(anomaly, score_pc)

PyPlot.plot(ROC_pc[1], ROC_pc[2], label="pc,  AUC=$(round(AUC_pc,digits=3))")
PyPlot.plot(ROC_cdd[1], ROC_cdd[2], label="cdd, AUC=$(round(AUC_cdd,digits=3))")
PyPlot.plot(ROC_cas[1], ROC_cas[2], label="cas, AUC=$(round(AUC_cas,digits=3))")
legend()
xlabel("False positive rate (FPR)", fontsize=14)
ylabel("True positive rate (TPR)", fontsize=14);

In [None]:
## build dataframe with community-based features
## long format: the three measures are stacked in the order pc, cas, cdd, so the
## anomaly flags are repeated three times to line up with the stacked values
fbl_df = DataFrame(
    measure=[fill("pc", fbl_n); fill("cas", fbl_n); fill("cdd", fbl_n)],
    value=[pc(g_fbl, cfg_comm); cas(g_fbl, cfg_comm); cdd(g_fbl, cfg_comm)],
    anomaly=ifelse.(repeat(anomaly, 3) .== 1, "anomaly", "non-anomaly")
)

## boxplots
## one group of boxes per measure, split into anomalous / non-anomalous nodes
groupedboxplot(
    fbl_df.measure,
    fbl_df.value,
    group=fbl_df.anomaly,
    bar_width=0.7,
    xlabel="Measure",
    ylabel="Value")

# ABCD with varying $\xi$ - Illustration

The cell below is for illustration purpose only, to show some small ABCD graphs with different $\xi$ (noise) parameters.

* notice the density of edges between communities as $\xi$ increases.


In [None]:
## ABCD with varying community strength (xi)
n = 100

## degrees
gamma = 2.5
delta = 5
Delta = 15

## communities
beta = 1.5
s = 25
S = 50

## one graph per noise level, stored as (graph, community labels)
XIs = [0.05, 0.15, 0.33, 0.5]
g_abcds_xi = []
for xi in XIs
    ## the seed is reset before every graph is generated
    Random.seed!(42)
    deg_seq = ABCDGraphGenerator.sample_degrees(gamma, delta, Delta, n, 1000)
    comm_sizes = ABCDGraphGenerator.sample_communities(beta, s, S, n, 1000)
    params = ABCDGraphGenerator.ABCDParams(deg_seq, comm_sizes, nothing, xi, false, false, false)
    abcd_edge_set, labels = ABCDGraphGenerator.gen_graph(params)
    graph = SimpleGraph(n)
    foreach(e -> add_edge!(graph, e...), abcd_edge_set)
    push!(g_abcds_xi, (graph, labels))
end

In [None]:
## 2 x 2 grid of plots, filled row by row, one panel per value of ξ
fig = Makie.Figure(size=(800, 600))
for i in eachindex(XIs)
    ## panel position: i = 1, 2 go to row 1, i = 3, 4 to row 2
    panel_row, panel_col = fldmod1(i, 2)
    graph, labels = g_abcds_xi[i]
    ax, plt = graphplot(
        fig[panel_row, panel_col],
        graph,
        node_strokewidth=1,
        node_size=12,
        node_color=labels,
        node_attr=(colormap=:Greys,),
        edge_color=:lightgray
    )
    hidedecorations!(ax)
    hidespines!(ax)
    ax.title = "ξ = $(XIs[i]) "
end
fig

# ABCD with varying $\xi$ -- Experiments

Here we show a typical way to compare graph clustering algorithms using benchmark graphs. 
We pick some model, here ABCD, and we vary the noise parameter $\xi$. 
With ABCD, the larger $\xi$ is, the closer we are to a random Chung-Lu or configuration model graph (i.e. where only the degree distribution matters). For $\xi=0$, we get pure communities (all edges are internal).

For each choice of $\xi$, we generate 30 graphs, apply several different clustering algorithms,
and compute AMI (adjusted mutual information) for each algorithm, comparing with ground-truth communities.

The code below is commented out as it can take a while to run; saved results are included in the Data directory. To re-run from scratch, uncomment the cell below.

Parameters for the ABCD benchmark graphs are:

* $n=1,000$
* $\gamma=2.5$
* $\tau=1.5$
* degree range [10,50]
* community size range [50,100]
* $0.3 \le \xi \le 0.8$


We plot the results below. 
We see good results with Leiden and Infomap, and slightly better results with ECG.
Label propagation is a fast algorithm, but it does collapse with moderate to high levels of noise.

From the standard deviation plot, we see high variability around the value(s) for $\xi$ where the different
algorithms start to collapse. We see that this happens later and at a smaller scale with ECG, which is known to have good stability.

ECG and Leiden are often good choices for *unweighted* graphs while for *weighted* graphs, Leiden is usually a good option.

Such studies are useful to compare algorithms; using benchmarks, we can directly control parameters such as the noise level.

Uncomment the cell below to re-run all the experiments (can take 4-5 minutes), otherwise we load the results in the next cell.


In [None]:
# ## common ABCD graph parameters

# n = 1000

# ## degrees
# gamma = 2.5
# delta = 10
# Delta = 50

# ## communities
# beta = 1.5
# s = 50
# S = 100

# ## generate the graphs and run various clustering algorithms
# Random.seed!(42)
# REP = 30
# L = Vector{Float64}[]
# for xi in .3:.02:.801
#     println(xi)
#     for rep in 1:REP
#         ## NOTE(review): the graph-generation step (G, clusters) was missing from this
#         ## cell; the lines below mirror the stability-study cell further down -- confirm
#         degs = ABCDGraphGenerator.sample_degrees(gamma, delta, Delta, n, 1000)
#         coms = ABCDGraphGenerator.sample_communities(beta, s, S, n, 1000)
#         p = ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false, false)
#         abcd_edges, clusters = ABCDGraphGenerator.gen_graph(p)
#         G = ig.Graph()
#         G.add_vertices(n)
#         G.add_edges([(src - 1, dst - 1) for (src, dst) in abcd_edges])
#         v = [xi, AMI(G.community_leiden(objective_function="modularity").membership, clusters),
#              AMI(G.community_ecg(ens_size=16, final="leiden").membership, clusters),
#              AMI(G.community_infomap().membership, clusters),
#              AMI(G.community_label_propagation().membership, clusters)
#             ]
#         push!(L,v)
#     end
# end
# df = DataFrame(hcat(L...)',["xi","leiden","ecg","infomap","lp"])

# serialize(datadir*"ABCD/abcd_python_study_jl.ser", df)


In [None]:
## load data generated with the code from above cell
## (one row per generated graph: xi, then the AMI of leiden, ecg, infomap and lp)
df = deserialize(datadir * "ABCD/abcd_python_study_jl.ser");

In [None]:
## mean AMI per value of ξ, one curve per algorithm
D = combine(groupby(df, "xi"), Not(:xi) .=> mean)
for (column, linestyle, name) in (
    ("ecg_mean", "-", "ECG"),
    ("leiden_mean", "--", "Leiden"),
    ("infomap_mean", "-.", "Infomap"),
    ("lp_mean", ":", "Label Prop."),
)
    PyPlot.plot(D.xi, D[!, column], linestyle, label=name, color=:black)
end
xlabel("ABCD noise (ξ)", fontsize=14)
ylabel("AMI", fontsize=14)
legend();

In [None]:
## standard deviation of the AMI per value of ξ, one curve per algorithm
D = combine(groupby(df, "xi"), Not(:xi) .=> std)
for (column, linestyle, name) in (
    ("ecg_std", "-", "ECG"),
    ("leiden_std", "--", "Leiden"),
    ("infomap_std", "-.", "Infomap"),
    ("lp_std", ":", "Label Prop."),
)
    PyPlot.plot(D.xi, D[!, column], linestyle, label=name, color=:black)
end
xlabel("ABCD noise (ξ)", fontsize=14)
ylabel("Standard Deviation (AMI)", fontsize=14)
legend();

## Compare stability 

This study is similar to the previous one, but this time we run each algorithm **twice** on each graph, and we compare the similarity for each such pair of partitions, instead of comparing with the ground truth.

Thus we look at the **stability** of the algorithms. 

Note that an algorithm can be stable, but still be bad (ex: always cluster all nodes in a single community).

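The code below can take a while to run (about 10 minutes); a serialized file with the results is included in the data directory and is loaded by the `deserialize` cell further down, so the study cell only needs to be run to regenerate the results from scratch.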


In [None]:
## same graph - mean AMI between successive runs
## takes about 10 min to run with REP = 30

## ABCD parameters of the study, set explicitly so that this cell does not silently
## reuse whatever values were last left in the session (e.g. the n = 100 settings
## of the illustration cell)
n = 1000
## degrees
gamma = 2.5
delta = 10
Delta = 50
## communities
beta = 1.5
s = 50
S = 100

Random.seed!(42)
L = Vector{Float64}[]
REP = 30
for xi in 0.3:0.02:0.801
    println(xi)
    for rep in 1:REP
        ## one result row: xi followed by one AMI per algorithm
        V = [xi]
        degs = ABCDGraphGenerator.sample_degrees(gamma, delta, Delta, n, 1000)
        coms = ABCDGraphGenerator.sample_communities(beta, s, S, n, 1000)
        p = ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false, false)
        edges, clusters = ABCDGraphGenerator.gen_graph(p)

        ## igraph vertex ids are 0-based
        G = ig.Graph()
        G.add_vertices(n)
        G.add_edges([(src - 1, dst - 1) for (src, dst) in edges])
        ## idx[v+1] is the id that vertex v of G receives in the permuted copy
        idx = shuffle(0:G.vcount()-1)
        idx_range = 1:length(idx)
        ## same graph - permute vertices
        Gp = ig.Graph.Erdos_Renyi(n=G.vcount(), p=0)
        for e in G.es()
            Gp.add_edge(idx[e.source+1], idx[e.target+1])
        end
        ## for each algorithm: cluster G and its permuted copy, map the labels of the
        ## copy back to the vertex order of G, and compare the two partitions
        x = Gp.community_leiden(objective_function="modularity").membership
        push!(V, AMI(G.community_leiden(objective_function="modularity").membership, [x[idx[i]+1] for i in idx_range]))

        x = Gp.community_ecg(ens_size=16, final="leiden").membership
        push!(V, AMI(G.community_ecg(ens_size=16, final="leiden").membership, [x[idx[i]+1] for i in idx_range]))

        x = Gp.community_infomap().membership
        push!(V, AMI(G.community_infomap().membership, [x[idx[i]+1] for i in idx_range]))

        x = Gp.community_label_propagation().membership
        push!(V, AMI(G.community_label_propagation().membership, [x[idx[i]+1] for i in idx_range]))

        push!(L, V)
    end
end
df = DataFrame(hcat(L...)', ["xi", "leiden", "ecg", "infomap", "lp"])

# save results
serialize(datadir * "ABCD/abcd_python_study_stability_jl.ser", df)

We plot the results below. The behaviour of algorithms can be clustered in two groups:

* For Leiden and ECG, stability is excellent and degrades gradually for high noise level.
* For Infomap and Label Propagation, stability is also good until the noise value where the results start to degrade, as we saw in the previous study. We see near perfect stability for very high noise values; those are values where the results were very bad in the previous study; this typically happens when the algorithm can't get any good clustering and returns some trivial partition, such as putting all nodes together in the same community, thus a stable but bad result.


In [None]:
## load results
## (one row per generated graph: xi, then the AMI between two runs of leiden, ecg, infomap and lp)
df = deserialize(datadir * "ABCD/abcd_python_study_stability_jl.ser");

In [None]:
## mean AMI between two runs per value of ξ, one curve per algorithm
D = combine(groupby(df, "xi"), Not(:xi) .=> mean)
for (column, linestyle, name) in (
    ("ecg_mean", "-", "ECG"),
    ("leiden_mean", "--", "Leiden"),
    ("infomap_mean", "-.", "Infomap"),
    ("lp_mean", ":", "Label Prop."),
)
    PyPlot.plot(D.xi, D[!, column], linestyle, label=name, color=:black)
end
xlabel("ABCD noise (ξ)", fontsize=14)
ylabel("AMI between successive runs", fontsize=14)
legend();

### Other clustering algorithms

Below, we show several other clustering algorithms available in igraph and how to run them.


In [None]:
seed = 123
Random.seed!(seed)

## generate ABCD graph
n = 1000
xi = 0.5
## degrees
gamma = 2.5
delta = 10
Delta = 50
## communities
beta = 1.5
s = 50
S = 100
degs = ABCDGraphGenerator.sample_degrees(gamma, delta, Delta, n, 1000)
coms = ABCDGraphGenerator.sample_communities(beta, s, S, n, 1000)
p = ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false, false)
abcd_edges, clusters = ABCDGraphGenerator.gen_graph(p)

## igraph copy of the graph (vertex ids shifted to 0-based)
G = ig.Graph()
G.add_vertices(n)
G.add_edges([(src - 1, dst - 1) for (src, dst) in abcd_edges])
println("Number of ground-truth communities: $(maximum(clusters))")

## Run several algorithms
## for each one: number of communities found (largest 0-based label + 1)
## and AMI against the ground-truth communities

## CNM
partition = G.community_fastgreedy().as_clustering().membership
println("CNM: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")
## Louvain
partition = G.community_multilevel().membership
println("Louvain: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")
## Leiden
partition = G.community_leiden(objective_function="modularity").membership
println("Leiden: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")
## ECG
partition = G.community_ecg().membership
println("ECG: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")
## Infomap
partition = G.community_infomap().membership
println("Infomap: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")
## LP
partition = G.community_label_propagation().membership
println("Label propagation: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")
## LE
partition = G.community_leading_eigenvector().membership
println("Leading eigenvector: Number of communities: $(maximum(partition)+1) AMI: $(AMI(clusters,partition))")


# Modularity, resolution limit and rings of cliques

We illustrate issues with modularity with the famous ring of cliques example.

For example below, we have a ring of 3-cliques connected by a single (inter-clique) edge.

In [None]:
## print the (i, j, k) index triples visited by the nested clique loops:
## i is the clique, (j, k) with j < k is a pair of 0-based vertex ids inside it;
## the last value of j, s*(i+1), has an empty k-range and prints nothing
n = 3
s = 3
for i in 0:n-1
    for j in s*i:s*(i+1)
        for k in j+1:s*(i+1)-1
            println("i", i, "j", j, "k", k)
        end
    end
end

In [None]:
## build a RoC with n cliques of size s
"""
    ringOfCliques(n::Int, s::Int)

Return an igraph graph on `n * s` vertices (0-based ids) made of `n` cliques of
`s` consecutive vertices each, where the last vertex of every clique is joined
to the first vertex of the next one and the last clique is joined back to
vertex 0.

All edges are collected first and inserted with a single `add_edges` call, in
the same order in which they were previously added one by one; inserting edges
individually is much slower in igraph.
"""
function ringOfCliques(n::Int, s::Int)
    edge_list = Tuple{Int,Int}[]
    ## cliques: every pair j < k inside the block of clique i
    for i in 0:n-1
        last_v = s * (i + 1) - 1
        for j in s*i:last_v
            for k in j+1:last_v
                push!(edge_list, (j, k))
            end
        end
    end
    ## ring: clique i is linked to clique i-1; clique 0 is linked to the last clique
    for i in 0:n-1
        push!(edge_list, i > 0 ? (s * i - 1, s * i) : (n * s - 1, 0))
    end
    ## start from an edgeless graph on n*s vertices, then add everything at once
    roc = ig.Graph.Erdos_Renyi(n=n * s, p=0)
    roc.add_edges(edge_list)
    return roc
end

## Example: a ring of 10 cliques of size 3, drawn on a small matplotlib axis
roc = ringOfCliques(10, 3)
fig, ax = subplots(figsize=(4, 4))
ig.plot(roc; target=ax, vertex_size=8, vertex_color="dimgray",
    edge_color="lightgray", edge_width=1);

We compare the number of cliques (the natural parts in a partition) with the actual number of communities found via modularity based algorithms Leiden and ECG.

With Leiden as is (i.e. resolution parameter 1.0), we see a smaller number of communities than the number of cliques; 
this is a known problem with modularity: merging cliques into the same community often leads to higher modularity.

One way to alleviate this issue is to increase the **resolution parameter**; with resolution parameter 5 instead of the default value of 1, we see that we can recover 
the cliques in most cases. 

A consensus algorithm like ECG can help a lot in such cases; below we see that all the cliques are correctly recovered.


In [None]:
## Compare number of cliques and number of clusters found
## for rings of n = 3, 6, ..., 48 cliques of size s, count the communities found by
## Leiden (default resolution), Leiden (resolution 5) and ECG;
## community labels start at 0, so the count is the largest label + 1
L = Vector{Int}[]  ## one row per ring: [n, Leiden (res.=5), Leiden (default), ECG]
s = 3
for n in 3:3:50
    roc = ringOfCliques(n, s)
    le = maximum(roc.community_leiden(objective_function="modularity").membership) + 1
    ler = maximum(roc.community_leiden(objective_function="modularity", resolution=5.0).membership) + 1
    ec = maximum(roc.community_ecg().membership) + 1
    push!(L, [n, ler, le, ec])
end
## stack the rows into a (rings x 4) matrix without splatting the whole collection
D = DataFrame(permutedims(reduce(hcat, L)), ["n", "Leiden_res", "Leiden", "ECG"])
PyPlot.figure(figsize=(8, 6))
PyPlot.plot(D.n, D.Leiden, ":o", color=:black, label="Leiden (default)")
PyPlot.plot(D.n, D.Leiden_res, "--o", color=:black, label="Leiden (res.=5)")
PyPlot.plot(D.n, D.ECG, "-o", color=:black, label="ECG")
xlabel("number of $(s)-cliques", fontsize=14)
ylabel("number of clusters found", fontsize=14)
legend(fontsize=14);

Let us look at a specific example: 10 cliques of size 3. Below we plot the communities found with Leiden; we clearly see that several **pairs** of cliques are merged into a single community.

In [None]:
## Leiden communities with 10 3-cliques
## nodes are shaded with one of 3 colors, picked by community label modulo 3
roc = ringOfCliques(10, 3)
roc_comms = roc.community_leiden(objective_function="modularity").membership
fig, ax = subplots(figsize=(4, 4))
ig.plot(roc; target=ax,
    vertex_size=8,
    vertex_color=[("white", "gray", "dimgray")[mod(c, 3) + 1] for c in roc_comms],
    edge_color="lightgray",
    edge_width=1);

Why does ECG solve this problem? It is due to its first step, where we run an ensemble of level-1 Louvain and assign new weights to edges based on the proportion of times those edges are internal to a community.

We see below that there are exactly 30 edges with maximal edge weight of 1 (edges within cliques) and 10 edges with default minimal weight of 0.05 (edges between cliques). 

With those new weights, the last clustering in ECG can easily recover the cliques as communities.


In [None]:
## ECG weights in this case: all 30 clique edges have max score
## (tally of the distinct edge weights W returned by ECG)
roc.community_ecg().W |> countmap

# Ego nets and more

Suppose we want to look at the nodes that are *near* some seed node $v$.

One common way to do this is to look at its ego-net, i.e. the subgraph consisting of node $v$ and all other nodes that can be reached from $v$ in $k$ hops or less, where $k$ is small, typically 1 or 2. 

Such subgraphs can become large quickly as we increase $k$. In the cells below, we look at ego-nets and compare with another approach to extract subgraph(s) around $v$ via clustering.

We consider the **airport** graph we already saw several times. We consider a simple, undirected version (no loops, directions or edge weights).

We compare ego-nets (1 and 2-hops subgraphs from a given node) with clusters obtained via graph clustering for some vertex $v$ (airport 'MQT') which has degree 11 (you can try other vertices).

In [None]:
## read edges and build simple undirected graph
## airports are numbered 1..n in order of first appearance
## (origin column first, then destination column)
D = CSV.read(datadir * "Airports/connections.csv", DataFrame)
all_airports = unique(vcat(D.orig_airport, D.dest_airport))
n = length(all_airports)
node_map = Dict(code => idx for (idx, code) in enumerate(all_airports))
g = SimpleGraph(n)
for (orig, dest) in zip(D.orig_airport, D.dest_airport)
    src, tgt = node_map[orig], node_map[dest]
    ## skip self-loops
    src == tgt || add_edge!(g, src, tgt)
end

## read vertex attributes
A = CSV.read(datadir * "Airports/airports_loc.csv", DataFrame)

## pick a vertex v ('MQT' airport), print its degree and show its attributes
v = node_map["MQT"]
println("vertex degree: $(degree(g, v))")
A[A.airport.=="MQT", :]

In [None]:
## show its ego-net for k=1 (vertex v shown in black)
## the first vertex of the ego-net is colored black, all the others gray
sg = egonet(g, v, 1)
println("$(nv(sg)) nodes")
clean_graphplot(sg;
    node_strokewidth=1,
    node_size=12,
    node_color=[u == 1 ? :black : :gray for u in 1:nv(sg)],
    edge_color=:lightgray
)

In [None]:
## show its 2-hops ego-net (restricted to its 2-core) ... this is already quite large!
sg = egonet(g, v, 2)
## Keep only the vertices with core number >= 2. They are selected in a single step
## with induced_subgraph: removing vertices one at a time with rem_vertex! renumbers
## the graph after each removal (the last vertex takes over the removed index), so a
## precomputed list of indices no longer points at the intended vertices.
core_vertices = findall(>=(2), core_number(sg))
sg = first(induced_subgraph(sg, core_vertices))
println("$(nv(sg)) nodes")
## vertex v is taken to be vertex 1 of the ego-net (the same assumption as in the
## 1-hop plot); it is drawn in black when it belongs to the 2-core
clean_graphplot(sg,
    node_strokewidth=1,
    node_size=12,
    node_color=[u == 1 ? :black : :gray for u in core_vertices],
    edge_color=:lightgray
)

In [None]:
## apply clustering and show the cluster containing the selected vertex
## recall that we ignore edge weights
## the graph is rebuilt with igraph from the connection table, then made undirected and simplified
g = ig.Graph.TupleList(Tuple.(eachrow(D)), directed=true).as_undirected().simplify()

## seed both Python RNGs before running ECG
random.seed(12345)
np.random.seed(12345)
ec = g.community_ecg(ens_size=32)
## store the ECG edge weights as edge attribute "W"
set!(g.es, "W", ec.W)
## igraph indices start at 0 and Julia vectors at 1: hence the +1 / -1 shifts below
m = ec.membership[g.vs.find("MQT").index+1]
sg = g.subgraph(findall(==(m), ec.membership) .- 1)
## display the 2-core
sg.delete_vertices(findall(<(2), sg.coreness()) .- 1)
println("$(sg.vcount()) nodes")
## every vertex is gray except MQT, which is black
ecg_colors = let mqt = sg.vs.find("MQT").index + 1
    [u == mqt ? "black" : "gray" for u in 1:sg.vcount()]
end
set!(sg.vs, "color", ecg_colors)
fig, ax = subplots(figsize=(4, 4))
ig.plot(sg; target=ax, vertex_size=8, edge_color="lightgray", edge_width=1);

We see above that the cluster containing $v$ is smaller than the 2-hop ego-net, and that several of its nodes are tightly connected.

Below we go further and look at the ECG edge weights, which we can use to prune the graph above, so we can look at the nodes most tightly connected to node $v$.

You can adjust the threshold below to get different zoomings.


In [None]:
## filter edges w.r.t. ECG votes (weights)
thresh = 0.85

## subgraph made of the edges whose ECG weight exceeds the threshold
tmp = sg.subgraph_edges([e for e in sg.es if get(e, "W") > thresh])
## connected component of MQT in that subgraph (Julia position = igraph index + 1)
comps = tmp.connected_components().membership
n = tmp.vs.find("MQT").index + 1
cl = comps[n]
ssg = tmp.subgraph(findall(==(cl), comps) .- 1)
## drop the vertices with coreness below 2
ssg.delete_vertices(findall(<(2), ssg.coreness()) .- 1)
println("$(ssg.vcount()) nodes")
fig, ax = subplots(figsize=(4, 4))
ig.plot(ssg; target=ax, vertex_size=8, edge_color="lightgray", edge_width=1);

Most nodes in this subgraph are from the same state as node $v$ (MI) or from a nearby state (WI).

In [None]:
## states in the above subgraph
## airport => state lookup, then tally the states of the vertices of ssg
name_state_map = Dict(A.airport .=> A.state)
countmap(map(code -> name_state_map[code], get(ssg.vs, "name")))

## Measures to compare partitions - Figure 5.13

* We illustrate the importance of using proper adjusted measures when comparing partitions; this is why we use AMI (adjusted mutual information) or ARI (adjusted Rand index) in our experiments
* We generate an ABCD graph and compare its ground-truth partition with **random** partitions with different numbers of parts
* Scores for random partitions should be close to 0 regardless of the number of parts

In [None]:
## ABCD example
## NOTE(review): parameter meanings below follow the ABCD model's usual conventions;
## confirm against the ABCDGraphGenerator documentation
n = 1000    ## number of nodes
xi = 0.1    ## presumably the ABCD noise (mixing) parameter

## degrees
gamma = 2.5  ## presumably the power-law exponent of the degree distribution
delta = 5    ## presumably the minimum degree
Delta = 50   ## presumably the maximum degree

## communities
beta = 1.5   ## presumably the power-law exponent of the community-size distribution
s = 75       ## presumably the minimum community size
S = 150      ## presumably the maximum community size

Random.seed!(42)
degs = ABCDGraphGenerator.sample_degrees(gamma, delta, Delta, n, 1000)
coms = ABCDGraphGenerator.sample_communities(beta, s, S, n, 1000)
p = ABCDGraphGenerator.ABCDParams(degs, coms, nothing, xi, false, false, false)
abcd_edges, clusters = ABCDGraphGenerator.gen_graph(p)

## build the igraph version of THIS graph (igraph ids start at 0, hence the -1 shift):
## the graph-aware measures computed afterwards call methods of G, which must
## describe the same graph as `clusters` rather than a graph left over from an
## earlier cell
G = ig.Graph()
G.add_vertices(n)
G.add_edges([(src - 1, dst - 1) for (src, dst) in abcd_edges])
println("number of communities: $(maximum(clusters))")

In [None]:
## generate random clusterings and compute various measures w.r.t. ground truth
## this can take a few minutes to run
MI = pyimport("sklearn.metrics").mutual_info_score
RI = pyimport("sklearn.metrics").rand_score
ARI = pyimport("sklearn.metrics").adjusted_rand_score
NMI = pyimport("sklearn.metrics").normalized_mutual_info_score

REP = 10 ## we used 100 for the textbook

L = Vector{Float64}[]
gt = clusters
n = G.vcount()
## ground truth as a (0-based node id => community) dictionary, as passed to G.gam
tc = Dict(zip(0:length(gt)-1, gt))
ar = 2:21
## REP random partitions for each number of parts s
for s in ar, _ in 1:REP
    ## each of the n nodes gets a part drawn from 1:s
    r = sample(1:s, n)
    rd = Dict(zip(0:length(r)-1, r))
    push!(L, [
        s,
        MI(gt, r), NMI(gt, r), AMI(gt, r),
        RI(gt, r), ARI(gt, r),
        G.gam(tc, rd, "rand", false), G.gam(tc, rd),
    ])
end
## one row per random partition, then the mean of every measure for each size
df = DataFrame(hcat(L...)', ["size", "MI", "NMI", "AMI", "RI", "ARI", "GRI", "AGRI"])
df_avg = combine(groupby(df, "size"), Not(:size) .=> mean)

Below we show results for 3 measures:
* Mutual information (MI), which has strong bias w.r.t. number of clusters
* Normalized MI (NMI) is better
* AMI is best, no bias w.r.t. number of clusters.

In [None]:
## Mutual information (MI), normalized MI (NMI) and adjusted MI (AMI)
## one black curve per measure: (column of df_avg, line style, legend label)
for (column, linestyle, name) in (
    ("MI_mean", ":", "MI"),
    ("NMI_mean", "--", "NMI"),
    ("AMI_mean", "-", "AMI"),
)
    PyPlot.plot(ar, df_avg[!, column], linestyle, color="black", label=name)
end
xlabel("number of random clusters", fontsize=14)
legend();

Same below for Rand index (RI) and adjusted version. 

GRI (graph RI) and AGRI (adjusted GRI) are variations of RI specifically for graph data.

In [None]:
## Rand index (RI) and adjusted version (ARI)
## Also: graph-aware Rand index (GRI) and adjusted version (AGRI)
## those measures are included in "partition-igraph"; their inputs are partitions of
## type "igraph.clustering.VertexClustering" or dictionaries of node:community.
## one black curve per measure: (column of df_avg, line style, legend label)
for (column, linestyle, name) in (
    ("RI_mean", ":", "RI"),
    ("GRI_mean", "--", "GRI"),
    ("ARI_mean", "-", "ARI/AGRI"),
)
    PyPlot.plot(ar, df_avg[!, column], linestyle, color="black", label=name)
end
## AGRI uses the same line style as ARI and has no legend entry of its own
PyPlot.plot(ar, df_avg.AGRI_mean, "-", color="black")
xlabel("number of random clusters", fontsize=14)
legend();