# Chapter 4 - Degree Correlations

We explore various notions of degree correlation (**assortativity, disassortativity**) and related measures.

As with the previous notebooks, make sure to set the data directory properly in the next cell

In [None]:
# Root folder of the datasets read later in this notebook; file paths are built by
# string concatenation (`datadir * "Airports/..."`), so keep the trailing slash.
datadir = "../Datasets/"

In [None]:
using Graphs
using DataFrames
using CSV
using Statistics
using PyCall
using PyPlot
using Chain
using StatsBase
using GraphPlot
using Random

## Some useful functions

* **deg_corr**: compute $k_{nn}(l)$ for all degrees $l$, as defined in section 4.3 of the book (undirected)
* **deg_corr_directed**: compute $k_{nn}^{mode1,mode2}(l)$ for all degrees $l$, as defined in section 4.5 of the book; mode1 and mode2 can be 'indegree', or 'outdegree'
* **deg_corr_neutral**: computes degree correlations for a neutral graph with the same degree distribution as the input graph
* **corr_exp**: compute the correlation exponent via linear regression (taking logs) as in section 4.3 of the book
* **rich_club**: compute the rich club coefficient (see section 4.3 of the book) for a given degree $l$
* **assortativity_directed**: compute the degree assortativity of a directed graph for any combination of 'indegree' and 'outdegree' at the source and target of each edge


In [None]:
# undirected
"""
    deg_corr(G::Graph)

Return a `Dict` mapping every degree `l` occurring in `G` to `k_nn(l)`: the mean
degree over all neighbours of all vertices whose degree is `l`.

A degree class without any neighbour (degree 0) maps to `NaN`, the mean of an
empty collection.
"""
function deg_corr(G::Graph)
    degs = degree(G)
    distinct = collect(Set(degs))
    slot = Dict(d => i for (i, d) in enumerate(distinct))

    # one bucket of neighbour degrees per distinct degree value
    buckets = [Int[] for _ in distinct]
    for v in vertices(G)
        append!(buckets[slot[degs[v]]], degs[neighbors(G, v)])
    end

    return Dict(distinct[i] => mean(buckets[i]) for i in 1:length(buckets))
end

In [None]:
"""
    deg_corr_df(G::SimpleGraph)

Return a `DataFrame` with one row per degree `l` occurring at an edge endpoint of
`G`, sorted by degree. Column `src_deg` holds `l` and column `dst_deg_mean` holds
`k_nn(l)`, the mean degree at the other end of the edges leaving degree-`l` vertices.
"""
function deg_corr_df(G::SimpleGraph)
    ends = DataFrame(edges(G))
    # each undirected edge has to be seen once from each of its endpoints
    append!(ends, select(ends, :dst => :src, :src => :dst))
    ends.src_deg = degree(G, ends.src)
    ends.dst_deg = degree(G, ends.dst)
    by_degree = groupby(ends, :src_deg, sort=true)
    return combine(by_degree, :dst_deg => mean)
end

In [None]:
## k_nn^{mode1,mode2}(l) : average mode2-degree of mode1-neighbours of nodes with mode1-degree = l
# normally mode1 and mode2 should be: degree, indegree or outdegree
"""
    deg_corr_directed(G, mode1::Function, mode2::Function)

For every edge `u -> v` of `G` take `mode1(G, u)` and `mode2(G, v)`, then return a
`DataFrame` (sorted by `src_deg`) with the mean `mode2` value (`dst_deg_mean`) for
each distinct `mode1` value (`src_deg`).
"""
function deg_corr_directed(G, mode1::Function, mode2::Function)
    arcs = DataFrame(edges(G))
    arcs.src_deg = [mode1(G, u) for u in arcs.src]
    arcs.dst_deg = [mode2(G, v) for v in arcs.dst]
    by_degree = groupby(arcs, :src_deg, sort=true)
    return combine(by_degree, :dst_deg => mean)
end

In [None]:
## degree correlation for neutral graphs: <k^2>/<k>
"""
    deg_corr_neutral(G, mode::Function=degree)

Return `<k^2>/<k>`, where `k` ranges over the values returned by `mode(G)`.
"""
function deg_corr_neutral(G, mode::Function=degree)
    degs = mode(G)
    second_moment = mean(degs .^ 2)
    first_moment = mean(degs)
    return second_moment / first_moment
end

In [None]:
## Correlation exponent via linear regression (taking logs)
"""
    corr_exp(G)

Return the slope of the least-squares line fitted to `log(k_nn(l))` against
`log(l)`, using the degree correlation table produced by `deg_corr_df(G)`.
"""
function corr_exp(G)
    knn = deg_corr_df(G)
    log_deg = log.(knn.src_deg)
    log_knn = log.(knn.dst_deg_mean)
    # design matrix: intercept column followed by the regressor
    design = hcat(ones(length(log_deg)), log_deg)
    coeffs = design \ log_knn
    return coeffs[2]
end

In [None]:
"""
    corr_exp_directed(G, mode1::Symbol, mode2::Symbol)

Directed counterpart of `corr_exp`: return the slope of the log-log regression of
the directed degree correlation function. `:in` selects `indegree`; any other
symbol selects `outdegree`. Rows with a zero degree or a zero mean are dropped
before taking logs.
"""
function corr_exp_directed(G,mode1::Symbol,mode2::Symbol)
    pick(mode) = mode == :in ? indegree : outdegree
    knn = deg_corr_directed(G, pick(mode1), pick(mode2))
    # log(0) is -Inf, so keep only strictly non-zero rows
    keep = (knn.src_deg .!= 0) .& (knn.dst_deg_mean .!= 0)
    knn = knn[keep, :]
    log_deg = log.(knn.src_deg)
    log_knn = log.(knn.dst_deg_mean)
    design = hcat(ones(length(log_deg)), log_deg)
    coeffs = design \ log_knn
    return coeffs[2]
end

In [None]:
"""
    rich_club(g, l=1)

Return the rich-club ratio `phi / phi_hat` for degree threshold `l`.

`phi` is the number of edges of the subgraph induced by the vertices of degree
`>= l`. `phi_hat` is `ne(g) * (s_l / s)^2`, where `s` is the mean degree and `s_l`
is the part of that mean contributed by vertices of degree `>= l`.

When no vertex has degree `>= l`, both quantities are zero and the result is `NaN`
(the previous version threw on an empty reduction in that case).
"""
function rich_club(g, l=1)
    degs = degree(g)            # computed once and reused below
    n = nv(g)
    counts = countmap(degs)     # degree value => number of vertices

    mean_deg = sum((k * ck / n for (k, ck) in pairs(counts)); init=0.0)
    rich_share = sum((k * ck / n for (k, ck) in pairs(counts) if k >= l); init=0.0)

    # sum(x * y for x in S, y in S) equals sum(S)^2; the closed form avoids the
    # quadratic double loop over distinct degrees
    phi_hat = rich_share^2 * ne(g) / mean_deg^2

    club = induced_subgraph(g, findall(>=(l), degs))[1]
    phi = ne(club)
    return phi / phi_hat
end

In [None]:
"""
    assortativity(G)

Return the degree correlation coefficient of `G`: the (uncorrected) covariance of
the degrees found at the two ends of every edge, divided by the product of their
(uncorrected) standard deviations. Each edge is counted in both directions.
"""
function assortativity(G)
    ends = DataFrame(edges(G))
    append!(ends, select(ends, :dst => :src, :src => :dst)) # add edge in reverse
    src_deg = degree(G, ends.src)
    dst_deg = degree(G, ends.dst)
    covariance = cov(src_deg, dst_deg, corrected=false)
    spread = std(src_deg, corrected=false) * std(dst_deg, corrected=false)
    return covariance / spread
end

In [None]:
"""
    assortativity_directed(G, mode1::Symbol, mode2::Symbol)

Return the correlation coefficient, taken over the directed edges `u -> v` of `G`,
between the `mode1`-degree of `u` and the `mode2`-degree of `v`. `:in` selects the
in-degree; any other symbol selects the out-degree.
"""
function assortativity_directed(G, mode1::Symbol, mode2::Symbol)
    src_degs = mode1 == :in ? indegree(G) : outdegree(G)
    dst_degs = mode2 == :in ? indegree(G) : outdegree(G)

    # degree seen at the tail and at the head of every edge
    xs = [src_degs[src(e)] for e in edges(G)]
    ys = [dst_degs[dst(e)] for e in edges(G)]

    x_bar = mean(xs)
    y_bar = mean(ys)

    cross = sum((x - x_bar) * (y - y_bar) for (x, y) in zip(xs, ys))
    x_norm = sqrt(sum((x - x_bar)^2 for x in xs))
    y_norm = sqrt(sum((y - y_bar)^2 for y in ys))

    return cross / (x_norm * y_norm)
end

In [None]:
# Configuration-model style generator that returns a SimpleGraph on length(ds)
# vertices built from the degree sequence `ds` (ds[i] = requested degree of vertex i).
#
# Stubs are paired at random; pairs that would be self-loops or repeated edges are
# set aside in `recycle` and repaired by swapping endpoints with other pairs.
# If a full pass over `recycle` makes no progress the loop gives up, and the pairs
# still in `recycle` are left out of the graph, so the result can have fewer edges
# than sum(ds) / 2.
#
# Uses the global RNG (shuffle!/rand without an explicit rng).
function cm_simple(ds)
    @assert iseven(sum(ds))
    # vertex i appears ds[i] times; consecutive entries of the shuffled list are paired
    stubs = reduce(vcat, fill(i, ds[i]) for i in 1:length(ds))
    shuffle!(stubs)
    local_edges = Set{Tuple{Int,Int}}()   # accepted edges, stored as (min, max)
    recycle = Tuple{Int,Int}[]            # rejected pairs: self-loops and duplicates
    for i in 1:2:length(stubs)
        e = minmax(stubs[i], stubs[i+1])
        if (e[1] == e[2]) || (e in local_edges)
            push!(recycle, e)
        else
            push!(local_edges, e)
        end
    end

    # resolve self-loops and duplicates
    # recycle_counter allows one attempt per pair currently in `recycle`; when it runs
    # out, continue only if `recycle` shrank since the last check
    last_recycle = length(recycle)
    recycle_counter = last_recycle
    while !isempty(recycle)
        recycle_counter -= 1
        if recycle_counter < 0
            if length(recycle) < last_recycle
                last_recycle = length(recycle)
                recycle_counter = last_recycle
            else
                break
            end
        end
        p1 = popfirst!(recycle)
        # probability of drawing the swap partner from `recycle` rather than from the
        # accepted edges
        from_recycle = 2 * length(recycle) / length(stubs)
        success = false
        for _ in 1:2:length(stubs)
            p2 = if rand() < from_recycle
                used_recycle = true
                recycle_idx = rand(axes(recycle, 1))
                recycle[recycle_idx]
            else
                used_recycle = false
                rand(local_edges)
            end
            # re-pair the four endpoints of p1 and p2 in one of the two possible ways
            if rand() < 0.5
                newp1 = minmax(p1[1], p2[1])
                newp2 = minmax(p1[2], p2[2])
            else
                newp1 = minmax(p1[1], p2[2])
                newp2 = minmax(p1[2], p2[1])
            end
            # the swap is valid only if both new pairs are distinct, loop-free and new
            if newp1 == newp2
                good_choice = false
            elseif (newp1[1] == newp1[2]) || (newp1 in local_edges)
                good_choice = false
            elseif (newp2[1] == newp2[2]) || (newp2 in local_edges)
                good_choice = false
            else
                good_choice = true
            end
            if good_choice
                # remove the partner pair from wherever it was taken
                if used_recycle
                    recycle[recycle_idx], recycle[end] = recycle[end], recycle[recycle_idx]
                    pop!(recycle)
                else
                    pop!(local_edges, p2)
                end
                success = true
                push!(local_edges, newp1)
                push!(local_edges, newp2)
                break
            end
        end
        # no valid swap found: put the pair back at the end of the queue
        success || push!(recycle, p1)
    end
    g = SimpleGraph(length(ds))
    for e in local_edges
        add_edge!(g, e...)
    end
    return g
end

# Assortativity in geometric and uniform random graphs
## Figures 4.1 - 4.3 in the book

We consider two families of (undirected) random graphs we saw in Chapter 2, namely:
* random geometric graphs (we consider a torus by default) $RGG(n,r)$ where $n$ is the number of nodes, and $r$ the radius.
* Uniform or Erdos-Renyi graphs

With $n=1,000$, we consider a range of values for the radius $r$ for the RGGs, and for each $r$, we generate a uniform random graph with the same number of edges as the RGG.

We then plot the degree assortativity for the two random graph families as a function of the average degree.


In [None]:
## Degree assortativity of RGG vs. uniform random graphs over a range of radii
n = 1000
# NOTE(review): despite its name, `torus` holds :open and is passed as the `bc`
# keyword below, while the text above says a torus is used -- confirm which is intended.
torus = :open

Random.seed!(123)
L = []  # one [assortativity RGG, assortativity uniform, average degree] entry per radius
for radius in 0.025:0.025:0.501
    g_RGG, _, _ = euclidean_graph(n, 2, seed=123, cutoff=radius, bc=torus)
    m = ne(g_RGG)
    g_ER = erdos_renyi(n, m)  # uniform random graph with the same number of edges
    push!(L, [assortativity(g_RGG), assortativity(g_ER), mean(degree(g_RGG))])
end
# one row per radius
df_rg = DataFrame(hcat(L...)', ["RGG", "Uniform", "avg_degree"])
first(df_rg, 5)

In [None]:
## plot degree assortativity vs average degree
## (one curve per random graph family, taken from df_rg computed above)
plot(df_rg.avg_degree, df_rg.RGG, ".-", label="RGG", color=:gray)
plot(df_rg.avg_degree, df_rg.Uniform, ".-", label="Uniform", color=:lightgray)
legend()
xlabel("average degree", fontsize=14)
ylabel("assortativity", fontsize=14);

From the above, we see high assortativity for the RGG graphs when the average degree is not a significant fraction of the number of nodes, which corresponds to a small radius in the model. For the uniform random graphs, assortativity is close to zero everywhere.

We can illustrate this by plotting $k_{nn}(l)$ for all degrees $l$,
We show the results for RGG and the corresponding uniform random graph (same number of edges) for two cases:
* $r=0.1$, which shows clear correlation for RGG
* $r=0.5$, where the correlation is not so clear for RGG (all nodes have high degree!)

In [None]:
## radius = 0.1
## k_nn(l) for an RGG and for a uniform random graph with the same number of edges
Random.seed!(123)
n = 1000
g_RGG = euclidean_graph(n, 2, seed=123, cutoff=0.1, bc=:open)[1]
m = ne(g_RGG)
g_ER = erdos_renyi(n, m)
println("average degree ", mean(degree(g_RGG)))

## plot for RGG
knn = deg_corr(g_RGG)
x = collect(keys(knn))
y = collect(values(knn))
r = deg_corr_neutral(g_RGG)  # reference level <k^2>/<k>
figure(figsize=(10, 4))
subplot(121)
scatter(x, y, c=:black)
hlines(y=r, xmin=minimum(x), xmax=maximum(x), linestyles=":", color=:lightgray)
xlabel(raw"degree ($\ell$)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12)
title("RGG with radius 0.1")

## plot for ER
knn = deg_corr(g_ER)
x = collect(keys(knn))
y = collect(values(knn))
r = deg_corr_neutral(g_ER)
subplot(122)
scatter(x, y, c=:black)
hlines(y=r, xmin=minimum(x), xmax=maximum(x), linestyles=":", color=:lightgray)
xlabel(raw"degree ($\ell$)", fontsize=12)
title("Uniform RG with same average degree");

In [None]:
## radius = 0.5
## same two panels as for radius 0.1, with a larger radius
Random.seed!(123)
n = 1000
g_RGG = euclidean_graph(n, 2, seed=123, cutoff=0.5, bc=:open)[1]
m = ne(g_RGG)
g_ER = erdos_renyi(n, m)
println("average degree ", mean(degree(g_RGG)))

## plot for RGG
knn = deg_corr(g_RGG)
x = collect(keys(knn))
y = collect(values(knn))
r = deg_corr_neutral(g_RGG)  # reference level <k^2>/<k>
figure(figsize=(10, 4))
subplot(121)
scatter(x, y, c=:black)
hlines(y=r, xmin=minimum(x), xmax=maximum(x), linestyles=":", color=:lightgray)
xlabel(raw"degree ($\ell$)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12)
title("RGG with radius 0.5")

## plot for ER
knn = deg_corr(g_ER)
x = collect(keys(knn))
y = collect(values(knn))
r = deg_corr_neutral(g_ER)
subplot(122)
scatter(x, y, c=:black)
hlines(y=r, xmin=minimum(x), xmax=maximum(x), linestyles=":", color=:lightgray)
xlabel(raw"degree ($\ell$)", fontsize=12)
title("Uniform RG with same average degree");

# Spatial Preferential Attachment (SPA) Graphs
## Figures 4.4, 4.6 and Table 4.5 in the book

RGG do not have power law degree distribution. We consider another model that does - SPA.

We consider the SPA model generating directed graphs. In the experiment below, we generate all graphs
on a 2-dim unit square but we use **torus** distance. However for plotting, we drop the edges that *wrap around* the unit square, to get a nicer display.

We generate **directed** graphs as detailed in the book, namely:
* start with single node at time $t=1$ randomly placed on the square
* at time $t>1$, generate a new node $v_t$ randomly placed on the square and for each node $u$ already in the graph:
 * if $v_t$ is in the **sphere of influence** of node $u$, add a directed edge $v_t \rightarrow u$ with probability $p$
 
A node's sphere of influence is parametrized by the node's in-degree at time $t$, and two other parameters $A_1$ and $A_2$, as described in the book.

The in-degree distribution follows a power law with exponent $1+1/(p \cdot A_1)$

The expected average out-degree is $(p \cdot A_2)/(1-p \cdot A_1)$ (asymptotically)


In [None]:
"""
    torusDist(U, V)

Distance between the 2-d points `U` and `V` where, along each axis, a coordinate
gap larger than 0.5 is replaced by one minus that gap (wrap-around).
"""
function torusDist(U, V)
    wrap(gap) = gap > 0.5 ? 1 - gap : gap
    dx = wrap(abs(V[1] - U[1]))
    dy = wrap(abs(V[2] - U[2]))
    return sqrt(dx * dx + dy * dy)
end

"""
    squareDist(U, V)

Plain Euclidean distance between the 2-d points `U` and `V` (no wrap-around).
"""
function squareDist(U, V)
    gap_x = abs(V[1] - U[1])
    gap_y = abs(V[2] - U[2])
    squared = gap_x * gap_x + gap_y * gap_y
    return sqrt(squared)
end

# build a SPA graph given the parameters described above
"""
    buildSPA(n::Int, p::Real, A1::Real, A2::Real, seed::Int=12345)

Generate a directed Spatial Preferential Attachment graph on `n` nodes placed
uniformly at random in the unit square, using the torus distance.

Node `v` arrives at step `v`. For every earlier node `i` whose sphere of influence
(radius `r[i]`) contains `v`, the edge `v -> i` is added with probability `p`.
After each step the radii of the nodes present so far are recomputed from their
in-degree as `sqrt(min(1, (A1 * indegree + A2) / t) / pi)`, with `t` the number of
nodes present.

All random draws (positions and edge coin flips) come from one
`MersenneTwister(seed)`, so the result is fully determined by `seed`. Earlier
versions drew the coin flips from the global RNG.

Returns `(G, X)`, where `G` is the `DiGraph` and `X` is the `n x 2` coordinate matrix.
"""
function buildSPA(n::Int, p::Real, A1::Real, A2::Real, seed::Int=12345)
    ## random points
    rng = MersenneTwister(seed)
    X = rand(rng, n, 2)

    ## initialize new graph
    r = zeros(n)  # r[i] stays 0 until node i has arrived
    G = DiGraph(n)

    ## add vertices
    for v in 1:n
        # only nodes that arrived earlier can attract v; including v itself would
        # create self-loops
        for i in 1:(v - 1)
            if torusDist(view(X, v, :), view(X, i, :)) < r[i] && rand(rng) < p
                add_edge!(G, v, i)
            end
        end
        ## update the radii of the v nodes now present (only in-degrees can change)
        for i in 1:v
            r[i] = sqrt(min(1, (A1 * indegree(G, i) + A2) / v) / pi)
        end
    end
    return G, X
end

### Generate a SPA graph - build with the torus distance

Lower the number of nodes $n$ for a quicker run.


In [None]:
#Generate a SPA graph - build with the torus distance
n = 1000      # number of nodes
p = 0.667     # probability of adding an edge when inside a sphere of influence
A1 = 1        # in-degree coefficient of the sphere-of-influence size
A2 = 3        # constant term of the sphere-of-influence size
seed = 1234

# G_spa is the directed graph, X the n x 2 matrix of node coordinates
G_spa, X = buildSPA(n, p, A1, A2, seed)
print("number of edges: ", ne(G_spa))

### Plot the above graph - ignore edges that "wrap around" for nicer visualization

In [None]:
## keep only the edges that do not wrap around the unit square, i.e. those whose
## torus distance equals the plain Euclidean distance
E = Edge[]
for e in edges(G_spa)
    a = X[e.src, :]
    b = X[e.dst, :]
    if torusDist(a, b) == squareDist(a, b)
        push!(E, e)
    end
end

## plot with those edges only
## induced_subgraph on an edge list renumbers the vertices; vmap[i] is the original
## id of vertex i of G_sq, so the coordinates must be indexed through vmap
G_sq, vmap = induced_subgraph(G_spa, E)
gplot(G_sq, X[vmap, 1], X[vmap, 2],
    nodefillc="black",
    arrowlengthfrac=0.01,
    plot_size=(20cm, 20cm))

### Statistics for the SPA graph above

We compute the following:
* directed assortativity for the 4 different types $(a,b)$ where $a,b \in \{'in','out'\}$ corresponding respectively to the source and target node of each directed edge
* the corresponding correlation exponents
* undirected assortativity
* average and expected average out degree

We implemented our own function for directed assortativity.

In [None]:
## statistics for the SPA graph
## m1 is applied to the source and m2 to the target of every directed edge
for m1 in [:in, :out]
    for m2 in [:in, :out]
        println(
            "Directed assortativity of type (",
            m1,
            ",",
            m2,
            "): ",
            assortativity_directed(G_spa, m1, m2),
        )
        println(
            "Directed correlation exponent of type (",
            m1,
            ",",
            m2,
            "): ",
            corr_exp_directed(G_spa, m1, m2),
        )
    end
end
println("Undirected assortativity: ", assortativity(G_spa))
println("Average out degree: ", mean(outdegree(G_spa)))
## asymptotic value (p * A2) / (1 - p * A1) quoted in the text above
println("expected average out degree:", p * A2 / (1 - p * A1))

### Fitted vs expected power law exponent (in degrees)

We use the same function we saw in Chapter 2.


In [None]:
#Fitted vs expected power law exponent (in degrees)
d = indegree(G_spa)
powerlaw = pyimport("powerlaw");
# keep the fit in its own variable: `X` holds the node coordinates of the SPA graph
# and is still needed if the plotting cell above is run again
pl_fit = powerlaw.Fit(d, verbose=false)
println("Value of gamma: ", pl_fit.power_law.alpha)
println("Expected power law exponent:", 1 + 1 / (p * A1))

### Degree correlation function(s)

Set ```mode1``` and ```mode2``` to 'indegree' or 'outdegree' to generate the various plots.

In [None]:
## degree correlation
## mode1 is evaluated at the source and mode2 at the target of each edge
mode1 = outdegree
mode2 = indegree
knn = deg_corr_directed(G_spa, mode1, mode2)
scatter(knn.src_deg, knn.dst_deg_mean, c="black");
xlabel(L"degree ($\ell$)", fontsize=12)
ylabel(L"$k_{nn}(\ell)$", fontsize=12);

# Xulvi-Brunet and Sokolov (XBS) algorithm
## Figures 4.7 and 4.8 in the book

We show a smaller scale example here as compared to the book. (smaller value for $n$ and less repeats).

* Fig. 4.7: degree correlation function for various assortativity values, using XBS algorithm
* Fig. 4.8: order of the giant component vs expected average degree


In [None]:
## Naive Xulvi-Brunet and Sokolov algorithm

# Start from erdos_renyi(n, λ / n) and rewire pairs of edges until every edge has
# been "touched" at least once.
#   n            number of vertices
#   λ            target average degree (edge probability is λ / n)
#   q            probability that a rewiring step is degree-driven rather than random
#   assortative  true: pair the two lowest-degree and the two highest-degree endpoints;
#                false: pair lowest with highest and the two middle ones
#   seed         seeds the global RNG (side effect: Random.seed!(seed))
# Returns the rewired graph; with q == 0 the initial graph is returned unchanged.
# NOTE(review): the loop needs two different edge indices, so it looks like it cannot
# terminate on a graph with exactly one edge -- confirm this cannot occur for the
# parameters used.
function XBS(n, λ, q, assortative, seed)
    p = λ / n
    Random.seed!(seed)
    g = erdos_renyi(n, p)
    q == 0 && return g
    e = [(x.src, x.dst) for x in edges(g)]   # current edge list, kept in sync with g
    touched = falses(length(e))
    count_touched = 0
    while count_touched < length(e)
        i, j = rand(axes(e, 1)), rand(axes(e, 1))
        if i != j
            vs = [e[i]..., e[j]...]
            # the two edges must have four distinct endpoints
            if allunique(vs)
                if rand() < q
                    # vs becomes ordered by degree; new pairs are (vs[1], vs[2]), (vs[3], vs[4])
                    sort!(vs, by=x -> degree(g, x))
                    if !assortative
                        vs[2], vs[4] = vs[4], vs[2]
                    end
                else
                    shuffle!(vs)
                end
                a1, a2, b1, b2 = vs
                if ((a1, a2) == e[i] || (a1, a2) == e[j])
                    # proposal reproduces an existing edge: nothing to rewire, but
                    # both edges count as touched
                    count_touched += !touched[i] + !touched[j]
                    touched[i] = true
                    touched[j] = true
                else
                    # rewire only if it does not create a multi-edge
                    if !has_edge(g, a1, a2) && !has_edge(g, b1, b2)
                        @assert rem_edge!(g, e[i]...)
                        @assert rem_edge!(g, e[j]...)
                        e[i] = (a1, a2)
                        e[j] = (b1, b2)
                        add_edge!(g, a1, a2)
                        add_edge!(g, b1, b2)
                        count_touched += !touched[i] + !touched[j]
                        touched[i] = true
                        touched[j] = true
                    end
                end
            end
        end
    end
    @assert all(touched)
    return g
end

In [None]:
## Fig 4.8 with N nodes and averaging Rep results
## (order of the giant component vs expected average degree)
N = 2^9 ## we use 2^16 and Rep=64 in book
Rep = 8
seeds = rand(UInt64, Rep)

df = DataFrame()
for seed in seeds
    for (q, a) in ((0.0, true), (1 / 3, true), (2 / 3, true), (1 / 3, false), (2 / 3, false))
        for d in 0.25:0.25:3
            g = XBS(N, d, q, a, seed)  # use N so that changing it above takes effect
            # fraction of nodes in the largest connected component
            c = maximum(length, connected_components(g)) / nv(g)
            push!(df, (seed=seed, q=q, a=a, d=d, component=c))
        end
    end
end

## average over the seeds, then draw one curve per (q, a) combination;
## the index vector reorders the groups to match the legend below
@chain df begin
    groupby([:q, :a, :d])
    combine(:component => mean => :component)
    groupby([:q, :a])
    foreach((c, s, sdf) -> plot(sdf.d, sdf.component, color=c, linestyle=s),
        ["black", "gray", "black", "gray", "black"],
        ["-", "-", ":", "--", "--"], _[[3, 2, 1, 4, 5]])
end
ylim([0.0, 1.0])
xlim([0.0, 3.0])
xlabel("average degree")
ylabel("fraction of nodes")
legend(["assortative, q=2/3", "assortative, q=1/3", "q=0",
    "disassortative, q=1/3", "disassortative, q=2/3"]);

In [None]:
## Fig 4.7
## (degree correlation function for various assortativity values, XBS algorithm)

Random.seed!(1234)
seeds = rand(UInt64, Rep)
df = DataFrame()
for seed in seeds
    for (q, a) in ((0.0, true), (1 / 3, true), (2 / 3, true), (1 / 3, false), (2 / 3, false))
        g = XBS(N, 4, q, a, seed)  # N is set in the previous cell
        g = induced_subgraph(g, findall(>(0), degree(g)))[1]  # drop isolated nodes
        ds, knns = eachcol(deg_corr_df(g))
        append!(df, DataFrame(seed=seed, q=q, a=a, ds=ds, knns=knns))
    end
end

## average k_nn over the seeds, then draw one curve per (q, a) combination;
## the index vector reorders the groups to match the legend below
@chain df begin
    groupby([:q, :a, :ds])
    combine(:knns => mean => :knns)
    sort(:ds)
    groupby([:q, :a])
    foreach((c, s, sdf) -> plot(sdf.ds, sdf.knns, color=c, linestyle=s),
        ["black", "gray", "black", "gray", "black"],
        ["-", "-", ":", "--", "--"], _[[3, 2, 1, 4, 5]])
end
xlabel("degree (\$\\ell\$)")
ylabel("\$k_{nn} (\\ell)\$")
xticks(1:2:17)
xlim([0.0, 10.0])
ylim([3, 8.0])
legend(["assortative, q=2/3", "assortative, q=1/3", "q=0",
        "disassortative, q=1/3", "disassortative, q=2/3"],
    loc="upper center", ncol=2);


# US Airports Graph

We use the same dataset as in previous chapter: a directed, weighted graph where edge weights are passenger volumes between airports. The weights are not used in this notebook, as we only consider node degrees (in, out and total degrees).


In [None]:
## read edges and build the directed airport graph (weights are not loaded)
D = CSV.read(datadir * "Airports/connections.csv", DataFrame)

## vertex i of the graph is the i-th airport code in sorted order
id2name = sort!(unique(union(D.orig_airport, D.dest_airport)))
name2id = Dict(id2name .=> axes(id2name, 1))
g = SimpleDiGraph(length(id2name))
for row in eachrow(D)
    from = name2id[row.orig_airport]
    to = name2id[row.dest_airport]
    from == to || add_edge!(g, from, to)  # self-loops are skipped
end
g

## Directed Degree Correlation Functions (4 cases)

Plot $k_{nn}^{mode1,mode2}(l)$ for all degrees $l$, where mode1 and mode2 are either 'indegree' or 'outdegree'.
As detailed in section 4.5, this is the average mode2-degree of mode1-neighbours of nodes with mode1-degree = $l$.
Dotted lines are for neutral graphs.

While not identical, we see that the four plots are very similar, showing a slightly negative correlation except for the nodes of small degree. This similarity is not surprising as in this directed graph the edges very often come in pairs: when there are flights from A to B, there are very often flights from B to A.


In [None]:
## (in, in): in-degree of the edge target vs in-degree of the edge source
knn = deg_corr_directed(g, indegree, indegree)
r = deg_corr_neutral(g, indegree)  # neutral reference <k^2>/<k> for in-degrees
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## (in, out): out-degree of the edge target vs in-degree of the edge source
knn = deg_corr_directed(g, indegree, outdegree)
r = deg_corr_neutral(g, outdegree)  # neutral reference <k^2>/<k> for out-degrees
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## (out, in): in-degree of the edge target vs out-degree of the edge source
knn = deg_corr_directed(g, outdegree, indegree)
r = deg_corr_neutral(g, indegree)  # neutral reference <k^2>/<k> for in-degrees
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## (out, out): out-degree of the edge target vs out-degree of the edge source
knn = deg_corr_directed(g, outdegree, outdegree)
r = deg_corr_neutral(g, outdegree)  # neutral reference <k^2>/<k> for out-degrees
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

##  Airport graph -- undirected

We now consider an **undirected** version of the airport graph.  Degree correlation is quite similar to the previous plots. We plot both using linear and log scale.


In [None]:
## Undirected graph
## from here on `g` refers to the undirected version of the airport graph
g = SimpleGraph(g)

In [None]:
## degree correlation function of the undirected airport graph (linear scale)
knn = deg_corr_df(g)
r = deg_corr_neutral(g, degree)  # neutral reference <k^2>/<k>
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree l", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## log scale plot
## reuses `knn` and `r` computed in the previous cell
loglog(eachcol(knn)..., "o", c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
ylim((52, 120))
xlabel("degree l", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

# State by state assortativity and correlation exponent

We consider each state in the airport graph for which there are more than 5 vertices that have within-state edges.
We compute the following quantities defined in section 4.3 of the book:

* the degree correlation coefficient, or degree assortativity ($r$)
* the correlation exponent ($\mu$) estimated via regression


In [None]:
## airport attributes; the assertions check that row i of A describes vertex i of g
A = CSV.read(datadir * "Airports/airports_loc.csv", DataFrame)
A.id = [name2id[a] for a in A.airport]
@assert A.id == axes(A, 1)
@assert A.airport == id2name
first(A, 5)

In [None]:
## for each state compute degree assortativity (r)
## note that we drop airports w/o in-state edge
## also estimate correlation exponent (mu) via regression (taking the logs)
## Show assortativity and mu for states with more than 5 vertices
P = DataFrame(state=String[], nodes=Int[], edges=Int[], assortativity=Float64[], mu=Float64[])
for s in unique(A.state)
    hva = findall(==(s), A.state)  # vertex ids of the airports in state s
    G = induced_subgraph(g, hva)[1]
    G = induced_subgraph(G, findall(>(0), degree(G)))[1]  # drop isolated airports
    if nv(G) > 5
        mu = corr_exp(G)
        push!(P, [s, nv(G), ne(G), assortativity(G), mu])
    end
end
## ascending order: the first rows have the lowest assortativity
sort!(P, :assortativity)
first(P, 5)

In [None]:
## show the last rows of P, which is sorted by increasing 'r' (highest assortativity)
last(P, 5)

### Correlation between $r$ and $\mu$

Some states are quite small (very few nodes and edges), but we still see good correlation between those quantities.

In [None]:
## one point per state: assortativity r against correlation exponent mu
plot(P.assortativity, P.mu, "o", color="black")
xlabel("degree correlation coefficient (r)", fontsize=12)
ylabel("correlation exponent (mu)", fontsize=12)
println("Pearson correlation: ", cor(P.assortativity, P.mu))
## identity line for reference
ident = [-1.0, 1.0]
plot(ident, ident, ":", c="gray");

## Looking at a few states with high/low assortativity

## Two extreme cases

Below we show two small state subgraphs with $r=-1$ (Nebraska - NE) and $r=1$ (Arkansas - AR).
The AR graph consists of two cliques, so each node has the same degree as its neighbour. On the other hand, in the NE graph every node of degree 1 has a neighbour of degree 2, and nodes of degree 2 have 2 neighbours of degree 1. 


In [None]:
## state with r = -1 (NE)
hva = findall(A.state .== "NE")
G_D = induced_subgraph(g, hva)[1]
G_D = induced_subgraph(G_D, findall(>(0), degree(G_D)))[1]  # drop isolated airports
println("r = ", assortativity(G_D))
Random.seed!(4)  # seed set before drawing, presumably to fix the layout
gplot(G_D,
      NODESIZE=0.03, nodefillc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

In [None]:
## state with r = +1 (AR)
hva = findall(A.state .== "AR")
G_D = induced_subgraph(g, hva)[1]
G_D = induced_subgraph(G_D, findall(>(0), degree(G_D)))[1]  # drop isolated airports
println("r = ", assortativity(G_D))
Random.seed!(4)  # seed set before drawing, presumably to fix the layout
gplot(G_D,
      NODESIZE=0.03, nodefillc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

## Positive assortativity: ND+SD

We merge the graphs from North and South Dakota; the resulting graph has positive assortativity value $r$ = 0.243. 
We notice the presence of a dense region (interconnected high degree nodes) and some tendrils with low degree nodes. 


In [None]:
## North and South Dakota merged into one subgraph
hva = findall(in(["SD", "ND"]), A.state)
G_D = induced_subgraph(g, hva)[1]
G_D = induced_subgraph(G_D, findall(>(0), degree(G_D)))[1]  # drop isolated airports
println("r = ", assortativity(G_D))
Random.seed!(4)  # seed set before drawing, presumably to fix the layout
gplot(G_D,
      NODESIZE=0.03, nodefillc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

Below we compare $r$ and $\mu$ for random models with the same degree distribution as in the North+South Dakota graph ```G_D```. We use the configuration model from Viger and Latapy; we also report the proportion of nodes above the structural cutoff given the degree distribution.

The resulting values are quite different than for the real graph.

In [None]:
## compare r and mu vs random models for the Dakotas: G = SD+ND
## here we use the configuration model
## we also report the proportion of nodes above the structural cutoff
## given the degree distribution.
r = Float64[]
mu = Float64[]
for i in 1:1000
    ## `g_cm` rather than `cm`, which is the length unit used in `20cm` above
    g_cm = cm_simple(degree(G_D))
    push!(r, assortativity(g_cm))
    push!(mu, corr_exp(g_cm))
end
## structural cutoff
sc = sqrt(2 * ne(G_D))
## fraction of nodes whose degree exceeds the cutoff; kept out of `p`, which is the
## SPA edge probability used in earlier cells
frac_above = count(>(sc), degree(G_D)) / nv(G_D)
println("r = ", assortativity(G_D), "\nmu = ", corr_exp(G_D), "\nP(nodes above structural cutoff) = ", frac_above)
boxplot([r, mu], labels=["assortativity (r)", "correlation exponent (mu)"], widths=0.7, sym="");

Another way to see this is to plot the degree correlation functions for the real graphs and for one of the generated configuration model graphs, which we show below.

In [None]:
## degree correlation function for the Dakotas graph
knn = deg_corr_df(G_D)
r = deg_corr_neutral(G_D, degree)  # neutral reference <k^2>/<k>
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree (k)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## degree correlation function for a configuration model random graph
## (a fresh sample with the degree sequence of G_D)
## quite different!
cms = cm_simple(degree(G_D))
knn = deg_corr_df(cms)
r = deg_corr_neutral(cms, degree)  # neutral reference <k^2>/<k>
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree (k)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

## Negative assortativity: MO


The subgraph for MO (Missouri) has negative assortativity value r = -0.547. We see that in this graph the low degree nodes mostly connect to the large degree hub nodes.


In [None]:
## Missouri subgraph
hva = findall(==("MO"), A.state)
G_D = induced_subgraph(g, hva)[1]
G_D = induced_subgraph(G_D, findall(>(0), degree(G_D)))[1]  # drop isolated airports
println("r = ", assortativity(G_D))
Random.seed!(4)  # seed set before drawing, presumably to fix the layout
gplot(G_D,
      NODESIZE=0.03, nodefillc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

Below we compare $r$ and $\mu$ for random models with the same degree distribution as in the Missouri graph ```G_D```. We use the configuration model from Viger and Latapy; we also report the proportion of nodes above the structural cutoff given the degree distribution.

The values are similar to the ones for the real graph. Therefore, in that case, the resulting values for $r$ and $\mu$ can mostly be explained by the degree distribution: hubs are forced to be adjacent to small degree nodes. 


In [None]:
## r and mu vs random configuration model for MO graph
## here we use the configuration model
## we also report the proportion of nodes above the structural cutoff
## given the degree distribution.
r = Float64[]
mu = Float64[]
for i in 1:1000
    g_cm = cm_simple(degree(G_D))
    push!(r, assortativity(g_cm))
    push!(mu, corr_exp(g_cm))
end
## structural cutoff
sc = sqrt(2 * ne(G_D))
## fraction of nodes whose degree exceeds the cutoff; kept out of `p`, which is the
## SPA edge probability used in earlier cells
frac_above = count(>(sc), degree(G_D)) / nv(G_D)
println("r = ", assortativity(G_D), "\nmu = ", corr_exp(G_D), "\nP(nodes above structural cutoff) = ", frac_above)
boxplot([r, mu], labels=["assortativity (r)", "correlation exponent (mu)"], widths=0.7, sym="");

Another way to see this is to plot the degree correlation functions for the real graphs and for one of the generated configuration model graphs, which we show below.

In [None]:
## degree correlation function for MO graph
knn = deg_corr_df(G_D)
r = deg_corr_neutral(G_D, degree)  # neutral reference <k^2>/<k>
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree (k)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## degree correlation function for a configuration random graph
## (a fresh sample with the degree sequence of G_D)
## quite similar!
cms = cm_simple(degree(G_D))
knn = deg_corr_df(cms)
r = deg_corr_neutral(cms, degree)  # neutral reference <k^2>/<k>
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree (k)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

## Back to Overall US Airport graph 

We go back to the full (undirected) US airport graph in order to illustrate two concepts:

* the friendship "paradox" 
* the rich club phenomenon, if any

For each node, we compare its degree with the average degree of its neighbours. We draw a line with unit slope to highlight the fact that the region above that line is much denser. This is due to the fact that there are many low degree nodes (smaller airports) which mostly tend to connect to hub airports, which explains the presence of this “paradox.”

Next we compute the rich-club ratio $\rho(l)$ for all values of $l$. We see that the curve starts at 1, increases slightly before decreasing gradually. We conclude that there is no indication of a rich-club phenomenon here.


In [None]:
## plot degree vs avg neighbour degree
## friendship 'paradox' US Airport graph (overall)
deg = degree(g)
## nad[v] = mean degree of the neighbours of v
nad = [mean(degree(g, neighbors(g, v))) for v in 1:nv(g)]
scatter(deg, nad, c="black", marker=".")
xlim((0, 200))
ylim((0, 200))
xlabel("node degree", fontsize=14)
ylabel("average neighbour degree", fontsize=14);
## unit-slope reference line
plot([0, 200], [0, 200], "--", c="gray")
print("r = ", assortativity(g));

In [None]:
## looking for rich club -- not here!
## rich-club ratio evaluated at every distinct degree of the graph
d = unique(degree(g))
rc = rich_club.(Ref(g), d)
semilogx(d, rc, ".", c="black")
xlabel("degree l", fontsize=12)
ylabel("rich club coefficient rho(l)");

## Assortativity: Europe Electric Grid Network

This is a graph we saw before. 
Recall that the degree distribution is quite uniform (no high degree node, for example).

We notice a small, but positive assortativity $r$, which we also see on the degree correlation function plot.
From the average neighbour degree plot, the friendship paradox is not obvious here, mainly due to the absence of high degree "hub" nodes.


In [None]:
## Europe Electric Grid
## each line of the file is split on whitespace into the two endpoint labels of an edge
edge_list = split.(readlines(datadir * "GridEurope/gridkit_europe-highvoltage.edges"))
vertex_ids = unique(reduce(vcat, edge_list))
## map the labels to consecutive vertex ids 1..n
vertex_map = Dict(vertex_ids .=> 1:length(vertex_ids))
grid = SimpleGraph(length(vertex_ids))
foreach(((from, to),) -> add_edge!(grid, vertex_map[from], vertex_map[to]), edge_list)

In [None]:
## plot degree vs avg neighbour degree

deg = degree(grid)
## nad[v] = mean degree of the neighbours of v
nad = [mean(degree(grid, neighbors(grid, v))) for v in 1:nv(grid)]
scatter(deg, nad, c="black", marker=".")
xlim((0, 18))
ylim((0, 18))
xlabel("node degree", fontsize=14)
ylabel("average neighbour degree", fontsize=14);
## unit-slope reference line
plot([0, 18], [0, 18], "--", c="gray")
print("r = ", assortativity(grid));

In [None]:
## Degree correlation function

knn = deg_corr_df(grid)
r = deg_corr_neutral(grid, degree)  # neutral reference <k^2>/<k>
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree (k)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

## Assortativity: GitHub Developers Graph

This is another graph we saw before. 

Here, we observe negative assortativity, which is common for social networks, and we also strongly observe the friendship paradox phenomenon.

Despite the presence of high degree nodes, there is no evidence of the rich club phenomenon in this graph.

In [None]:
## GitHub Developers (undirected)
## every id is shifted by one, presumably because the file uses 0-based node ids
D = CSV.read(datadir * "GitHubDevelopers/musae_git_edges.csv", DataFrame) .+ 1
max_node_id = max(maximum(D.id_1), maximum(D.id_2))
git = SimpleGraph(max_node_id)
foreach(row -> add_edge!(git, row...), eachrow(D))

In [None]:
## plot degree vs avg neighbour degree
## zoom in on nodes with degree < LIM
LIM = 1000
deg = degree(git)
## nad[v] = mean degree of the neighbours of v
nad = [mean(degree(git, neighbors(git, v))) for v in 1:nv(git)]
scatter(deg, nad, c="black", marker=".")
xlim((0, LIM))
ylim((0, LIM))
xlabel("node degree", fontsize=14)
ylabel("average neighbour degree", fontsize=14);
## unit-slope reference line
plot([0, LIM], [0, LIM], "--", c="gray")
print("r = ", assortativity(git));

In [None]:
## degree correlation function
## (x axis limited to LIM, which is defined in the previous cell)
knn = deg_corr_df(git)
r = deg_corr_neutral(git, degree)  # neutral reference <k^2>/<k>
xlim((0, LIM))
scatter(eachcol(knn)..., c="black")
hlines(y=r, xmin=minimum(knn.src_deg), xmax=maximum(knn.src_deg), linestyles=":")
xlabel("degree (k)", fontsize=12)
ylabel(raw"$k_{nn}(\ell)$", fontsize=12);

In [None]:
## still no rich club group here
## rich-club ratio evaluated at every distinct degree of the graph
d = unique(degree(git))
rc = rich_club.(Ref(git), d)
semilogx(d, rc, ".", c="black")
xlabel("degree l", fontsize=12)
ylabel("rich club coefficient rho(l)");

## Showing a Rich Club phenomenon: the Actors graph

This data set is part of the accompanying material of the book "Complex Networks: Principles, Methods and Applications", V. Latora, V. Nicosia, G. Russo, Cambridge University Press (2017).

It has 248,243 nodes (actors) and 8,302,734 edges (actors co-appearing in at least 1 movie).

We use this data to illustrate the presence of a "rich club", which may be explained by the fact that famous actors tend to play in many movies (high degree) and with other famous actors (rich-club phenomenon).

To speed up the computation, we sample 100 distinct node degrees appearing in the graph for which we compute and plot the rich club coefficient. This can still take a few minutes to complete.


In [None]:
## Actors graph: read the edge list, build the graph and plot the rich-club ratio
## every id is shifted by one, presumably because the file uses 0-based node ids
D = CSV.read(datadir * "Actors/movie_actors.net", header=[:id_1, :id_2], DataFrame) .+ 1
max_node_id = max(maximum(D.id_1), maximum(D.id_2))
g = SimpleGraph(max_node_id)
foreach(row -> add_edge!(g, row...), eachrow(D))
g = induced_subgraph(g, findall(>(0), degree(g)))[1]  # drop isolated nodes

## 100 distinct degrees sampled without replacement (drawn from the global RNG,
## so the selection changes between runs unless a seed is set beforehand)
d = sample(unique(degree(g)), 100, replace=false)

rc = rich_club.(Ref(g), d)
semilogx(d, rc, ".", c="black")
xlabel("degree l", fontsize=12)
ylabel("rich club coefficient rho(l)");