# Chapter 3 - Centrality Measures

## Requirements

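* Julia packages: Graphs, SimpleWeightedGraphs, DataFrames, CSV, PyPlot, GraphPlot, StatsBase, CategoricalArrays (LinearAlgebra and Random ship with Julia)
* set the data directory (`datadir`) in the next cell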

In [None]:
# Root folder holding the datasets; the cells below append relative paths such as
# "Airports/connections.csv" to it, so keep the trailing slash.
datadir = "../Datasets/"

In [None]:
using Graphs
using SimpleWeightedGraphs
using DataFrames
using CSV
using PyPlot
using GraphPlot
using LinearAlgebra
using StatsBase
using Random
using CategoricalArrays

In [None]:
# Set the COLUMNS environment variable to a large value
# (presumably so that wide DataFrame output is not truncated -- confirm).
ENV["COLUMNS"] = 1000

## US Airport Volume of Passengers

The nodes are represented by the 3-letter airport codes; we also read in the volume of passengers that we use as edge weights. The edges are directed.

We also read node attributes: lat/lon, state, city.

In [None]:
## read the edge list (the weighted directed graph is built from it further below);
## later cells use the columns orig_airport, dest_airport and total_passengers
D = CSV.read(datadir * "Airports/connections.csv", DataFrame)

In [None]:
# rescale the passenger volumes by dividing every value by the largest one
max_passengers = maximum(D.total_passengers)
D.total_passengers = D.total_passengers / max_passengers
# (smallest, largest) rescaled weight
extrema(D.total_passengers)

In [None]:
# sorted list of every airport code seen as an origin or a destination;
# the vertex id of an airport is the position of its code in this list
id2name = sort!(union(D.orig_airport, D.dest_airport))
name2id = Dict(zip(id2name, eachindex(id2name)))
g = SimpleWeightedDiGraph(length(id2name))
for (orig, dest, weight) in zip(D.orig_airport, D.dest_airport, D.total_passengers)
    src, dst = name2id[orig], name2id[dest]
    # connections from an airport to itself are not added
    if src != dst
        add_edge!(g, src, dst, weight)
    end
end
g

In [None]:
## read the node attributes and attach the vertex id of every airport
A = CSV.read(datadir * "Airports/airports_loc.csv", DataFrame)
A.id = [name2id[a] for a in A.airport]
# sanity checks: the rows of A must be in vertex-id order, so that row i of A
# describes vertex i of g (later cells index A directly with vertex ids)
@assert A.id == axes(A, 1)
@assert A.airport == id2name
A

In [None]:
## connected components of the full graph (each one is a list of vertex ids)
connected_components(g)

In [None]:
## attributes of the airports in the last component returned
A[connected_components(g)[end], :]

In [None]:
## largest core number over all vertices of the full graph
maximum(core_number(g))

## California subgraph - we look at several centrality measures 

In [None]:
## California-only subgraph (any other state code can be used instead);
## airports without any in-state connection are removed afterwards

# vertex ids (= row numbers of A) of the Californian airports
CA = findall(==("CA"), A.state)
G = first(induced_subgraph(g, CA))
A_CA = A[CA, :]
# keep only the vertices that still have at least one incident edge
NZ = findall(d -> d > 0, degree(G))
G = first(induced_subgraph(G, NZ))
A_CANZ = A_CA[NZ, :]

In [None]:
## again the graph is weakly connected except for 2 airports
## (components of the California subgraph, as lists of its vertex ids)
connected_components(G)

In [None]:
## attributes of the airports in the second component returned
## (NOTE(review): relies on the order in which components are returned -- confirm)
A_CANZ[connected_components(G)[2], :]

In [None]:
## plot using lat/lon as layout
## (latitude is negated -- presumably because the plot's y axis grows downwards; confirm)
gplot(G, A_CANZ.lon, -A_CANZ.lat,
      NODESIZE=0.03, nodefillc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
"""
    dir_degree_centrality(G::SimpleWeightedDiGraph)

Return one score per vertex: the sum of the vertex's column and of its row in the
weight matrix `G.weights` (total incoming plus outgoing edge weight), divided by
`2 * (nv(G) - 1)`.
"""
function dir_degree_centrality(G::SimpleWeightedDiGraph)
    W = G.weights
    col_totals = vec(sum(W, dims=1))
    row_totals = vec(sum(W, dims=2))
    return (col_totals + row_totals) / (2 * (nv(G) - 1))
end

In [None]:
"""
    pagerank_simple(G::SimpleWeightedDiGraph; α=0.85)

Closed-form PageRank. Every column of `G.weights` is divided by its sum to give a
matrix `B`; the result is `(1 - α) / n * ((I - α * B) \\ ones(n))` with `n = nv(G)`.

# Keywords
- `α`: damping factor (default `0.85`).
"""
function pagerank_simple(G::SimpleWeightedDiGraph; α=0.85)
    n = nv(G)
    W = G.weights
    col_totals = sum(W, dims=1)
    P = W ./ col_totals
    # entries that became NaN (0 / 0 in an all-zero column, i.e. a node with
    # 0 out-degree) are replaced by the uniform value 1 / n
    dangling = findall(isnan, P)
    P[dangling] .= 1 / n
    rhs = ones(n)
    return (1 - α) / n * ((I - α * P) \ rhs)
end

In [None]:
"""
    hub_authority_simple(G::SimpleWeightedDiGraph)

Compute a pair of scores `(x, y)` from the dense weight matrix `A = Matrix(G.weights)`:
`y` is the eigenvector of `AᵀA` for its largest eigenvalue (sign chosen so that the
entries are non-negative) and `x = A * y`. Both vectors are scaled so that their
largest entry is 1. The notebook labels the first vector "authority" and the second
"hub".
"""
function hub_authority_simple(G::SimpleWeightedDiGraph)
    A = Matrix(G.weights)
    # AᵀA is symmetric by construction; wrapping it in `Symmetric` makes `eigen` use the
    # symmetric solver regardless of floating-point round-off, which yields real
    # eigenvalues in ascending order -- so the last column is the dominant eigenvector
    e = eigen(Symmetric(transpose(A) * A))
    y = e.vectors[:, end]
    # an eigenvector is only defined up to sign: flip it if it came out non-positive
    if all(<=(eps()), y)
        y .= -y
    end
    # internal invariant: the dominant eigenvector must now be entry-wise non-negative
    @assert all(>=(-eps()), y)
    x = A * y
    y ./= maximum(y)
    x ./= maximum(x)
    return x, y
end

In [None]:
"""
    simple_closeness(G::SimpleGraph)

Return, for every vertex, `(nv(G) - 1)` divided by the sum of its hop distances to
all sources, where every individual distance is capped at `nv(G)`.
"""
function simple_closeness(G::SimpleGraph)
    n = nv(G)
    totals = zeros(n)
    for src in 1:n
        # distances larger than n (e.g. the sentinel used for unreachable vertices)
        # are replaced by n before being accumulated
        totals .+= min.(gdistances(G, src), n)
    end
    return (n - 1) ./ totals
end

In [None]:
"""
    simple_eccentricity(G::SimpleDiGraph)

Return, for every vertex `v`, the largest hop distance from `v` to any vertex, where
distances equal to `typemax(Int)` are counted as 0.
"""
function simple_eccentricity(G::SimpleDiGraph)
    unreachable = typemax(Int)
    return map(1:nv(G)) do v
        maximum(d -> d == unreachable ? 0 : d, gdistances(G, v))
    end
end

In [None]:
"""
    harmonic_centrality(G::SimpleWeightedDiGraph)

Return, for every vertex `v`, the mean of the reciprocal hop distances from `v`.
Infinite reciprocals (distance 0) are dropped before averaging, and reciprocals
equal to `1 / typemax(Int)` are counted as 0.
"""
function harmonic_centrality(G::SimpleWeightedDiGraph)
    unreachable = 1 / typemax(Int)
    return map(1:nv(G)) do v
        reciprocal = filter(!isinf, 1 ./ gdistances(G, v))
        replace!(reciprocal, unreachable => 0)
        mean(reciprocal)
    end
end

In [None]:
## one row per airport of the California graph, one column per centrality measure
df = DataFrame("airport" => A_CANZ.airport,
    "degree" => dir_degree_centrality(G),
    "pagerank" => pagerank_simple(G),
    # hub_authority_simple returns a 2-tuple; its entries are paired with the two
    # column names and splatted into the constructor
    (["authority", "hub"] .=> hub_authority_simple(G))...,
    # betweenness is computed on the unweighted directed version of G
    "between" => 2 * betweenness_centrality(SimpleDiGraph(G)),
    "harmonic" => harmonic_centrality(G),
    # closeness is computed on the unweighted, undirected version of G
    "closeness" => simple_closeness(SimpleGraph(SimpleDiGraph(G))),
    # eccentricity is computed on the unweighted directed version of G
    "eccentricity" => simple_eccentricity(SimpleDiGraph(G))
)
# display the table ordered by decreasing degree centrality (df itself is not reordered)
sort(df, :degree, rev=true)

In [None]:
## highlight top-3 airports w.r.t. pagerank
## plot using lat/lon as layout
## (ranks are computed in decreasing order, so ranks 1-3 are the largest pageranks)
gplot(G, A_CANZ.lon, -A_CANZ.lat,
      NODESIZE=0.03, nodefillc=ifelse.(ordinalrank(df.pagerank, rev=true) .<= 3, "red", "black"),
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
## rank-based correlation between measures
## (Kendall correlation between every pair of columns of df except the first one,
## which holds the airport codes)
DataFrame(corkendall(Matrix(df[:, 2:end])), names(df)[2:end])

In [None]:
## core number of every vertex of the California graph
coreness = core_number(G)

## smallest and largest core numbers (mc was used below without ever being defined)
mc = minimum(coreness)
Mc = maximum(coreness)

## red: vertices of the maximum core; blue: core number at most mc + 1; black: the rest
color = [x == Mc ? "red" : x <= mc + 1 ? "blue" : "black" for x in coreness];

In [None]:
## plot nodes w.r.t. coreness
## (node colours come from the `color` vector; lat/lon is used as layout)
gplot(G, A_CANZ.lon, -A_CANZ.lat,
      NODESIZE=0.03, nodefillc=color,
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
## the above uses the geographical layout, so it is not clear what is going on
## let's use a spring layout
## The different coreness levels are now easier to tell apart
## (no coordinates are passed to gplot here; the seed is fixed, presumably so that
## the layout is reproducible -- confirm)
Random.seed!(12)
gplot(G,
      NODESIZE=0.03, nodefillc=color,
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
## show closeness centralities, same layout
## (same seed as in the previous plot; labels are the closeness values rounded to 2 digits)
Random.seed!(12)
gplot(G, nodelabel=round.(df.closeness, digits=2),
      nodelabeldist=8, nodelabelangleoffset=π / 4,
      NODESIZE=0.01, nodefillc=color,
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
## now the 13-core clearly appears, and we also observe a small connected component
## that was buried in the previous visualization.

## vertices in the 13-core
## note that there are fewer than 14 nodes; this is an interesting remark, and it
## happens because both in-coming and out-going edges are considered by default
## for a directed graph.
df.airport[coreness.==Mc]

In [None]:
## comparing coreness with other centrality measures
## (median of every column from :degree to :closeness within each coreness value;
## renamecols=false keeps the original column names)
df.coreness = coreness
combine(groupby(df, :coreness, sort=true),
        names(df, Between(:degree, :closeness)) .=> median,
        renamecols=false)

In [None]:
## group in 3 categories: "high" = maximum core, "low" = coreness at most 2, "mid" = the rest
## (the top core is computed rather than hard-coded as 13, so the grouping stays
## correct when another state or graph is used)
max_core = maximum(df.coreness)
df.core_grp = categorical([x <= 2 ? "low" : x == max_core ? "high" : "mid" for x in df.coreness])
## impose the order low < mid < high, which the sorted groupby below follows
levels!(df.core_grp, ["low", "mid", "high"])
## mean of every column from :degree to :closeness within each group
df_grp = combine(groupby(df, :core_grp, sort=true),
    names(df, Between(:degree, :closeness)) .=> mean,
    renamecols=false)

In [None]:
## grouped barplot
# one vector of mean scores per row of df_grp (first column, the group label, excluded);
# the rows are expected in the order low, mid, high
bl, bm, bh = Vector.(eachrow(df_grp[:, 2:end]))
barWidth = 0.25
# Set position of bar on X axis: one slot per measure, the three groups side by side
r1 = 1:length(bh)
r2 = r1 .+ barWidth
r3 = r2 .+ barWidth
# Make the plot
bar(r1, bh, color="red", width=barWidth, edgecolor="white", label="high coreness")
bar(r2, bm, color="blue", width=barWidth, edgecolor="white", label="mid coreness")
bar(r3, bl, color="black", width=barWidth, edgecolor="white", label="low coreness")

# Axis labels; the xticks sit on the middle bar of each group and carry the measure names
xlabel("measure", fontsize=14)
xticks(r2, names(df_grp, Not(1)), fontsize=10)
ylabel("score", fontsize=14)
# Create legend & Show graphic
legend(fontsize=12);

### delta-centrality example

In [None]:
## Delta-centrality with a simple pandemic spread model
"""
    spread(A::AbstractMatrix, α=0.1)

Solve `(I - α * transpose(A)) * s = ones(n)` with `n = size(A, 1)` and return the
average of the entries of `s`.
"""
function spread(A::AbstractMatrix, α=0.1)
    n = size(A, 1)
    e = ones(n)
    reach = (I - α * transpose(A)) \ e
    return dot(e, reach) / n
end

"""
    spread_delta_centrality(g::SimpleDiGraph, α=0.1)

Return, for every vertex, the relative drop of `spread` obtained when all edges
entering and leaving that vertex are removed from the adjacency matrix:
`(spread(A) - spread(A without the vertex's edges)) / spread(A)`.
"""
function spread_delta_centrality(g::SimpleDiGraph, α=0.1)
    adj = Matrix(adjacency_matrix(g))
    baseline = spread(adj, α)
    deltas = Vector{Float64}(undef, nv(g))
    for node in eachindex(deltas)
        # work on a copy in which row and column `node` are zeroed out
        pruned = copy(adj)
        pruned[node, :] .= 0
        pruned[:, node] .= 0
        deltas[node] = (baseline - spread(pruned, α)) / baseline
    end
    return deltas
end

In [None]:
## delta centrality of every California airport, computed on the unweighted directed graph
df.delta = spread_delta_centrality(SimpleDiGraph(G))
## copy of the table ordered by decreasing delta centrality
df2 = sort(df, :delta, rev=true)

In [None]:
## bar chart of the delta centralities, one bar per airport, in decreasing order
heights = df2.delta
bars = df2.airport
y_pos = axes(bars, 1)
# bar colour follows the coreness group; the categorical values are converted to plain
# strings with `String.` (`get` on categorical values is deprecated in CategoricalArrays)
bar(y_pos, heights, color=recode(String.(df2.core_grp), "high" => "red", "mid" => "blue", "low" => "black"))
# Rotation of the bars names
ylabel("Delta Centrality", fontsize=12)
xticks(y_pos, bars, rotation=90)
yticks();

## Group centrality, centralization

Back to the US graph:
- which states have highest delta centralities w.r.t. efficiency?
- what about centralization for each state subgraph?

In [None]:
## group delta centrality
"""
    efficiency(g::SimpleDiGraph)

Return the sum of `1 / d(i, j)` over all ordered vertex pairs whose hop distance
satisfies `0 < d(i, j) < nv(g)`, divided by `n * (n - 1)` with `n = nv(g)`.
"""
function efficiency(g::SimpleDiGraph)
    n = nv(g)
    total = 0.0  # Float64 from the start, so the accumulator never changes type
    for i in 1:n
        dists = gdistances(g, i)
        # the filter drops d == 0 (the source itself) and every d >= n (which includes
        # the large sentinel reported for unreachable vertices); `init` covers the
        # case where nothing passes the filter
        total += sum((1 / d for d in dists if 0 < d < n); init=0.0)
    end
    return total / (n * (n - 1))
end

In [None]:
## delta centrality of every state w.r.t. efficiency: relative loss of efficiency of
## the unweighted US graph once all edges touching the state's airports are removed
sg = SimpleDiGraph(g)
states = unique(A.state)
eff_us = efficiency(sg)
dc = Float64[]
for s in states
    members = findall(==(s), A.state)
    pruned = copy(sg)
    # detach every airport of the state (its vertices stay, its edges go)
    for j in members, i in 1:nv(pruned)
        rem_edge!(pruned, i, j)
        rem_edge!(pruned, j, i)
    end
    push!(dc, (eff_us - efficiency(pruned)) / eff_us)
end
DC = sort!(DataFrame(state=states, delta_centrality=dc), :delta_centrality, rev=true)
first(DC, 3)

In [None]:
## the three states with the smallest delta centrality (DC is sorted in decreasing order)
last(DC, 3)

In [None]:
## group centralization (using PageRank) -- by state
## centralization of a state = largest PageRank minus mean PageRank in its subgraph
states = unique(A.state)
pr = Float64[]
st = String[]
for s in states
    # loop-local names: assigning `G` here would silently overwrite the notebook's
    # global California graph
    local members = findall(==(s), A.state)
    if length(members) > 5 ## look at states with more than 5 airports only
        local state_graph = induced_subgraph(g, members)[1]
        local p = pagerank_simple(state_graph)
        push!(pr, maximum(p) - mean(p))
        push!(st, s)
    end
end

DC = DataFrame("State" => st, "Pagerank Centralization" => pr)
## order by decreasing centralization (column 2)
sort!(DC, 2, rev=true)

In [None]:
## Michigan subgraph without isolated airports, DET highlighted in red
Random.seed!(12)
v = findall(==("MI"), A.state)
G = induced_subgraph(g, v)[1]
NZ = findall(>(0), degree(G))
G = induced_subgraph(G, NZ)[1]
## after dropping the isolated airports, vertex i of G is airport v[NZ][i], so the
## colours have to be looked up through v[NZ] (using v would misalign them)
gplot(G,
      NODESIZE=0.03, nodefillc=[x == "DET" ? "red" : "black" for x in A.airport[v[NZ]]],
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
## state with one big hub city: Detroit
## (cities of the airports kept in the plotted graph, in vertex order)
A.city[v[NZ]]

In [None]:
## lowest ones
## (DC is sorted by decreasing PageRank centralization, so these are the last rows)

last(DC, 3)

In [None]:
## North Dakota subgraph without isolated airports
Random.seed!(3)
v = findall(==("ND"), A.state)
G = induced_subgraph(g, v)[1]
# keep only the airports with at least one in-state connection
NZ = findall(>(0), degree(G))
G = induced_subgraph(G, NZ)[1]
gplot(G,
      NODESIZE=0.03, nodefillc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray", arrowlengthfrac=0.05,
      linetype="curve")

In [None]:
# no big city, two components
# (cities of the airports kept in the plotted graph, in vertex order; indexing with
# v alone would also list the isolated airports that were dropped from the plot)
A.city[v[NZ]]

In [None]:
# what about California
# (show the 15 states with the lowest PageRank centralization)
last(DC, 15)