# Chapter 6 - Graph Embeddings

In this notebook, we illustrate several graph embedding algorithms, we show how we can compare embeddings using an unsupervised framework, and we look at various applications such as visualization, clustering, link prediction and classification.

### Things to install:

We use a Julia package from https://github.com/KrainskiL/CGE.jl to compare graph embeddings.
Follow the instructions from that GitHub repository to install it.

Results presented in the book were run on MacOS. Most results are identical on Linux (we use seeds), but we found that Node2Vec can yield slightly different results. This can lead to small differences in some results, but not in the conclusions. 

Set the path(s) in the cell below. 

### Windows users:

You need to change ```cp``` to ```copy``` in the ```test_embeddings``` function. 


In [None]:
## the data directory: dataset paths below are built as datadir * "<subdir>/<file>",
## so the trailing slash must be kept
datadir = "../Datasets/"

In [None]:
using Arpack
using CGE
using CSV, DataFrames
using DelimitedFiles
using Graphs
using GraphMakie, GLMakie
using LinearAlgebra
using NetworkLayout
using PyPlot
using Random
using ScikitLearn
using ScikitLearn.CrossValidation: train_test_split
using Serialization
using StatsBase
using StatsPlots
using UMAP

In [None]:
@sk_import linear_model:LogisticRegression
@sk_import ensemble:RandomForestClassifier
@sk_import cluster:(KMeans, DBSCAN)
@sk_import metrics:(accuracy_score, roc_auc_score, roc_curve, confusion_matrix)
@sk_import metrics:(calinski_harabasz_score, adjusted_mutual_info_score)
@sk_import manifold:TSNE

In [None]:
using PyCall
## install the Python packages with pip, using the interpreter PyCall is bound to
## (PyCall.python); note that this runs every time the cell is executed
run(`$(PyCall.python) -m pip install fastnode2vec numpy igraph partition_igraph scikit-network`)

In [None]:
ig = pyimport("igraph")
skn = pyimport("sknetwork")
pyimport("partition_igraph")

# A few useful functions

In [None]:
## Combine two node embeddings u and v into a single feature vector for the pair,
## as defined in Table 1 of node2vec paper for link prediction:
## https://cs.stanford.edu/~jure/pubs/node2vec-kdd16.pdf
## op selects the element-wise operator: "had" (product), "l1" (absolute difference),
## "l2" (squared difference) or "avg" (mean); any other value raises an error.
function binary_operator(u, v, op::String="had")
    op == "had" && return u .* v
    op == "l1" && return abs.(u .- v)
    op == "l2" && return (u .- v) .^ 2
    op == "avg" && return (u .+ v) ./ 2.0
    error("Unsupported operation: $op")
end

## Read an embedding from disk, in node2vec format, and return it as a matrix
## with one row per node (the node-id column is removed).
## The first line of the file (the header) is skipped; if `sort` is true the rows
## are ordered by the node id found in the first column.
function readEmbedding(fn::String="_embed", sort::Bool=true)
    table = DataFrame(CSV.File(fn; delim=' ', header=false, skipto=2))
    # keep only the columns without any missing value
    complete = [all(!ismissing, col) for col in eachcol(table)]
    table = table[:, complete]
    if sort
        sort!(table, :Column1)
    end
    return Matrix(table[:, 2:end])
end

## Read an embedding from a file in node2vec format and turn it into a layout:
## one coordinate tuple per node, ordered by node id.
## Embeddings with more than 2 columns are reduced to 2 dimensions with UMAP,
## after seeding the global RNG with `seed`.
## NOTE: `n_jobs` is accepted but not used by this function.
function embed2layout(fn::String="_embed", seed::Int=123, n_jobs::Int=1)
    table = DataFrame(CSV.File(fn; delim=' ', header=false, skipto=2))
    table = table[:, [all(!ismissing, col) for col in eachcol(table)]]
    sort!(table, :Column1)  # order rows by node id
    coords = Matrix(table[:, 2:end])

    if size(coords, 2) > 2
        Random.seed!(seed)
        # umap is given one column per node, so transpose on the way in and out
        coords = umap(coords', 2)'
    end
    return Tuple.(eachrow(Matrix(coords)))
end

## Computing Jensen-Shannon (JS) divergence with the Julia CGE framework code
## given files: edgelist, communities and embedding.
## Runs ../CGE/CGE_CLI.jl in a separate julia process and parses what it prints.
## Returns the 2nd parsed value, or the tuple (2nd, 6th) when `return_local` is true;
## callers in this notebook treat these as the (global, local) divergence scores.
## Throws if the external process exits with a non-zero status.
function JS(edge_file, comm_file, embed_file, return_local=true, seed=123)
    cmd = `julia --project ../CGE/CGE_CLI.jl -g $(edge_file) -c $(comm_file) -e $(embed_file) --seed $(string(seed))`
    ## Read stdout while the process runs and discard stderr: an unread Pipe can fill up
    ## and block the child process forever.
    result = read(pipeline(cmd; stderr=devnull), String)
    ## drop the first character and the last two before splitting on ", "
    ## (presumably the enclosing brackets and the trailing newline -- confirm against CGE_CLI.jl)
    x = parse.(Float64, split(result[2:end-2], ", "))
    return_local && return (x[2], x[6])
    return x[2]
end

## Save embedding X (one row per node) to disk to compute divergence with Julia CGE framework.
## File layout: a header line "<rows> <columns>", then one line per row made of the
## 0-based row index followed by the row values, each token followed by a single space.
## NOTE: `g` is accepted but not used by this function.
function saveEmbedding(X::AbstractMatrix, g, fn::String="_embed")
    nrows, ncols = size(X)
    open(fn, "w") do io
        println(io, nrows, " ", ncols)
        for (id, row) in enumerate(eachrow(X))
            print(io, id - 1, " ")
            foreach(val -> print(io, val, " "), row)
            print(io, "\n")
        end
    end
end


## Hope embedding with various similarity functions
##
## sim:     "katz", "aa" (neighbours weighted by 1/log(degree)), "cn" (common neighbours, A*A)
##          or "ppr" (built from the row-normalized adjacency matrix and `alpha`)
## dim:     for an undirected graph, the number of columns returned;
##          for a directed graph, dim ÷ 2 source columns followed by dim ÷ 2 target columns
## verbose: print the Frobenius norm of the low-rank approximation error
## beta:    decay parameter used by "katz";  alpha: parameter used by "ppr"
##
## The similarity matrix is S = inv(M_g) * M_l and the embedding comes from its leading
## singular triplets, each side scaled by the square root of the singular values.
## Throws ArgumentError for an unknown `sim`.

function Hope(g::AbstractGraph, sim::String="katz", dim::Int=2, verbose::Bool=false, beta::Float64=0.01, alpha::Float64=0.5)
    sim in ("katz", "aa", "cn", "ppr") ||
        throw(ArgumentError("Unsupported similarity: $sim"))

    directed = is_directed(g)
    ## number of singular triplets to keep
    k = directed ? dim ÷ 2 : dim

    A = Matrix(adjacency_matrix(g))
    n = nv(g)
    M_g = I
    M_l = I

    if sim == "katz"
        M_g = I - beta * A
        M_l = beta * A
    elseif sim == "aa"
        ## vertices of degree <= 1 get weight 0 (log(1) = 0 would divide by zero)
        D = Diagonal([x > 1 ? 1 / log(x) : 0.0 for x in degree(g)])
        M_l = A * D * A
        M_l[diagind(M_l)] .= 0.0
    elseif sim == "cn"
        M_l = A * A
    elseif sim == "ppr"
        ## row-normalized transition matrix; rows without any edge become uniform
        P = zeros(n, n)
        for i in 1:n
            s = sum(A[i, :])
            if s > 0
                P[i, :] = A[i, :] ./ s
            else
                P[i, :] .= 1 / n
            end
        end
        P = transpose(P)
        M_g = I - alpha * P
        M_l = (1 - alpha) * I
    end

    S = inv(M_g) * M_l
    ## Iterating an SVD object yields (U, S, V), not Vt: take Vt explicitly so that its
    ## first k ROWS are the leading right singular vectors.
    F = svd(S)
    u = F.U[:, 1:k]
    s = F.S[1:k]
    vt = F.Vt[1:k, :]
    sqrt_s = Diagonal(sqrt.(s))
    X1 = u * sqrt_s

    if verbose
        eig_err = norm(u * Diagonal(s) * vt - S)
        println("SVD error (low rank): $eig_err")
    end

    ## undirected graphs only use the source embedding
    directed || return X1
    return hcat(X1, vt' * sqrt_s)
end


## Laplacian eigenmaps embedding
## Builds the dense matrix I - D^(-1/2) * A * D^(-1/2) from the adjacency matrix of g,
## requests its dim + 1 smallest-magnitude eigenpairs and returns the eigenvectors
## ordered by increasing eigenvalue, without the first one (dim columns).
function LE(g, dim::Int=2)
    adj = Graphs.LinAlg.CombinatorialAdjacency(adjacency_matrix(g))
    inv_sqrt_deg = Diagonal(adj.D .^ (-1 / 2))
    laplacian = Matrix(I - inv_sqrt_deg * (adj.A) * inv_sqrt_deg)
    vals, vecs = eigs(laplacian, nev=dim + 1, which=:SM)  # Smallest magnitude eigenvalues
    order = sortperm(real(vals))  # eigs does not guarantee the order
    return vecs[:, order][:, 2:end]
end

## Returns a LaTeX bmatrix for a vector or a matrix, as a string.
## A matrix gives one line per row; a vector is written as a single row.
## Entries are separated by " & " and every row ends with the LaTeX row terminator "\\".
## Throws ArgumentError for arrays with more than two dimensions.
function bmatrix(a::AbstractArray)
    if ndims(a) > 2
        throw(ArgumentError("bmatrix can at most display two dimensions"))
    end

    ## iterate over the rows directly: string(a) prints a Julia matrix on ONE line
    ## ("[1 2; 3 4]"), so it cannot be split into rows on newlines
    rows = ndims(a) == 2 ? collect(eachrow(a)) : [vec(a)]
    rv = [raw"\begin{bmatrix}"]
    for r in rows
        ## "\\\\" is two backslashes (raw"\\" would be a single one)
        push!(rv, "  " * join(r, " & ") * "\\\\")
    end
    push!(rv, raw"\end{bmatrix}")
    return join(rv, "\n")
end

## Plot graph G without axes and background grid.
## All keyword arguments are forwarded to `graphplot`; the figure is returned.
function clean_graphplot(G::AbstractGraph; kwargs...)
    fig, axis, _ = graphplot(G; kwargs...)
    hidedecorations!(axis)
    hidespines!(axis)
    return fig
end

# Figure 6.2 in the Book

This is to illustrate random walks on (directed) graphs.


In [None]:
## small directed graph on 4 vertices, labelled A to D in the plot
g = SimpleDiGraph(4)
foreach(e -> add_edge!(g, e...), [(1, 2), (2, 3), (2, 4), (3, 2), (4, 3)])

clean_graphplot(g;
    ilabels=["A", "B", "C", "D"],
    node_size=20,
    arrow_size=25)

# Datasets

* ```abcd```: is a small ABCD graph (100 nodes), mainly for visualization and quick examples
* ```ABCD1```: is a larger ABCD graph (1000 nodes), for experiments. It is noisy with $\xi=0.6$.
* ```ABCD2```: Similar to ```ABCD1``` but less noisy with $\xi=0.2$.
* ```zac```: Zachary (karate club) graph, mainly for visualization

The small ```abcd``` graph was generated with the following parameters:

```
n = "100"                     # number of vertices in graph
t1 = "3"                      # power-law exponent for degree distribution
d_min = "5"                   # minimum degree
d_max = "15"                  # maximum degree
d_max_iter = "1000"           # maximum number of iterations for sampling degrees
t2 = "2"                      # power-law exponent for cluster size distribution
c_min = "25"                  # minimum cluster size
c_max = "50"                  # maximum cluster size
xi = "0.2"                    # fraction of edges to fall in background graph
```

The larger ```ABCD1``` and ```ABCD2``` graphs were generated with the following parameters:

```
n = "1000"                     # number of vertices in graph
t1 = "3"                       # power-law exponent for degree distribution
d_min = "10"                   # minimum degree
d_max = "100"                  # maximum degree
d_max_iter = "1000"            # maximum number of iterations for sampling degrees
t2 = "2"                       # power-law exponent for cluster size distribution
c_min = "50"                   # minimum cluster size
c_max = "150"                  # maximum cluster size
xi = "0.6" or "0.2"            # fraction of edges to fall in background graph
```


### Load the small ABCD graph and visualize

Beware: node names are 1-based and are distinct from vertex ids.

In [None]:
## read graph and communities (community labels are in the 2nd column of the comms file)
abcd_edgelist = readdlm(datadir * "ABCD/abcd_100.dat", Int)
c_abcd = readdlm(datadir * "ABCD/abcd_100_comms.dat", Int)[:, 2]
n = length(c_abcd)

## one vertex per community label, one edge per row of the edge list
abcd = SimpleGraph(n)
foreach(row -> add_edge!(abcd, row...), eachrow(abcd_edgelist))

## print a few stats
println("$(nv(abcd)) vertices, $(ne(abcd)) edges, mean degree: $(mean(degree(abcd))), no. of communities: $(maximum(c_abcd))")

## define the colors and node sizes here; the community label picks the node colour
clean_graphplot(abcd;
    node_color=[:white, :gray, :black][c_abcd],
    node_size=15,
    node_strokewidth=1,
    edge_color=:lightgray)


### Load the larger ABCD graphs and visualize

```ABCD1```: this is a larger graph with lots of noise edges ($\xi$=0.6). Node colours refer to the communities.
With this amount of noise, the communities are far from obvious on a 2-dimensional layout.

```ABCD2```: the second graph has stronger communities ($\xi$=0.2).


In [None]:
## read graph and communities (community labels are in the 2nd column of the comms file)
abcd_edgelist = readdlm(datadir * "ABCD/abcd_1000.dat", Int)
c_abcd1 = readdlm(datadir * "ABCD/abcd_1000_comms.dat", Int)[:, 2]
n = length(c_abcd1)

## one vertex per community label, one edge per row of the edge list
ABCD1 = SimpleGraph(n)
foreach(row -> add_edge!(ABCD1, row...), eachrow(abcd_edgelist))

## print a few stats
println("$(nv(ABCD1)) vertices, $(ne(ABCD1)) edges, mean degree: $(mean(degree(ABCD1))), no. of communities: $(maximum(c_abcd1))")

## define the colors and node sizes here; community labels are mapped through the colormap
clean_graphplot(ABCD1;
    node_color=c_abcd1,
    node_attr=(colormap=:lightrainbow,),
    node_size=15,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## read graph and communities (community labels are in the 2nd column of the comms file)
abcd_edgelist = readdlm(datadir * "ABCD/abcd_1000_xi2.dat", Int)
c_abcd2 = readdlm(datadir * "ABCD/abcd_1000_xi2_comms.dat", Int)[:, 2]
n = length(c_abcd2)

## one vertex per community label, one edge per row of the edge list
ABCD2 = SimpleGraph(n)
foreach(row -> add_edge!(ABCD2, row...), eachrow(abcd_edgelist))

## print a few stats
println("$(nv(ABCD2)) vertices, $(ne(ABCD2)) edges, mean degree: $(mean(degree(ABCD2))), no. of communities: $(maximum(c_abcd2))")

## define the colors and node sizes here; community labels are mapped through the colormap
clean_graphplot(ABCD2;
    node_color=c_abcd2,
    node_attr=(colormap=:lightrainbow,),
    node_size=15,
    node_strokewidth=1,
    edge_color=:lightgray)

## Graph layouts 

We show a variety of graph layout functions available in `GraphMakie` on the Zachary graph.


### Zachary (karate club) graph


In [None]:
## read the Zachary edge list and communities; both are shifted by one after reading
## (presumably 0-based on disk -- Julia vertices and array indices are 1-based)
zac_edgelist = readdlm(datadir * "Zachary/zachary.edgelist", Int) .+ 1
c = vec(readdlm(datadir * "Zachary/zachary.communities", Int)) .+ 1
n = length(c)
zac = SimpleGraph(n)
foreach(row -> add_edge!(zac, row...), eachrow(zac_edgelist))
zac

In [None]:
## Spring (repulsion) layout; node colour is picked by community label c
clean_graphplot(zac;
    layout=Spring(),
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## Scalable Force Directed Placement layout; node colour is picked by community label c
clean_graphplot(zac;
    layout=SFDP(),
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## Stress Majorization layout; node colour is picked by community label c
clean_graphplot(zac;
    layout=Stress(),
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## Shell (circular) layout; node colour is picked by community label c
clean_graphplot(zac;
    layout=Shell(),
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## Grid layout; node colour is picked by community label c
clean_graphplot(zac;
    layout=SquareGrid(),
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## Spectral layout (2 dimensions); node colour is picked by community label c
clean_graphplot(zac;
    layout=Spectral(dim=2),
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

# Generate and compare several embeddings -- Zachary graph

We run a few graph embedding algorithms on the Zachary graph with
different parameters. 

For example, we try different embedding dimensions.

We run the following:
* node2vec with different values for $p$ and $q$
* HOPE with different similarities
* Laplacian Eigenmaps (LE)

For each embedding, we use the communities obtained with **ECG** along with the **CGE** framework to compute the **graph embedding divergence** with the **CGE** Julia package. We visualize some good and bad results w.r.t. the global divergence score.

For embeddings with low divergence, we see good separation of the communities (even in 2-dim projection, using **UMAP**), while this is not the case for embeddings with high divergence.

Since we are going to compare embeddings for several graphs, we write the procedure as a function below. This function keeps a local copy of the best (`_embed_best`) and worst (`_embed_worst`) embeddings on disk, and returns the JS divergence (including local) for every test.

#### Windows users:

change ```cp``` to ```copy``` below


In [None]:
#### compare several embedding methods

## Score the embedding currently stored in "_embed" with JS, and keep a copy of it as
## "_embed_best" / "_embed_worst" when it improves on the best / worst score seen so far.
## Returns (scores, jsd, best_jsd, worst_jsd), where `scores` is the pair returned by JS
## and `jsd` is the entry selected by `local_flag`.
function _score_embedding(edgefile, commfile, seed, local_flag, best_jsd, worst_jsd)
    scores = JS(edgefile, commfile, "_embed", true, seed)
    jsd = local_flag ? scores[2] : scores[1]
    ## Base `cp` is used rather than a shell `cp` command, so this works unchanged
    ## on every platform (Windows included); force=true overwrites an earlier copy.
    if jsd < best_jsd
        cp("_embed", "_embed_best"; force=true)
        best_jsd = jsd
    end
    if jsd > worst_jsd
        cp("_embed", "_embed_worst"; force=true)
        worst_jsd = jsd
    end
    return scores, jsd, best_jsd, worst_jsd
end

## Run HOPE, Laplacian eigenmaps and node2vec on graph G for every dimension in `Dims`
## and score each embedding with JS.
##
## G:          the graph (used by Hope and LE)
## edgefile:   edge list file, passed to JS and to ./n2v_to_file.py
## commfile:   communities file, passed to JS
## run_hope, run_le, run_n2v: switch each family of embeddings on or off
## local_flag: rank embeddings by the second score returned by JS instead of the first
## verbose:    print one line per embedding
## seed:       passed to ./n2v_to_file.py and to JS
##
## Side effects: writes "_embed" for every test, and "_embed_best" / "_embed_worst" for
## the lowest / highest score (only scores below 1.0 / above 0.0 are recorded).
## Returns a DataFrame with columns dim, algo, param, jsd, local_jsd, sorted by
## increasing jsd (or local_jsd when local_flag is true); it is empty if nothing was run.
function test_embeddings(
    G,
    edgefile::String,
    commfile::String;
    run_hope::Bool=true,
    run_le::Bool=true,
    run_n2v::Bool=true,
    Dims::Vector{Int}=[2, 4, 8],
    local_flag::Bool=false,
    verbose::Bool=true,
    seed::Int=123
)
    results = DataFrame(dim=Int[], algo=String[], param=String[],
        jsd=Float64[], local_jsd=Float64[])
    best_jsd = 1.0
    worst_jsd = 0.0

    if run_hope
        for dim in Dims, sim in ["katz", "ppr", "cn", "aa"]
            saveEmbedding(Hope(G, sim, dim), G)
            scores, jsd, best_jsd, worst_jsd =
                _score_embedding(edgefile, commfile, seed, local_flag, best_jsd, worst_jsd)
            verbose && println("HOPE: dim=$dim sim=$sim jsd=$jsd")
            push!(results, (dim, "hope", sim, scores[1], scores[2]))
        end
    end

    if run_le
        for dim in Dims
            saveEmbedding(LE(G, dim), G)
            scores, jsd, best_jsd, worst_jsd =
                _score_embedding(edgefile, commfile, seed, local_flag, best_jsd, worst_jsd)
            verbose && println("LE: dim=$dim jsd=$jsd")
            push!(results, (dim, "le", " ", scores[1], scores[2]))
        end
    end

    if run_n2v
        for dim in Dims, (p, q) in [(1, 0.5), (0.5, 1), (1, 1)]
            ## the script is expected to write its embedding to "_embed" -- confirm in
            ## n2v_to_file.py; its stderr is discarded
            cmd = `$(PyCall.python) ./n2v_to_file.py $(edgefile) $(dim) $(p) $(q) $(seed)`
            run(pipeline(cmd, stderr=devnull))
            scores, jsd, best_jsd, worst_jsd =
                _score_embedding(edgefile, commfile, seed, local_flag, best_jsd, worst_jsd)
            verbose && println("n2v: dim=$dim p=$p q=$q jsd=$jsd")
            push!(results, (dim, "n2v", "$p $q", scores[1], scores[2]))
        end
    end

    sort!(results, local_flag ? :local_jsd : :jsd)
    return results
end


The code below takes about one minute to run as several embeddings are tested.


In [None]:
## compare embeddings of the Zachary graph in dimensions 2 and 4;
## D is sorted by increasing global divergence, so these are the 5 best rows
D = test_embeddings(zac,
    string(datadir, "Zachary/zachary.edgelist"),
    string(datadir, "Zachary/zachary.ecg");
    Dims=[2, 4])
first(D, 5)

In [None]:
## plot top results: lay the graph out with the lowest-divergence embedding kept on disk
l = embed2layout("_embed_best", 123)
clean_graphplot(zac;
    layout=l,
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## plot result with largest divergence (the "_embed_worst" file kept on disk)
l = embed2layout("_embed_worst", 123)
clean_graphplot(zac;
    layout=l,
    figure=(size=(300, 300),),
    node_color=[:gray, :black][c],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

# Compare embeddings - small ABCD  graph

This is the same exercise as what we did above, this time for the 100-nodes ABCD graph.

We look at slightly higher embedding dimension as there are more nodes than the Zachary graph.


### effect of local divergence score

So far we considered the global Jensen-Shannon divergence, where the objective is to preserve the community structure.

We show the best result with respect to the global divergence below, and we see that it preserves the community structure. We may want better separation of the nodes within each community, based on their connectivity. This is what the local Jensen-Shannon divergence can provide. 

Below we also show an embedding with lower local divergence. The result is an embedding that still preserves community structure, but nodes within community are more separated than with the global divergence.

The code below takes about one minute to run as several embeddings are tested.


In [None]:
## compare embeddings of the small ABCD graph in dimensions 2 and 16;
## D is sorted by increasing global divergence, so these are the 5 best rows
D = test_embeddings(abcd,
    string(datadir, "ABCD/abcd_100.dat"),
    string(datadir, "ABCD/abcd_100.ecg");
    Dims=[2, 16])
first(D, 5)

In [None]:
## log-log scatter plot (base 2) of global vs local divergence, one point per row of D;
## each score is divided by the smallest value in its column, so the best one is at 1
PyPlot.loglog(D.jsd ./ minimum(D.jsd), D.local_jsd ./ minimum(D.local_jsd), "o", color="black", base=2)
xlabel("Global divergence score (normalized)", fontsize=14)
ylabel("Local divergence score (normalized)", fontsize=14);

In [None]:
## plot result with lowest global JS divergence (the "_embed_best" file kept on disk)
l = embed2layout("_embed_best", 123)
clean_graphplot(abcd;
    layout=l,
    figure=(size=(300, 300),),
    node_color=[:white, :lightgray, :black][c_abcd],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## look at results with low local divergence: the 5 rows with the smallest local_jsd
## (sort returns a copy, D itself keeps its order)
first(sort(D, :local_jsd), 5)

In [None]:
## Plot one of the top ones: recompute HOPE ("ppr", dimension 16), write it to "_embed"
## and use it as the layout
X = Hope(abcd, "ppr", 16)
saveEmbedding(X, abcd)
l = embed2layout("_embed", 123)
clean_graphplot(abcd;
    layout=l,
    figure=(size=(300, 300),),
    node_color=[:white, :lightgray, :black][c_abcd],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

In [None]:
## D is sorted by increasing global divergence: these are the 5 rows with the highest values
last(D, 5)

In [None]:
## Plot one of the bottom ones: recompute HOPE ("katz", dimension 2), write it to "_embed"
## and use it as the layout
X = Hope(abcd, "katz", 2)
saveEmbedding(X, abcd)
l = embed2layout("_embed", 123)
clean_graphplot(abcd;
    layout=l,
    figure=(size=(300, 300),),
    node_color=[:white, :lightgray, :black][c_abcd],
    node_size=10,
    node_strokewidth=1,
    edge_color=:lightgray)

# Classification on the larger ABCD graph

We saw that embedding can be used to visualize graphs. Below we use graph embedding as a way to define a feature vector (a point in vector space) for each node, and we use this representation to train a classifier.

We use the ```ABCD1``` (noisy) graph.

We use a saved embedding (48-dimension running HOPE with "ppr" similarity).

We split the data (the nodes) into a training and testing set. Using the training set, we build a **random forest** classification model where the classes are the communities for each node.

We then apply this model to the test set.

The graph has 1000 nodes; we use 250 nodes for training and the rest for testing; we obtain good accuracy (around 90%).

What do you think will happen if we increase/decrease the size of the training set?

We also report the confusion matrix (details in section 6.7 of the book).

Finally, we compare with results obtained via a baseline **random** classifier where we supply the correct number of classes and their relative sizes. We see that our random forest model gives much better results than with a random classifier.


In [None]:
## load a saved embedding for the ABCD graph (rows ordered by node id);
## the class of each node is its community
y = c_abcd1
X = readEmbedding(datadir * "ABCD/abcd_1000_embed_best")

## train/test split: 75% of the nodes are held out for testing
Random.seed!(1)
X_train, X_test, y_train, y_test = train_test_split(X, y; random_state=0, test_size=0.75);

In [None]:
# Create the model with 100 trees and fit it on the training data
# (fit! returns the fitted estimator)
model = ScikitLearn.fit!(
    RandomForestClassifier(; n_estimators=100, bootstrap=true, max_features="sqrt"),
    X_train, y_train)

# Class predictions on test data
y_pred = ScikitLearn.predict(model, X_test);

In [None]:
## Confusion matrix of the true test labels against the predicted ones
cm = confusion_matrix(y_test, y_pred)

In [None]:
## percent correct -- this can vary slightly as we split train/test randomly
## (the diagonal of the confusion matrix counts the correctly classified nodes)
println("accuracy:", tr(cm) / sum(cm))

In [None]:
## compare with random classifier
## assuming we know the number of classes (12)
## and using class proportions from training data
ctr = countmap(y_train)
## a class absent from the training split counts as 0 instead of raising a KeyError
x = [get(ctr, i, 0) for i in 1:12]
s = sum(x)
p = [i / s for i in x]
acc = Float64[]
for rep in 1:30 ## repeat 30 times, we'll take average
    ## class labels are 1..12, so the sampled values are used as they are:
    ## label i is drawn with the training proportion of class i
    y_pred = StatsBase.sample(1:12, Weights(p), length(y_test))
    cm = confusion_matrix(y_test, y_pred)
    push!(acc, sum(diag(cm)) / sum(cm))
end
## accuracy
println("Average accuracy:", mean(acc))

# Clustering in embedded space

Again using the larger (noisy) ```ABCD1``` graph, we run some graph clustering algorithms (Leiden and ECG).

We run each algorithm several times and report two statistics:
* the modularity score of the clustering, and
* the adjusted mutual information (AMI) score when comparing with ground-truth (GT) communities.

We also try seeding Leiden with initial clusters obtained with k-means (in embedded space) where k=100.

We also run k-means (with 5 choices for k, including correct value) in embedded vector space.

We use the same saved embedding as in the previous experiment. 

This time, we report:
* the CHS score (Calinski and Harabasz score, or Variance Ratio Criterion); higher value is indicative of better quality clustering
* the adjusted mutual information (AMI) score when comparing with ground-truth (GT) communities.

In practical applications where we do not have access to the ground-truth, we need some other measure to quantify the quality of the clusters we obtain, such as modularity or CHS. We report AMI for runs with highest score (w.r.t. modularity or CHS) for each clustering algorithm.

The cell below can take a few minutes to run. Results are provided in a .ser file. Uncomment the cell below to re-run.


In [None]:
## rebuild the ABCD1 graph as a Python igraph object, with the community of each
## vertex stored in the "comm" vertex attribute
abcd_edgelist = readdlm(datadir * "ABCD/abcd_1000.dat", Int)
c_abcd1 = readdlm(datadir * "ABCD/abcd_1000_comms.dat", Int)[:, 2]
n = length(c_abcd1)
ig_ABCD1 = ig.Graph()
ig_ABCD1.add_vertices(n)
## shift the node ids in the edge list by one before passing them to igraph
ig_ABCD1.add_edges([(row[1] - 1, row[2] - 1) for row in eachrow(abcd_edgelist)])
set!(ig_ABCD1.vs, "comm", c_abcd1);

In [None]:
## load the saved clustering test results
## (the cells below read the columns algo, scr and ami of D)
D = deserialize(datadir * "ABCD/abcd_1000_clustering_jl.ser");

In [None]:
## AMI results with best scoring clustering for each algorithm:
## keep the runs of one algorithm, sort them by score `scr` (highest first)
## and report the AMI of the top run
## NOTE(review): the "km" prefix test also matches the "km+lei" runs -- confirm this is intended
x = sort(filter(row -> startswith(row.algo, "km"), D), :scr; rev=true).ami[1]
println("K-Means best, AMI:", x)

x = sort(filter(row -> row.algo == "lei", D), :scr; rev=true).ami[1]
println("Leiden best, AMI:", x)

x = sort(filter(row -> row.algo == "ecg", D), :scr; rev=true).ami[1]
println("ECG best, AMI:", x)

x = sort(filter(row -> row.algo == "km+lei", D), :scr; rev=true).ami[1]
println("K-Means+Leiden best, AMI:", x)


Next, we summarize the results for all runs in a boxplot. 

Results with k-means are best when we supply the correct number of clusters (12). 

We see excellent results when using ECG or Leiden, in particular with the initial partition provided by k-means with large k=100.


In [None]:
## boxplot the AMI results
## algos: values of the `algo` column of D; colnames: the matching labels on the plot
algos = ["km6", "km9", "km12", "km15", "km24", "lei", "km+lei", "ecg"]
colnames = ["k-means(6)", "k-means(9)", "k-means(12)", "k-means(15)",
    "k-means(24)", "Leiden", "k-Leiden", "ECG"]

## one vector of AMI values per algorithm, assembled as the columns of B
## (this assumes every algorithm has the same number of runs -- TODO confirm)
A = [D[D.algo.==a, :ami] for a in algos]
B = DataFrame(A, colnames)
PyPlot.boxplot(Matrix(B)', labels=names(B))
PyPlot.xticks(rotation=30)
ylabel("Adjusted Mutual Information (AMI)");

Below we cluster using the DBSCAN algorithm after reducing the dimension via UMAP.
Running a good dimension reduction algorithm such as UMAP before clustering in vector space often gives better results. This is for illustration and you can experiment with different choices of parameter below.

DBSCAN does not always cluster all the points, which can be quite useful in practice. Some points can be tagged as *outliers*. Below, we compute AMI with and without the outlying points. 
Result without outliers is quite good (recall that unlike k-means, we do not supply the number of communities here).


In [None]:
## load the saved low divergence embedding
ABCD1_emb = readEmbedding(datadir * "ABCD/abcd_1000_embed_best", true)

## We tried a few "min_sample" and "dim" with good results using 8 and 16 resp.
top = 0    ## best CHS score found so far
e_top = 0  ## the "eps" value that produced it
dim = 16  ## reduce to this dimension
ms = 8    ## min-sample in DBSCAN
## umap is given one column per node, hence the transposes
## NOTE(review): no seed is set before this umap call, unlike in embed2layout
U = Matrix(umap(ABCD1_emb', dim)')
## We try various "eps" and pick the best via calinski_harabasz_score (CHS)
for e in 0.4:0.0025:0.5
    cl = ScikitLearn.fit!(DBSCAN(eps=e, min_samples=ms), U)
    labels = cl.labels_
    s = calinski_harabasz_score(U, labels) ## CHS score
    if s > top
        top = s
        e_top = e
    end
end
## result with best CHS score
cl = ScikitLearn.fit!(DBSCAN(eps=e_top, min_samples=ms), U)
## b[i] is true when point i got a label above -1 (treated below as "not an outlier")
b = [x > -1 for x in cl.labels_]
## ground-truth communities, read back from the "comm" vertex attribute
l = get(ig_ABCD1.vs, "comm")
v = [l[i] for i in 1:length(l) if b[i]]
println("AMI without outliers:", adjusted_mutual_info_score(v, cl.labels_[b]))
println("AMI with outliers:", adjusted_mutual_info_score(l, cl.labels_))

# Link prediction

Given a graph, link prediction aims at finding pairs of nodes not linked by an edge that are the most likely to actually have an edge between them. This could happen if we have a partial view of a graph. For example if edges 
are observed over some period of time, which new edges are we most likely to observe next?

In order to simulate this situation, we take the ```ABCD1``` graph with 1,000 nodes and drop 10% of the edges.
We re-compute the embedding (since the graph has changed), train a logistic regression model using pairs
of nodes with and without an edge, and apply the model to a test set consisting of the dropped edges, and other 
pairs of nodes not linked by an edge.

Given node embeddings $e(u)$ and $e(v)$, a representation for the node pair $(u,v)$ is obtained via some binary operator $B(e(u),e(v))$. We defined the same 4 operators as in https://arxiv.org/pdf/1607.00653.pdf at the beginning of this notebook, and will use the **Hadamard** operator in the experiments below.

We build the training data by considering all edges in the reduced graph, and an equal number of node pairs without an edge. From this data, we build a logistic regression model to predict edges vs non-edges. We then apply the model to the test set which includes the dropped edges, and the same number of non-edges.

First we tried with the ```ABCD1``` graph with noise parameter $\xi=0.6$.
Given the large number of "noise" edges, results are not very good, as expected.

We do another set of tests, this time with the ```ABCD2``` graph with lower noise parameter $\xi=0.2$, with better results.

Since we are going to run this experiment for several graphs, we write the procedure as a function below. 


In [None]:
## Link prediction experiment.
## G: the full graph
## test_eid: the list of edges in the test set (dropped edges), given as positions
##           in collect(edges(G))
## Emb: the embedding, one row per node of G
## op: binary operator used to combine two node vectors; default is Hadamard ("had"),
##     other choices are "l1", "l2" and "avg"
## return_plot: if true, also draw the ROC curve
## verbose: if true, print accuracy and AUC and return nothing;
##          if false, return the tuple (accuracy, AUC)

## helper: draw `n` distinct non-adjacent node pairs (u < v) of `g` by rejection sampling
function _sample_non_edges(g::SimpleGraph, n::Int, seed::Int)
    n_nodes = nv(g)
    ## number of unordered pairs of distinct nodes that are not joined by an edge;
    ## without this guard the loop below would never end on a graph that is too dense
    n_loops = count(v -> has_edge(g, v, v), vertices(g))
    n_free = n_nodes * (n_nodes - 1) ÷ 2 - (ne(g) - n_loops)
    n <= n_free ||
        throw(ArgumentError("cannot sample $n distinct non-edges, only $n_free exist"))
    Random.seed!(seed)
    picked = Set{Tuple{Int,Int}}()
    while length(picked) < n
        u, v = rand(1:n_nodes, 2)
        u == v && continue
        u, v = min(u, v), max(u, v)
        has_edge(g, u, v) && continue
        push!(picked, (u, v))
    end
    return picked
end

## helper: stack a list of feature vectors as the rows of a dense matrix
_stack_rows(vs) = Matrix(reduce(hcat, vs)')

function link_pred_exp(G::SimpleGraph, test_eid::Vector{Int}, Emb::Matrix{Float64};
    seed::Int=123, op::String="had",
    return_plot::Bool=false,
    verbose::Bool=true)
    size(Emb, 1) >= nv(G) ||
        throw(DimensionMismatch("embedding has $(size(Emb, 1)) rows, graph has $(nv(G)) nodes"))
    isempty(test_eid) && throw(ArgumentError("test_eid must not be empty"))

    ## make a copy of the graph and drop the test edges
    g_e = collect(edges(G))
    test_edges = g_e[test_eid]
    Gp = deepcopy(G)
    for e in test_edges
        rem_edge!(Gp, src(e), dst(e))
    end
    X = collect.(eachrow(Emb))

    ## Build training data, first the edges of the reduced graph
    F = [binary_operator(X[src(e)], X[dst(e)], op) for e in edges(Gp)]
    f = ones(Int, length(F))

    ## then an equal number of non-edges of the reduced graph
    n = length(F)
    non_edges = _sample_non_edges(Gp, n, seed)
    append!(F, [binary_operator(X[u], X[v], op) for (u, v) in non_edges])
    append!(f, zeros(Int, n))

    ## train the model, here a logistic regression
    logreg = LogisticRegression(random_state=seed)
    ScikitLearn.fit!(logreg, _stack_rows(F), f)

    ## prepare test set, first with all dropped edges from G
    X_test = [binary_operator(X[src(e)], X[dst(e)], op) for e in test_edges]
    y_test = ones(Int, length(X_test))

    ## then an equal number of non-edges of the full graph
    ## NOTE(review): these are drawn from the same seeded stream as the training
    ## non-edges, so the two sets of negative pairs can overlap; kept unchanged so that
    ## results stay comparable with the stored ones - consider a distinct seed here.
    n_test = length(X_test)
    non_edges_test = _sample_non_edges(G, n_test, seed)
    append!(X_test, [binary_operator(X[u], X[v], op) for (u, v) in non_edges_test])
    append!(y_test, zeros(Int, n_test))

    ## apply the model to test data; class probabilities are computed once and reused
    X_test_mat = _stack_rows(X_test)
    _acc = ScikitLearn.score(logreg, X_test_mat, y_test)
    probs = predict_proba(logreg, X_test_mat)[:, 2]
    _auc = roc_auc_score(y_test, probs)
    if verbose
        println("Accuracy of logistic regression classifier with $op on test set: $(round(_acc, digits=2))")
        println("AUC: ", round(_auc, digits=4))
    end
    if return_plot
        fpr, tpr, _ = roc_curve(y_test, probs)
        PyPlot.figure()
        PyPlot.plot(fpr, tpr, color="gray", label="Logistic Regression (AUC = $(round(_auc,digits=2)))")
        PyPlot.plot([0, 1], [0, 1], "k--")
        PyPlot.xlim([0.0, 1.0])
        PyPlot.ylim([0.0, 1.05])
        PyPlot.xlabel("False Positive Rate", fontsize=14)
        PyPlot.ylabel("True Positive Rate", fontsize=14)
        PyPlot.title("")
        PyPlot.legend(loc="lower right")
    end
    ## scores are only returned in quiet mode, as before
    return verbose ? nothing : (_acc, _auc)
end

### Link prediction with noisy ABCD graph $\xi = 0.6$

The results are better than random, but not great; recall that $\xi$=0.6, so the majority of edges are noise to start with, which makes link prediction very hard in this case. We try with a less noisy graph next.


In [None]:
## hold out a random 10% of the ABCD1 edges as the test set;
## Gp is the graph that remains once those edges are removed
Random.seed!(123)
test_size = round(Int, 0.1 * ne(ABCD1))
test_eid = StatsBase.sample(1:ne(ABCD1), test_size, replace=false)
Gp = deepcopy(ABCD1)
gp_e = collect(edges(Gp))
foreach(e -> rem_edge!(Gp, e), gp_e[test_eid])

## embed the reduced graph (a low divergence embedding, from separate tests)
X = LE(Gp, 8)

## link prediction experiment, with the ROC curve
link_pred_exp(ABCD1, test_eid, X, return_plot=true)


### Link prediction with less noisy ABCD graph $\xi = 0.2$


We test all embeddings on the modified ```ABCD2``` graph, after removing the test set edges.

We also perform the link prediction experiment for several different embeddings, and store the **accuracy** and **AUC** for each run.

Again the cell below can take a few minutes - uncomment to run.

We saved the results in a .ser file.

In [None]:
## load the stored link prediction results (Julia serialization format)
linkpred_path = datadir * "ABCD/abcd_1000_xi2_linkpred_jl.ser"
D = open(deserialize, linkpred_path);

In [None]:
## accuracy values vary from about 60% to 83% - show the five best rows
D_by_acc = sort(D, :acc, rev=true)
first(D_by_acc, 5)

In [None]:
## Kendall-tau correlation: accuracy vs global/local divergence scores
for (label, scores) in (("global divergence:", D.jsd), ("local divergence:", D.local_jsd))
    println(label, corkendall(Float64.(scores), D.acc))
end

In [None]:
no_of_balls = size(D, 1)

## normalize the scores (divide by the min values)
x = D.jsd ./ minimum(D.jsd)
y = D.local_jsd ./ minimum(D.local_jsd)

## plot results as "balls" with area (in points^2) growing with the accuracy
ball_area(a) = (25 * a - 10)^2
acc = D.acc
areas = ball_area.(acc)
PyPlot.figure()
PyPlot.scatter(x, y, s=areas, alpha=0.85, color="dimgrey")
PyPlot.xlabel("Global divergence score (normalized)", fontsize=13)
PyPlot.ylabel("Local divergence score (normalized)", fontsize=13)

## legend entries for 60% and 83% accuracy (roughly the observed range); they are
## empty scatter series sized with the same formula as the balls, so that the legend
## markers and their labels always agree with the plot
for a in (0.60, 0.83)
    PyPlot.scatter(Float64[], Float64[], s=ball_area(a), alpha=0.85, color="dimgrey",
        label="$(round(Int, 100 * a))%")
end
PyPlot.legend(scatterpoints=1, loc=4, title="test set accuracy");

In [None]:
## redo experiment for best case and draw the ROC curve

## pick 10% of the ABCD2 edges at random for the test set (ids are positions in
## collect(edges(ABCD2))) and build the reduced graph Gp without them
Random.seed!(123)
test_size = round(Int, 0.1 * ne(ABCD2))
test_eid = StatsBase.sample(1:ne(ABCD2), test_size, replace=false)
Gp = deepcopy(ABCD2)
for e in collect(edges(ABCD2))[test_eid]
    rem_edge!(Gp, e)
end

## low divergence embedding from tests in previous cells, trained on the reduced
## graph so that the held-out edges are not seen by the embedding
## (node ids are shifted by one before being written for the Python script)
## NOTE(review): presumably n2v_to_file.py writes one row per node id; confirm that
## a node left without edges in Gp still gets a row in the embedding.
Edges = Matrix(hcat([[e.src - 1, e.dst - 1] for e in edges(Gp)]...)')
writedlm("_edges.dat", Edges)
cmd = `$(PyCall.python) ./n2v_to_file.py _edges.dat 8 1.0 0.5 123`
## discard stderr; an unread Pipe() can fill up and block the child process
run(pipeline(cmd, stderr=devnull))
X = readEmbedding("_embed", true)
link_pred_exp(ABCD2, test_eid, X, return_plot=true)


In [None]:
## now try with high local divergence embedding from test in previous cell
## the embedding is computed on ABCD2 without the held-out edges (the same edges that
## link_pred_exp drops), so that it cannot see the test set
Gp_hope = deepcopy(ABCD2)
for e in collect(edges(ABCD2))[test_eid]
    rem_edge!(Gp_hope, e)
end
X = Hope(Gp_hope, "cn", 2)
link_pred_exp(ABCD2, test_eid, X, return_plot=true)


## Supervised learning - using classification accuracy to compare embeddings

We saw earlier an **unsupervised** method for selecting good graph embeddings where we computed some divergence score. 

In the **supervised** case, it is usually better to take advantage of the known labels to compare embeddings.

With this experiment, we do the following using the 1,000 nodes ```ABCD1``` graph. Recall that in this case, the class is the ground-truth community for each node. 

* we partition the nodes into training, validation and test sets in proportion 25%/25%/50%
* we generate 40 different embeddings (3 algorithms, different parameters)
* from each embedding, 
  * we compute the JS divergences (unsupervised score)
  * we use the training data to build a classification model (random forest)
  * we apply this model to the validation set 
  * we compute the accuracy score (supervised score) 

The code to do this is commented out in the cell below as this can take several minutes to run. 
Serialized results are included in the data directory and can be read directly.

In [None]:
## read the stored node splits and the list of per-embedding results
id_train, id_val, id_trainval, id_test, L = deserialize(datadir * "ABCD/abcd_1000_embeddings_jl.ser")

## ground-truth labels restricted to each split
y_all = c_abcd1
labels_at(ids) = [y_all[i] for i in ids]
y_train = labels_at(id_train)
y_val = labels_at(id_val)
y_trainval = labels_at(id_trainval) ## training+validation sets
y_test = labels_at(id_test);

Below we compute the rank-based Kendall-tau correlation between the divergence score (unsupervised) and the accuracy score (supervised). We see a negative correlation, which is to be expected since low divergence and high accuracy are respectively better.

In [None]:
## one row per embedding; then rank correlation between divergence and accuracy
col_names = ["dim", "algo", "param", "div_glb", "div_loc", "val_acc"]
R = DataFrame(permutedims(hcat(L...)), col_names)
val_acc = Float64.(R.val_acc)
println("global divergence:", corkendall(Float64.(R.div_glb), val_acc))
println("local divergence:", corkendall(Float64.(R.div_loc), val_acc))
first(R, 5)

In the next 2 cells, we show the top results on the validation set respectively for the divergence and accuracy scores. We also add two columns with the respective ranks.


In [None]:
## rank the embeddings by each JS-divergence score on the validation set,
## then order the table by the average of the two ranks
df_size = nrow(R)
for (score, rank) in ((:div_glb, :rank_div_glb), (:div_loc, :rank_div_loc))
    sort!(R, score)
    R[!, rank] = collect(1:df_size)
end
R.rank_div = (R.rank_div_glb .+ R.rank_div_loc) ./ 2
sort!(R, :rank_div)
first(R, 5)

In [None]:
## order by decreasing accuracy on the validation set and record the rank
sort!(R, :val_acc, rev=true)
R.rank_val_acc = collect(1:nrow(R))
first(R, 5)

Below we show the lowest accuracy results. We see that there is quite a range of accuracy on the validation set!


In [None]:
## the five rows at the bottom of the table
R[max(1, nrow(R) - 4):nrow(R), :]

###  Apply the models to the test set. 

In the previous cells, we built a table ranking the different algorithms w.r.t. accuracy and divergence using the training and validation sets. Here, we go through the same algorithms in (decreasing) order of accuracy, re-train each model using the training and validation sets, and apply it to the test set.

Serialized results are provided.

Uncomment the cell below to re-run.


In [None]:
## read the stored test set accuracies
test_results_path = datadir * "ABCD/abcd_1000_embeddings_test_jl.ser"
top_acc = open(deserialize, test_results_path);

In [None]:
## attach the test set accuracies to the table (row order of R is used as is)
R[!, :test_acc] = top_acc
println("mean accuracy over all models on the test set: ", mean(R.test_acc))

## order by decreasing test set accuracy and record the rank
sort!(R, :test_acc, rev=true)
R.rank_test_acc = collect(1:nrow(R))
first(R, 5)

In [None]:
## the five rows at the bottom of the table
R[max(1, nrow(R) - 4):nrow(R), :]

Next, we take the top-10 algorithms w.r.t. divergence on the validation set, and the top-10 algorithms w.r.t. accuracy on the validation set. 

We then plot the distribution of results (accuracy) over the test set via box-plots.

As expected, using accuracy (supervised score) yields better results, but the results obtained with the (unsupervised) global divergence score are also quite good.


In [None]:
## top results on test set w.r.t. divergence on validation set
R = sort(R, :rank_div)
top_div = first(R.test_acc, 10)

## top results on test set w.r.t. accuracy on validation set
## (stored under its own name: `top_acc` still holds the full vector of test
## accuracies loaded earlier, so the cell that assigns it to R can be re-run)
R = sort(R, :val_acc, rev=true)
top_val = first(R.test_acc, 10)

## one column per selection rule, drawn as side-by-side box plots
B = DataFrame(hcat(top_val, top_div),
    ["Top-10 validation set accuracy", "Top-10 divergence score"])
PyPlot.boxplot(Matrix(B), labels=names(B))
PyPlot.ylim((0, 1))
PyPlot.ylabel("Test set accuracy", fontsize=14);


Another way to compare the results is to plot the accuracy results on the test set as a function of the rank of the algorithms w.r.t. the accuracy score on the validation set (next cell) or the divergence score on the validation set (second next cell).

The correlation is very clear in the first case, and is still quite strong in the second case.

In [None]:
## test set accuracy against the rank w.r.t. validation set accuracy
PyPlot.plot(R.rank_val_acc, R.test_acc, ".", color="black")
PyPlot.xlabel("Rank (validation set accuracy)", fontsize=14)
PyPlot.ylabel("Test set accuracy", fontsize=14);
println("correlation:", cor(R.rank_val_acc, R.test_acc))

In [None]:
## test set accuracy against the rank w.r.t. the divergence scores
div_rank = R.rank_div
test_acc_vals = R.test_acc
PyPlot.plot(div_rank, test_acc_vals, ".", color="black")
PyPlot.xlabel("Rank (divergence score)", fontsize=14)
PyPlot.ylabel("Test set accuracy", fontsize=14);
println("correlation:", cor(div_rank, test_acc_vals))

Finally, we compare with accuracy obtained with a random classifier, averaging over several runs.

In [None]:
## random classification: predict each test label at random, with class
## probabilities equal to the class proportions in the training+validation set
Random.seed!(123)
ctr = countmap(y_trainval)
classes = sort(collect(keys(ctr)))
x = [ctr[k] for k in classes]
s = sum(x)
p = [i / s for i in x]
acc = Float64[]
for rep in 1:30
    ## sample the class labels themselves, so that class k is drawn with
    ## probability p[k] (no index shift is needed with 1-based labels)
    y_pred = StatsBase.sample(classes, Weights(p), length(y_test), replace=true)
    push!(acc, mean(y_pred .== y_test))
end
println("Random classifier average accuracy on test set:", mean(acc))


### Test set accuracy vs both divergence scores

Recall that we took the average rank of the global and local divergence scores to obtain the (unsupervised) ranking of the embeddings. 

Below, we plot the test set accuracy vs both divergence scores.
As we already saw with the correlation values, the global score is a better predictor here, but the local score is still correlated as expected, with a few points having high local divergence, low global divergence and high accuracy (so using only the local score would rank those as bad).


In [None]:
no_of_balls = nrow(R)

## divergence scores relative to their minimum; ball area follows test accuracy
x = R.div_glb ./ minimum(R.div_glb)
y = R.div_loc ./ minimum(R.div_loc)
acc = R.test_acc
areas = map(a -> (10 * a)^2, acc)

PyPlot.figure()
PyPlot.scatter(x, y, s=areas, alpha=0.85, color="dimgrey")
PyPlot.xlabel("Global divergence score (normalized)", fontsize=13)
PyPlot.ylabel("Local divergence score (normalized)", fontsize=13)

## empty line plots serve as legend handles, one per reference marker size
legend_handles = [
    PyPlot.plot([], [], color="white", marker="o", markersize=ms, markerfacecolor="dimgrey")[1]
    for ms in (5.5, 8.5, 11.5)
]
PyPlot.legend(Tuple(legend_handles), ("35%", "65%", "95%",), numpoints=1, loc=4, title="test set accuracy");

# GCN structural embedding of Zachary graph

In the cells below, we embed the nodes from the Zachary graph using a simple GCN model (graph convolution net) with one hidden layer and 3-dimensional output. 

We use the implementation from the ```sknetwork``` package. 

For **structural** node features, we use each node's degree and number of edges in its egonet. We cluster the resulting embedding with k-means setting k=3. 

We plot the embedding (after dimension reduction via UMAP) with colors representing the k-means clusters.
We see good separation between the 3 clusters.

In the next cell, we plot the graph again, this time using the spring layout we saw before.
We see that this embedding finds the central/intermediate/peripheral nodes as its clusters.


In [None]:
## GCN embedding of the karate graph, then k-means with 3 clusters
ig_zac = ig.Graph.Famous("Zachary")
zac_A = ig_zac.get_adjacency_sparse()

## GCN with one hidden layer and an n_labels-dimensional output
hidden_dim = 5
n_labels = 3
gnn = skn.gnn.GNNClassifier(dims=[hidden_dim, n_labels],
    layer_types="Conv",
    activations="ReLu",
    verbose=false)

## structural features: node degree and number of edges in the node's egonet
node_degrees = ig_zac.degree()
egonet_edges = [ig_zac.subgraph(V).ecount() for V in ig_zac.neighborhood()]
features = hcat(node_degrees, egonet_edges)
labels = zeros(nv(zac)) ## embedding, no need for node labels

## compute the embedding
zac_emb = gnn.fit_transform(zac_A, features, labels=labels, n_epochs=25, random_state=42)

## cluster the embedded nodes with k-means; the cluster ids are used as node colors
kmeans = KMeans(n_clusters=3, random_state=123, n_init="auto")
cl = ScikitLearn.fit!(kmeans, zac_emb).labels_

## map the structural embedding to 2-d via UMAP and use it as the layout
Random.seed!(4)
Y = umap(zac_emb')
clean_graphplot(zac,
    layout=Tuple.(eachrow(Y')),
    ilabels=1:nv(zac),
    node_color=[:white, :lightgray, :gray][cl.+1],
    edge_color=:lightgray,
    figure=(size=(400, 400),))

In [None]:
## same node colors, now with the spring layout
cluster_palette = [:white, :lightgray, :gray]
clean_graphplot(zac,
    ilabels=1:nv(zac),
    node_color=cluster_palette[cl.+1],
    edge_color=:lightgray,
    figure=(size=(400, 400),))

Below we train another GCN, but with a 1-dimensional output layer (using the same node features), so we get a 1-dimensional embedding that we can use to order the nodes, which we show in the second cell below (where the node labels are replaced by their respective ranks in the ordering). 

We see a clear ranking from the most central nodes to the peripheral nodes.


In [None]:
## 1-d embedding allows for node ordering
n_labels = 1
gnn = skn.gnn.GNNClassifier(dims=[hidden_dim, n_labels],
    layer_types="Conv",
    activations="ReLu",
    verbose=false)
emb_1 = gnn.fit_transform(zac_A, features, labels=labels, n_epochs=20, random_state=42)

## roles[k] is the node in position k of the ordering; its inverse permutation gives,
## for every node, its rank in the ordering (no global counter updated in a loop,
## so the cell behaves the same in a notebook and in a script)
roles = sortperm(vec(emb_1))
plot_labels = invperm(roles)
clean_graphplot(zac,
    ilabels=plot_labels,
    edge_color=:lightgray,
    figure=(size=(400, 400),))

## semi-supervised learning with GCN

Below we consider the ground-truth labels in the Zachary graph; recall that there are two communities. 

We mask 1/3 of the labels and train a GCN model using the other 2/3, given the ground-truth labels for those. 

We then use the trained model to predict the labels for the masked 1/3 of the nodes (a.k.a. the test set).

We see that we get a good accuracy, around 90%, on the test set.

In [None]:
## classification - karate graph
true_labels = c .- 1
labels = copy(true_labels)

## hide the labels of about 1/3 of the nodes: `train_mask` is true for the nodes whose
## label is masked, and those nodes are the ones scored at the end of the cell
Random.seed!(123)
train_mask = rand(length(labels)) .< 0.33
labels[train_mask] .= -1 ## the negative labels are ignored in the training

## GNN classifier with a single hidden layer
hidden_dim = 5
n_labels = 2 ## 2 ground-truth communities
gnn = skn.gnn.GNNClassifier(dims=[hidden_dim, n_labels],
    layer_types="Conv",
    activations="ReLu",
    verbose=false)

## fit the GCN, with the adjacency matrix also used as the feature matrix
Pred = gnn.fit_predict(zac_A, zac_A, labels=labels, n_epochs=50, random_state=42)

## accuracy on the nodes whose labels were hidden
acc = accuracy_score(true_labels[train_mask], Pred[train_mask])
print("accuracy on the test set:", acc)

# Embedding the Twitch dataset

### running node2vec and UMAP/t-SNE

Here are the steps to follow to re-create the experiment below:
* get the dataset from https://snap.stanford.edu/data/twitch_gamers.html (168,114 nodes, 6,797,557 edges)
* build a dataframe with all node features
* build the graph from the edge list and run node2vec on the graph
* run UMAP and t-SNE to get 2-dim mappings for visualization.


In [None]:
## load the stored table produced by the experiment described above
twitch_path = datadir * "Twitch/twitch_jl.ser"
twitch = open(deserialize, twitch_path)
first(twitch, 5)

### greyscale plot - highlight a few languages


In [None]:
## define a greyscale palette without the extremes
colors = pyimport("matplotlib.colors")

## Build a new colormap from `n` colors of `cmap`, taken at evenly spaced positions
## between `minval` and `maxval`.
function truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100)
    positions = range(minval, stop=maxval, length=n)
    new_colors = [cmap(pos) for pos in positions]
    new_name = string("trunc(", cmap.name, ", ", round(minval, digits=2), ", ",
        round(maxval, digits=2), ")")
    return colors.LinearSegmentedColormap.from_list(new_name, new_colors, N=n)
end
cmap = get_cmap("binary")
greyscale = truncate_colormap(cmap, 0.1, 1.0)

## select a few languages to highlight; color code 0 is kept for all the others
languages = ["French", "Spanish", "German"]
language_codes = ["FR", "ES", "DE"]
twitch.color = zeros(Int, nrow(twitch))
for (k, code) in enumerate(language_codes)
    twitch.color[twitch.language.==code] .= k
end

## plot - change ticklabels as required
figure(figsize=(12, 8))
PyPlot.scatter(twitch.X, twitch.Y, c=twitch.color, cmap=greyscale, s=5)
gca().set_aspect("equal", "datalim")

## one colorbar bin per color code, centered on the code; the range is parenthesized
## because `0:n .- 0.5` parses as `0:(n - 0.5)`
n_colors = length(languages) + 1
cb = colorbar(boundaries=collect((0:n_colors) .- 0.5), shrink=0.7)
cb.set_ticks(collect(0:(n_colors-1)))
cb.set_ticklabels(["Other"; languages])
PyPlot.xticks([])
PyPlot.yticks([])
PyPlot.title("Twitch dataset embedding", fontsize=20);

In [None]:
## average position of every language in the mapped 2-dim space
L = combine(groupby(twitch, :language), [:X, :Y] .=> mean)
X = collect(L.X_mean)
Y = collect(L.Y_mean)
Z = collect(L.language)

## zero-size points set up the axes; the language codes are written as text
fig, ax = subplots(figsize=(9, 9))
ax.scatter(X, Y, s=0)
PyPlot.xticks([])
PyPlot.yticks([])
x_lo, x_hi = extrema(X)
y_lo, y_hi = extrema(Y)
PyPlot.xlim(x_lo - 2, x_hi + 2)
PyPlot.ylim(y_lo - 2, y_hi + 2)
for (xi, yi, lang) in zip(X, Y, Z)
    lang == "OTHER" && continue
    ax.annotate(string(lang), (xi, yi), color="black", size=14)
end

## colorplot - all languages


In [None]:
## color codes: 0 for "OTHER", then the remaining languages in reverse sorted order
x = sort(unique(twitch.language))
x = filter(!=("OTHER"), x)
lang_list = vcat("OTHER", reverse(x))
num_lang = length(lang_list)
lang_dict = Dict(lang => i - 1 for (i, lang) in enumerate(lang_list))

cvals = [lang_dict[i] for i in twitch.language]

## plot - change ticklabels as required
figure(figsize=(12, 8))
PyPlot.scatter(twitch.X, twitch.Y, c=cvals, cmap="Spectral", s=5)
gca().set_aspect("equal", "datalim")

## one colorbar bin per language, centered on its code; the ranges are parenthesized
## because `0:n .- 0.5` parses as `0:(n - 0.5)`
cb = colorbar(boundaries=collect((0:num_lang) .- 0.5), shrink=0.9)
cb.set_ticks(collect(0:(num_lang-1)))
cb.set_ticklabels(lang_list, fontsize=12)

PyPlot.xticks([])
PyPlot.yticks([])
PyPlot.title("Twitch dataset embedding", fontsize=20);

### filtering - users with a high number of views only

In [None]:
## order of magnitude of the number of views, rounded to the nearest integer
twitch.log_num_views = round.(Int, log10.(twitch.views .+ 1))

## keep the users with the largest number of views
subset = filter(row -> row.log_num_views > 5, twitch)
cvals = [lang_dict[i] for i in subset.language]

figure(figsize=(12, 8))
PyPlot.scatter(subset.X, subset.Y, c=cvals, cmap="Spectral", s=5)
gca().set_aspect("equal", "datalim")

## one colorbar bin per language, centered on its code; the ranges are parenthesized
## because `0:n .- 0.5` parses as `0:(n - 0.5)`
cb = colorbar(boundaries=collect((0:num_lang) .- 0.5), shrink=0.9)
cb.set_ticks(collect(0:(num_lang-1)))
cb.set_ticklabels(lang_list, fontsize=12)

PyPlot.xticks([])
PyPlot.yticks([])
PyPlot.title("Twitch dataset embedding - subset", fontsize=20);

### t-SNE views of the above

In [None]:
## plot - change ticklabels as required
figure(figsize=(12, 8))
PyPlot.scatter(twitch.X_tsne, twitch.Y_tsne, c=twitch.color, cmap=greyscale, s=5)
gca().set_aspect("equal", "datalim")

## one colorbar bin per color code (0 = other, then the highlighted languages),
## centered on the code; the range is parenthesized because `0:n .- 0.5` parses
## as `0:(n - 0.5)`
n_colors = length(languages) + 1
cb = colorbar(boundaries=collect((0:n_colors) .- 0.5), shrink=0.7)
cb.set_ticks(collect(0:(n_colors-1)))
cb.set_ticklabels(["Other"; languages], fontsize=12)

PyPlot.xticks([])
PyPlot.yticks([])
PyPlot.title("Twitch dataset embedding", fontsize=20);

In [None]:
## plot - change ticklabels as required
cvals = [lang_dict[i] for i in twitch.language]

figure(figsize=(12, 8))
PyPlot.scatter(twitch.X_tsne, twitch.Y_tsne, c=cvals, cmap="Spectral", s=5)
gca().set_aspect("equal", "datalim")

## one colorbar bin per language, centered on its code; the ranges are parenthesized
## because `0:n .- 0.5` parses as `0:(n - 0.5)`
cb = colorbar(boundaries=collect((0:num_lang) .- 0.5), shrink=0.9)
cb.set_ticks(collect(0:(num_lang-1)))
cb.set_ticklabels(lang_list, fontsize=12)

PyPlot.xticks([])
PyPlot.yticks([])
PyPlot.title("Twitch dataset embedding", fontsize=20);

In [None]:
## plot - t-SNE view restricted to the users with the largest number of views
subset = filter(row -> row.log_num_views > 5, twitch)
cvals = [lang_dict[i] for i in subset.language]

figure(figsize=(12, 8))
PyPlot.scatter(subset.X_tsne, subset.Y_tsne, c=cvals, cmap="Spectral", s=5)
gca().set_aspect("equal", "datalim")

## one colorbar bin per language, centered on its code; the ranges are parenthesized
## because `0:n .- 0.5` parses as `0:(n - 0.5)`
cb = colorbar(boundaries=collect((0:num_lang) .- 0.5), shrink=0.9)
cb.set_ticks(collect(0:(num_lang-1)))
cb.set_ticklabels(lang_list, fontsize=12)

PyPlot.xticks([])
PyPlot.yticks([])
PyPlot.title("Twitch dataset embedding - subset", fontsize=20);