## Required extra package:

For hypergraphs:
```
using PyCall
run(`$(PyCall.python) -m pip install hypernetx`)
```

The functionality of the HyperNetX package is described in the Python notebook.

In [None]:
using PyCall
using PyPlot
using LightGraphs
using GraphPlot
using DataFrames
using Random
using Statistics

In [None]:
hnx = pyimport("hypernetx")

In [None]:
ig = pyimport("igraph")

In [None]:
partition_igraph = pyimport("partition_igraph")

In [None]:
## the data directory
datadir="../Datasets/"

In [None]:
"""
    ig2lg(ig_g)

Build a LightGraphs `SimpleGraph` with the same number of vertices as the
igraph object `ig_g` (a PyCall `PyObject`) and one edge per igraph edge.
Only the topology is copied; vertex and edge attributes are not transferred.
"""
function ig2lg(ig_g)
    nv = ig_g.vcount()
    graph = SimpleGraph(nv)
    for edge in ig_g.es()
        # shift the igraph endpoint ids by one to get 1-based LightGraphs vertices
        u, v = edge.source + 1, edge.target + 1
        add_edge!(graph, u, v)
    end
    return graph
end

In [None]:
## Functions for HNX hypergraphs as described above:
## We keep this code in Python as we are updating Python objects using it

py"""

from collections import Counter
import hypernetx as hnx
import numpy as np
from functools import reduce
import itertools
import igraph as ig

def factorial(n):
    ## n! for n >= 2 (n is truncated with int()); returns 1 for any n < 2
    product = 1
    for i in range(2, int(n)+1):
        product *= i
    return product

## Precompute some values on HNX hypergraph for computing qH faster
def HNX_precompute(HG):
    ## Annotate the HNX hypergraph HG in place with the quantities used by the
    ## modularity functions:
    ##   HG.nodes[v].strength : sum of the weights of the edges containing v
    ##   HG.edges[e].weight   : set to 1 when the edge had no weight
    ##   HG.d_weights         : Counter {edge size d: total weight of size-d edges}
    ##   HG.total_weight      : sum of all edge weights
    ##   HG.bin_coef          : {(n,k): binomial coefficient} for n//2 < k <= n

    ## 1. compute node strengths (weighted degrees)
    for node in HG.nodes:
        HG.nodes[node].strength = 0
    for edge in HG.edges:
        try:
            wt = HG.edges[edge].weight
        except:
            ## add unit weight if none to simplify other functions
            wt = 1
            HG.edges[edge].weight = 1
        for node in list(HG.edges[edge]):
            HG.nodes[node].strength += wt

    ## 2. compute d-weights: total edge weight per edge size
    d_weights = Counter()
    for edge in HG.edges:
        d_weights[len(HG.edges[edge])] += HG.edges[edge].weight
    HG.d_weights = d_weights
    HG.total_weight = sum(d_weights.values())

    ## 3. compute binomial coefficients (modularity speed-up);
    ##    only k > n/2 is stored, which is the range DegreeTax looks up
    HG.bin_coef = {
        (n,k): factorial(n)/(factorial(k)*factorial(n-k))
        for n in HG.d_weights.keys()
        for k in np.arange(n//2+1,n+1)
    }

#########################################

## default: linear w.r.t. c
def linear(d,c):
    ## weight c/d when strictly more than half of the d nodes agree, else 0
    if c > d/2:
        return c/d
    return 0

## majority
def majority(d,c):
    ## weight 1 when strictly more than half of the d nodes agree, else 0
    if c > d/2:
        return 1
    return 0

## strict
def strict(d,c):
    ## weight 1 only when all d nodes agree (c == d), else 0
    if c == d:
        return 1
    return 0

#########################################

## compute vol(A_i)/vol(V) for each part A_i in A (list of sets)
def compute_partition_probas(HG, A):
    ## Returns one value per part of A (list of sets): the part's volume (sum of
    ## its nodes' strengths) divided by the sum of all part volumes.
    ## Requires HG.nodes[v].strength, which HNX_precompute sets.
    volumes = [0]*len(A)
    for i, part in enumerate(A):
        for v in part:
            volumes[i] += HG.nodes[v].strength
    total = sum(volumes)
    return [vol/total for vol in volumes]

## degree tax 
def DegreeTax(HG, Pr, wdc):
    ## Degree-tax term of the hypergraph modularity.
    ##   HG  : hypergraph carrying d_weights, bin_coef and total_weight
    ##   Pr  : per-part volume fractions (see compute_partition_probas)
    ##   wdc : weight function wdc(d,c)
    ## For each edge size d, sums over c in (d/2, d] and over parts p the term
    ##   p^c * (1-p)^(d-c) * binom(d,c) * wdc(d,c),
    ## scales it by the total weight of size-d edges, and finally normalizes
    ## by the total edge weight.
    degree_tax = 0
    for d, d_weight in HG.d_weights.items():
        tax = 0
        for c in np.arange(d//2+1,d+1):
            for p in Pr:
                tax += p**c * (1-p)**(d-c) * HG.bin_coef[(d,c)] * wdc(d,c)
        degree_tax += tax * d_weight
    return degree_tax / HG.total_weight

## edge contribution, A is list of sets
def EdgeContribution(HG, A, wdc):
    ## Edge-contribution term of the hypergraph modularity.
    ## For every edge e of size d and every part of A (list of sets) holding
    ## strictly more than d/2 of e's nodes, adds wdc(d,c) * weight(e), where c
    ## is the number of nodes of e in that part; the sum is then divided by
    ## HG.total_weight.
    contribution = 0
    for e in HG.edges:
        d = HG.size(e)
        for part in A:
            ## number of nodes of e that fall inside this part
            c = HG.size(e,part)
            if c > d/2:
                contribution += wdc(d,c) * HG.edges[e].weight
    return contribution / HG.total_weight

## HG: HNX hypergraph
## A: partition (list of sets)
## wdc: weight function (ex: strict, majority, linear)
def HNX_modularity(HG, A, wdc=linear):
    ## Hypergraph modularity qH of partition A: edge contribution minus degree tax.
    ## HG must already carry the attributes set by HNX_precompute.
    Pr = compute_partition_probas(HG, A)
    edge_contribution = EdgeContribution(HG, A, wdc)
    degree_tax = DegreeTax(HG, Pr, wdc)
    return edge_contribution - degree_tax

#########################################

## 2-section igraph from HG
def HNX_2section(HG):
    ## Build the weighted 2-section of the HNX hypergraph HG as an igraph Graph.
    ## Each hyperedge of size d >= 2 adds weight w/(d-1) to every pair of its
    ## nodes, where w is the edge weight (1 if the edge has none); weights of
    ## parallel pairs are summed by simplify().
    ## Hyperedges with fewer than 2 nodes have no node pairs and are skipped
    ## (they used to raise ZeroDivisionError); a node that only appears in such
    ## edges is therefore absent from the returned graph.
    s = []
    for e in HG.edges:
        E = HG.edges[e]
        d = len(E)
        if d < 2:
            continue
        ## random-walk 2-section (preserve nodes' weighted degrees)
        try:
            w = HG.edges[e].weight/(d-1)
        except Exception:
            ## no usable weight on this edge: fall back to unit weight
            w = 1/(d-1)
        s.extend([(k[0],k[1],w) for k in itertools.combinations(E,2)])
    G = ig.Graph.TupleList(s,weights=True).simplify(combine_edges='sum')
    return G

#########################################

## we use 2 representations for partitions (0-based part ids):
## (1) dictionary or (2) list of sets

def dict2part(D):
    ## Convert a partition given as a dictionary {node: part id} into a list of
    ## sets, where entry i holds the nodes with part id i (0-based).
    ## The list has max(part id)+1 entries; ids with no node give empty sets.
    ## An empty dictionary gives an empty list.
    ## Single pass over D instead of one scan of D per part id.
    if len(D) == 0:
        return []
    P = [set() for _ in range(max(D.values())+1)]
    for node, pid in D.items():
        idx = int(pid)
        ## entries whose id is negative or not a whole number match no part
        ## and are left out, as before
        if idx == pid and idx >= 0:
            P[idx].add(node)
    return P

def part2dict(A):
    ## Convert a partition given as a list of sets into a dictionary
    ## {node: index of the part containing it} (0-based part ids).
    ## If a node appears in several parts, the last part wins.
    return {node: i for i, part in enumerate(A) for node in part}
"""

# Toy hypergraph example with HNX

In [None]:
## build a hypergraph from a list of sets (the hyperedges)
## using 'enumerate', edges will have integer IDs
E = [Set(["A","B"]),Set(["A","C"]),Set(["A","B","C"]),Set(["A","D","E","F"]),Set(["D","F"]),Set(["E","F"])]
HG = hnx.Hypergraph(Dict(enumerate(E)))
fig = plt.figure()
ax = plt.gca()
hnx.draw(HG, ax=ax)

In [None]:
## dual hypergraph
HD = HG.dual()
fig = plt.figure()
ax = plt.gca()
hnx.draw(HD, ax=ax)

In [None]:
## compute node strength (add unit weight if none), d-degrees, binomial coefficients
py"HNX_precompute"(HG)
## show the edges (unit weights were added by default)
HG.edges.elements

In [None]:
## show the nodes (here strength = degree since all weights are 1)
HG.nodes.elements

In [None]:
## d-weights distribution
HG.d_weights


In [None]:
## compute modularity qH for the following partitions:
A1 = [Set(["A","B","C"]),Set(["D","E","F"])]
A2 = [Set(["B","C"]),Set(["A","D","E","F"])]
A3 = [Set(["A","B","C","D","E","F"])]
A4 = [Set(["A"]),Set(["B"]),Set(["C"]),Set(["D"]),Set(["E"]),Set(["F"])]

## evaluate qH on each partition, once per hyperedge weight function
## (HNX_modularity uses the linear weight function when none is given)
partitions = [A1, A2, A3, A4]
println("linear: ", [py"HNX_modularity"(HG, A) for A in partitions])
println("strict: ", [py"HNX_modularity"(HG, A, py"strict") for A in partitions])
println("majority: ", [py"HNX_modularity"(HG, A, py"majority") for A in partitions])

In [None]:
## 2-section graph
G = py"HNX_2section"(HG)
gplot(ig2lg(G),
      NODESIZE=0.05, nodefillc="red",
      nodelabel=G.vs.get_attribute_values("name"),
      nodelabelc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

In [None]:
m = G.community_ecg().membership
[Set(G.vs.get_attribute_values("name")[m .== v]) for v in unique(m)]

In [None]:
## 2-section clustering with ECG
## The vertex-attribute assignment and the dict comprehension are Python syntax,
## so they live in a small Python helper that is called on the PyObject `G`.
py"""
def ecg_partition(g):
    g.vs['community'] = g.community_ecg().membership
    return dict2part({v['name']:v['community'] for v in g.vs})
"""
py"ecg_partition"(G)


# Game of Thrones scenes hypergraph

REF: https://github.com/jeffreylancaster/game-of-thrones

We built a hypergraph from the Game of Thrones scenes with the following elements:

* **Nodes** are characters in the series
* **Hyperedges** are groups of characters appearing in the same scene(s)
* **Hyperedge weights** are total scene(s) duration in seconds involving those characters

We kept hyperedges with at least 2 characters and we discarded characters with degree below 5.

We saved the following:

* *Edges*: list of sets where the nodes are 0-based integers represented as strings: '0', '1', ... 'n-1'
* *Names*: dictionary; mapping of nodes to character names
* *Weights*: list; hyperedge weights (in same order as Edges)


In [None]:
py"""
import pickle

datadir='../Datasets/'

with open(datadir+"GoT/GoT.pkl", "rb") as f:
    Edges, Names, Weights = pickle.load(f)
"""

## Build weighted hypergraph 

In [None]:
py"""
## Nodes are represented as strings from '0' to 'n-1'
HG = hnx.Hypergraph(dict(enumerate(Edges)))
## add edge weights
for e in HG.edges:
    HG.edges[e].weight = Weights[e]
## add full names
for v in HG.nodes:
    HG.nodes[v].name = Names[v]
## pre-compute required quantities for modularity and clustering
HNX_precompute(HG)
"""

In [None]:
print(py"HG".number_of_nodes(), " nodes and ", py"HG".number_of_edges(), " edges")

### EDA on GoT hypergraph

In [None]:
## edge sizes (number of characters per scene)
hist(py"[HG.edges[e].size() for e in HG.edges]", bins=25, color="grey")
xlabel("Edge size",fontsize=14);

In [None]:
## edge weights (total scene durations for each group of characters)
hist(py"[HG.edges[e].weight for e in HG.edges]", bins=25, color="grey")
xlabel("Edge weight",fontsize=14);
print("max = ",maximum(py"[HG.edges[e].weight for e in HG.edges]"))

In [None]:
## node degrees
hist(hnx.degree_dist(py"HG"),bins=20, color="grey")
xlabel("Node degree",fontsize=14);

In [None]:
## node strength (total appearance)
hist(py"[HG.nodes[n].strength for n in HG.nodes]", bins=20, color="grey")
xlabel("Node strength",fontsize=14);

In [None]:
## build dataframe with node characteristics
D = DataFrame(name = py"[HG.nodes[v].name for v in HG.nodes()]",
              degree = py"[HG.degree(v) for v in HG.nodes()]",
              strength = py"[HG.nodes[v].strength for v in HG.nodes()]")
sort(D, :strength, rev=true)

In [None]:
sort(D, :degree, rev=true)

In [None]:
plot(D.degree,D.strength, ".")
xlabel("degree", fontsize=14)
ylabel("strength", fontsize=14);

## Build 2-section graph and compute a few centrality measures

In [None]:
## build 2-section
py"""
G = HNX_2section(HG)
"""

In [None]:
## sanity check -- node ordering 

py"""
## ordering of nodes in HG
ord_HG = list(HG.nodes.elements.keys())

## ordering of nodes in G
ord_G = [v['name'] for v in G.vs]
"""
py"ord_HG" == py"ord_G"

In [None]:
## betweenness and PageRank on the weighted 2-section graph
n = py"G".vcount()
b = py"G".betweenness(directed=false,weights="weight")
## rescale raw betweenness by 2/((n-1)(n-2))
denom = (n-1)*(n-2)
D.betweenness = [2*x/denom for x in b]
D.pagerank = py"G".pagerank(directed=false, weights="weight")
sort(D, :strength, rev=true)

In [None]:
sort(D, :betweenness, rev=true)

## Hypergraph modularity and clustering

In [None]:
print("nodes: ",py"G".vcount()," edges: ",py"G".ecount())

In [None]:
## visualize the 2-section graph
py"""
G.vs['size'] = 10
G.vs['color'] = 'lightgrey'
G.vs['label'] = [int(x) for x in G.vs['name']] ## use int(name) as label
G.vs['character'] = [HG.nodes[n].name for n in G.vs['name']]
G.vs['label_size'] = 5
"""
Random.seed!(1234)
gplot(ig2lg(py"G"),
      NODESIZE=0.04, nodefillc="gray",
      nodelabel=py"G".vs.get_attribute_values("name"),
      nodelabelc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

In [None]:
## we see a small clique: Braavosi theater troupe
print(py"[HG.nodes[str(x)].name for x in np.arange(166,173)]")

In [None]:
## Modularity (qH) on several random partitions with K parts for a range of K's
## This should be close to 0 and can be negative.
py"""
## qH of 10 random partitions for each number of parts K = 2..20
h = []
V = list(HG.nodes)
for K in np.arange(2,21):
    for rep in range(10):
        ## assign every node a part id drawn uniformly from 0..K-1
        p = np.random.choice(K, size=len(V))
        RandPart = dict2part(dict(zip(V,p)))
        ## compute qH
        h.append(HNX_modularity(HG, RandPart))
"""
print("range for qH: ",minimum(py"h")," to ",maximum(py"h"))

In [None]:
## Cluster the 2-section graph (with Louvain) and compute qH
## We now see qH >> 0
py"""
G.vs['louvain'] = G.community_multilevel(weights='weight').membership
ML = dict2part({v['name']:v['louvain'] for v in G.vs})
"""
## Compute qH
print(py"HNX_modularity(HG, ML)")

In [None]:
D.cluster = py"G.vs['louvain']";

In [None]:
## One fill color per Louvain cluster id (0-based).
## Louvain is randomized and may return more than five clusters; a fixed
## five-entry dictionary then fails with a KeyError. Colors are instead taken
## cyclically from a palette whose first five entries are the original ones.
palette = ["yellow", "red", "green", "blue", "violet",
           "orange", "cyan", "brown", "pink", "olive"]
colors = Dict(c => palette[mod1(c + 1, length(palette))] for c in unique(D.cluster))

Random.seed!(1234)
gplot(ig2lg(py"G"),
      NODESIZE=0.04, nodefillc=[colors[x] for x in D.cluster],
      nodelabel=py"G".vs.get_attribute_values("name"),
      nodelabelc="black",
      EDGELINEWIDTH=0.2, edgestrokec="gray")

In [None]:
dt = filter(:name => ==("Daenerys Targaryen"), D).cluster[1]
sort(filter(:cluster => ==(dt), D), :strength, rev=true)

# Motifs example 

Using HNX draw function to get patterns from Figure 7.1 in the book

In [None]:
## H1 pattern
E = [Set(["A","B"]),Set(["A","C"]),Set(["A","D"]),Set(["B","D"]),Set(["C","D"])]
HG = hnx.Hypergraph(Dict(enumerate(E)))
fig = plt.figure()
ax = plt.gca()
hnx.draw(HG, ax=ax)

In [None]:
## H2 pattern
E = [Set(["A","B", "C"]),Set(["A","D"]),Set(["C","D"])]
HG = hnx.Hypergraph(Dict(enumerate(E)))
fig = plt.figure()
ax = plt.gca()
hnx.draw(HG, ax=ax)

In [None]:
## H3 pattern
E = [Set(["A","B", "C"]),Set(["B", "C","D"])]
HG = hnx.Hypergraph(Dict(enumerate(E)))
fig = plt.figure()
ax = plt.gca()
hnx.draw(HG, ax=ax)

In [None]:
### Counting those patterns -- Table 7.2: see Python codes

## Experiment with simple community random hypergraphs

note: qH-based heuristics are still very experimental; we only provide this for illustration

* 16 hypergraphs each with 1000 nodes, 1400 edges of size 2 to 8 (200 each)
* 10 communities with 0%, 5%, 10% or 15% pure noise edges (mu)
* community edge homogeneity (tau) from 0.5 to 1
* 3 algorithms:
 * qG-based Louvain on 2-section
 * qH-based heuristic clustering algorithm on hypergraph
 * qH+: same but using true homogeneity (tau)
* Experiment results are stored in files taus_xx.pkl with xx in {00, 05, 10, 15}

In [None]:
## load results (here mu = .05)
py"""
with open( datadir+"Hypergraph/taus_05.pkl", "rb" ) as f:
    results = pickle.load(f)
"""

R = combine(groupby(DataFrame(py"results", ["tau","Graph","Hypergraph","Hypergraph+"]), :tau),
            ["Graph","Hypergraph","Hypergraph+"] .=> mean, renamecols=false)
plot(R.tau,R.Graph,"o-",label="qG-based",color="red")
plot(R.tau,R.Hypergraph,"o-",label="qH-based",color="green")
plot(R.tau,R."Hypergraph+","o-",label="qH-based (tuned)",color="blue")
xlabel("homogeneity tau",fontsize=14)
ylabel("AMI",fontsize=14)
legend();

## Community hypergraphs

We have hyperedge lists and communities for 3 random hypergraphs with communities, namely:

* edges65, comm65: hypergraphs with $\tau_e = \lceil 0.65\,d \rceil$ for all community edges of size $d$
* edges85, comm85: hypergraphs with $\tau_e = \lceil 0.85\,d \rceil$ for all community edges of size $d$
* edges65_unif, comm65_unif: hypergraphs with $\tau_e$ chosen uniformly from $\{\lceil 0.65\,d \rceil,...,d\}$ for all community edges of size $d$

All have 1000 nodes, 1400 edges of size 2 to 8 (200 each), 10 communities and noise parameter $\mu=0.1$.

In [None]:
## load hypergraphs
py"""
with open(datadir+"Hypergraph/hypergraphs.pkl","rb") as f:
    (edges65, comm65, edges85, comm85, edges65_unif, comm65_unif) = pickle.load(f)
"""

In [None]:
## estimating tau

## pick one of the three hypergraphs
py"""
comm = comm65
L = edges65

## true communities
HG = hnx.Hypergraph(dict(enumerate(L)))
x = []
for e in L:
    x.append(max([len(e.intersection(k)) for k in comm])/len(e))
y = []
tv = np.arange(0.501,1,0.025)
for t in tv:
    y.append(sum([i>t for i in x])/len(x))
"""
plot(py"tv", py"y",".-",color="red",label="true communities")

## Louvain
py"""
G = HNX_2section(HG)
G.vs['louvain'] = G.community_multilevel(weights='weight').membership
ML = dict2part({v['name']:v['louvain'] for v in G.vs})
x = []
for e in L:
    x.append(max([len(e.intersection(k)) for k in ML])/len(e))
y = []
for t in tv:
    y.append(sum([i>t for i in x])/len(x))
"""
plot(py"tv", py"y", ".-",color="black",label="Louvain")

## axis decoration for the homogeneity curves
PyPlot.grid()
## y-axis label: the closing parenthesis was missing
ylabel("Pr(homogeneity > tau)",fontsize=14)
xlabel("tau",fontsize=14)
legend()
ylim(0,1);

In [None]:
## distribution of edge homogeneity -- single value for 'tau'
py"""
x = []
for e in edges65:
    x.append(max([len(e.intersection(k)) for k in comm65])/len(e))
"""
hist(py"x",bins="rice",color="grey");

In [None]:
## distribution of edge homogeneity -- range for 'tau' 
## we see many more pure community edges
py"""
x = []
for e in edges65_unif:
    x.append(max([len(e.intersection(k)) for k in comm65_unif])/len(e))
"""
hist(py"x", bins="rice",color="grey");