## Required extra package:

For hypergraphs:
* pip install hypernetx


In [None]:
import pandas as pd
import numpy as np
import igraph as ig
import partition_igraph
import hypernetx as hnx
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
from functools import reduce
import itertools


In [None]:
## the data directory
## NOTE: keep the trailing slash -- file paths below are built by plain string
## concatenation (e.g. datadir+"GoT/GoT.pkl")
datadir='../Datasets/'

# Summary of extra functions for HNX hypergraphs

### Build hypergraph and pre-compute key quantities

We build the hypergraph HG using:

```python
HG = hnx.Hypergraph(dict(enumerate(Edges)))
```

where 'Edges' is a list of sets; edges are then indexed as 0-based integers,
so to preserve unique ids, we represent nodes as strings.
For example Edges[0] = {'0','2'}

Once the HNX hypergraph is built, the following function is called to 
compute node strengths, d-degrees and binomial coefficients:

```python
HNX_precompute(HG)
```

### Partitions

We use two representations for partitions: list of sets (the parts) or dictionary.
Those functions are used to map from one to the other:

```python
dict2part(D)
part2dict(A)
```

### H-modularity

The function to compute H-modularity for HG w.r.t. partition A (list of sets covering the vertices):

```python
HNX_modularity(HG, A, wdc=linear)
```

where 'wdc' is the weight function (default = 'linear'). Other choices are 'strict'
and 'majority', or any user-supplied function with the following format:

```python
def linear(d,c):
    return c/d if c>d/2 else 0
```

where d is the edge size, and d>=c>d/2 the number of nodes in the majority class.

### Two-section graph

Build the random-walk based 2-section graph given some hypergraph HG:

```python
G = HNX_2section(HG)
```

where G is an igraph Graph.



In [None]:
## Functions for HNX hypergraphs as described above:

def factorial(n):
    """Return n! as an exact Python integer.

    Any n below 2 (negative values included) yields 1. Otherwise n is
    truncated with int() and the product 2*3*...*int(n) is returned.
    """
    if n < 2:
        return 1
    product = 1
    for factor in range(2, int(n)+1):
        product *= factor
    return product

## Precompute some values on HNX hypergraph for computing qH faster
def HNX_precompute(HG):
    """Annotate an HNX hypergraph with the quantities needed to compute qH.

    HG is modified in place and nothing is returned. After the call:
      * HG.nodes[v].strength : sum of the weights of the edges containing v
      * HG.edges[e].weight   : set to 1 for every edge that had no weight
      * HG.d_weights         : Counter {edge size d: total weight of the d-edges}
      * HG.total_weight      : sum of all edge weights
      * HG.bin_coef          : dict {(d,c): binomial(d,c)} for every edge size d
                               present in HG and every c with d//2 < c <= d
    """
    ## 1. compute node strengths (weighted degrees) and 2. d-weights in one pass
    for v in HG.nodes:
        HG.nodes[v].strength = 0
    d_weights = Counter()
    for e in HG.edges:
        edge = HG.edges[e]
        try:
            w = edge.weight
        except AttributeError:
            ## add unit weight if none to simplify other functions;
            ## only a missing attribute is handled, other errors propagate
            w = edge.weight = 1
        for v in list(edge):
            HG.nodes[v].strength += w
        d_weights[len(edge)] += w
    HG.d_weights = d_weights
    HG.total_weight = sum(d_weights.values())
    ## 3. compute binomial coefficients (modularity speed-up)
    bin_coef = {}
    for n in HG.d_weights.keys():
        ## only majority counts c > n/2 are ever looked up
        for k in range(n//2+1,n+1):
            bin_coef[(n,k)] = factorial(n)/(factorial(k)*factorial(n-k))
    HG.bin_coef = bin_coef

#########################################

## some weight functions 'wdc' for d-edges with c-majority

## default: linear w.r.t. c
def linear(d,c):
    """Weight c/d when c is a strict majority of the d nodes (c > d/2), else 0."""
    if c > d/2:
        return c/d
    return 0

## majority
def majority(d,c):
    """Weight 1 when c is a strict majority of the d nodes (c > d/2), else 0."""
    if c > d/2:
        return 1
    return 0

## strict
def strict(d,c):
    """Weight 1 only when all d nodes of the edge are counted (c == d), else 0."""
    if c == d:
        return 1
    return 0

#########################################

## compute vol(A_i)/vol(V) for each part A_i in A (list of sets)
def compute_partition_probas(HG, A):
    """Return, for each part of A, its share of the total node strength.

    HG: hypergraph whose nodes carry a 'strength' attribute
    A:  list of sets of node ids

    The volume of a part is the sum of HG.nodes[v].strength over its nodes;
    each volume is divided by the sum of all volumes. The result is a list
    in the same order as A.
    """
    volumes = [
        reduce(lambda acc, v: acc + HG.nodes[v].strength, part, 0)
        for part in A
    ]
    total_volume = sum(volumes)
    return [vol/total_volume for vol in volumes]

## degree tax 
def DegreeTax(HG, Pr, wdc):
    """Degree-tax term of the hypergraph modularity.

    HG:  hypergraph carrying d_weights, bin_coef and total_weight
    Pr:  list of part probabilities (one value per part)
    wdc: weight function wdc(d,c)

    For every edge size d in HG.d_weights, sums
        p**c * (1-p)**(d-c) * bin_coef[(d,c)] * wdc(d,c)
    over the majority counts d//2 < c <= d and over all p in Pr, scales that
    sum by HG.d_weights[d], and finally divides the grand total by
    HG.total_weight.
    """
    total_tax = 0
    for size, size_weight in HG.d_weights.items():
        majority_counts = np.arange(size//2+1, size+1)
        size_tax = 0
        for count, proba in itertools.product(majority_counts, Pr):
            size_tax += proba**count * (1-proba)**(size-count) * HG.bin_coef[(size,count)] * wdc(size,count)
        total_tax += size_tax * size_weight
    return total_tax / HG.total_weight

## edge contribution, A is list of sets
def EdgeContribution(HG, A, wdc):
    """Edge-contribution term of the hypergraph modularity.

    HG:  hypergraph carrying edge weights and total_weight
    A:   partition (list of sets of node ids)
    wdc: weight function wdc(d,c)

    For every edge e of size d and every part holding a strict majority
    c > d/2 of its nodes, adds wdc(d,c) * weight(e); the sum is divided by
    HG.total_weight.
    """
    EC = 0
    for e in HG.edges:
        d = HG.size(e)
        for part in A:
            ## number of nodes of e inside this part -- computed once per (edge, part)
            c = HG.size(e,part)
            if c > d/2:
                EC += wdc(d,c) * HG.edges[e].weight
    return EC / HG.total_weight

def HNX_modularity(HG, A, wdc=linear):
    """Hypergraph modularity qH of partition A.

    HG:  HNX hypergraph on which HNX_precompute(HG) has been called
    A:   partition (list of sets)
    wdc: weight function (ex: strict, majority, linear)

    Returns the edge contribution minus the degree tax.
    """
    part_probas = compute_partition_probas(HG, A)
    edge_term = EdgeContribution(HG, A, wdc)
    tax_term = DegreeTax(HG, part_probas, wdc)
    return edge_term - tax_term

#########################################

## 2-section igraph from HG
def HNX_2section(HG):
    """Build the random-walk based 2-section graph of hypergraph HG.

    Every edge e of size d >= 2 and weight w (1 if e has no weight) adds one
    graph edge of weight w/(d-1) between each pair of its nodes, so each node
    of e receives a total weight of w from e. Parallel edges are merged by
    summing their weights.

    Edges with fewer than 2 nodes produce no pairs and are skipped; a node
    that only appears in such edges is therefore absent from the result.

    Returns an igraph Graph with vertex attribute 'name' and edge attribute
    'weight'.
    """
    s = []
    for e in HG.edges:
        E = HG.edges[e]
        d = len(E)
        ## no pair to add, and w/(d-1) below would divide by zero
        if d < 2:
            continue
        try:
            weight = E.weight
        except AttributeError:
            ## unweighted edge: use unit weight
            weight = 1
        ## random-walk 2-section (preserve nodes' weighted degrees)
        w = weight/(d-1)
        s.extend([(k[0],k[1],w) for k in itertools.combinations(E,2)])
    G = ig.Graph.TupleList(s,weights=True).simplify(combine_edges='sum')
    return G

#########################################

## we use 2 representations for partitions (0-based part ids):
## (1) dictionary or (2) list of sets

def dict2part(D):
    """Convert a partition given as a dictionary into a list of sets.

    D: dictionary {element: part id}, part ids being 0-based integers

    Returns a list P with max(part id)+1 sets, where P[x] holds the elements
    mapped to x. Part ids that are never used give an empty set, elements
    with an id outside 0..max are ignored, and an empty dictionary gives [].
    """
    if not D:
        return []
    P = [set() for _ in range(max(D.values())+1)]
    ## lookup by equality (not list indexing) so that numpy integers and
    ## out-of-range ids behave exactly like a comparison against 0..max
    part_of = dict(enumerate(P))
    for element, label in D.items():
        part = part_of.get(label)
        if part is not None:
            part.add(element)
    return P

def part2dict(A):
    """Convert a partition given as a list of sets into a dictionary.

    A: list of sets; the position of a set in A is its 0-based part id

    Returns {element: part id}. An element present in several sets keeps
    the id of the last one.
    """
    return {element: part_id for part_id, part in enumerate(A) for element in part}



# Toy hypergraph example with HNX

In [None]:
## build a hypergraph from a list of sets (the hyperedges);
## dict(enumerate(E)) keys each edge by its 0-based position in E,
## so the edges get the integer IDs 0..5
E = [{'A','B'},{'A','C'},{'A','B','C'},{'A','D','E','F'},{'D','F'},{'E','F'}]
HG = hnx.Hypergraph(dict(enumerate(E)))
## draw the hypergraph
hnx.draw(HG)


In [None]:
## build the dual hypergraph of HG, keep it in HD and draw it
HD = HG.dual()
hnx.draw(HD)

In [None]:
## compute node strengths, d-weights and binomial coefficients;
## HG is annotated in place and edges without a weight get weight 1
HNX_precompute(HG)
## show the edges (unit weights were added by default)
HG.edges.elements


In [None]:
## show the nodes with the 'strength' attribute set by HNX_precompute
## (here strength = degree since all weights are 1)
HG.nodes.elements


In [None]:
## d-weights distribution: Counter {edge size d: total weight of the edges of size d}
HG.d_weights


In [None]:
## compute modularity qH for the following partitions:
## A1, A2: two parts; A3: a single part with all nodes; A4: every node in its own part
A1 = [{'A','B','C'},{'D','E','F'}]
A2 = [{'B','C'},{'A','D','E','F'}]
A3 = [{'A','B','C','D','E','F'}]
A4 = [{'A'},{'B'},{'C'},{'D'},{'E'},{'F'}]

## one line per weight function; each line lists qH for A1, A2, A3, A4
## (the first line relies on the default weight function, 'linear')
print('linear:',HNX_modularity(HG,A1),HNX_modularity(HG,A2),HNX_modularity(HG,A3),HNX_modularity(HG,A4))
print('strict:',HNX_modularity(HG,A1,strict),HNX_modularity(HG,A2,strict),HNX_modularity(HG,A3,strict),HNX_modularity(HG,A4,strict))
print('majority:',HNX_modularity(HG,A1,majority),HNX_modularity(HG,A2,majority),HNX_modularity(HG,A3,majority),HNX_modularity(HG,A4,majority))


In [None]:
## 2-section graph (igraph Graph) of the toy hypergraph
G = HNX_2section(HG)
## use the node names as vertex labels in the plot
G.vs['label'] = G.vs['name']
ig.plot(G,bbox=(0,0,250,250))


In [None]:
## 2-section clustering with ECG
## (community_ecg is presumably added to igraph graphs by the partition_igraph import -- confirm)
G.vs['community'] = G.community_ecg().membership
## show the clusters as a list of sets of node names
dict2part({v['name']:v['community'] for v in G.vs})


# Game of Thrones scenes hypergraph

REF: https://github.com/jeffreylancaster/game-of-thrones

We built a hypergraph from the Game of Thrones scenes with the following elements:

* **Nodes** are characters in the series
* **Hyperedges** are groups of characters appearing in the same scene(s)
* **Hyperedge weights** are total scene(s) duration in seconds involving those characters

We kept hyperedges with at least 2 characters and we discarded characters with degree below 5.

We saved the following:

* *Edges*: list of sets where the nodes are 0-based integers represented as strings: '0', '1', ... 'n-1'
* *Names*: dictionary; mapping of nodes to character names
* *Weights*: list; hyperedge weights (in same order as Edges)


In [None]:
## load the hyperedges, the node names and the hyperedge weights
## NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source
with open(datadir+"GoT/GoT.pkl","rb") as f:
    Edges, Names, Weights = pickle.load(f)


## Build weighted hypergraph 

In [None]:
## Nodes are represented as strings from '0' to 'n-1';
## edges are keyed by their 0-based position in Edges
HG = hnx.Hypergraph(dict(enumerate(Edges)))
## add edge weights (the edge id e is also the index of its weight in Weights)
for e in HG.edges:
    HG.edges[e].weight = Weights[e]
## add full names
for v in HG.nodes:
    HG.nodes[v].name = Names[v]
## pre-compute required quantities for modularity and clustering
HNX_precompute(HG)

In [None]:
## size of the hypergraph
print(HG.number_of_nodes(),'nodes and',HG.number_of_edges(),'edges')

### EDA on GoT hypergraph

In [None]:
## histogram of edge sizes (number of characters per scene)
plt.hist([HG.edges[e].size() for e in HG.edges], bins=25, color='grey')
plt.xlabel("Edge size",fontsize=14);
## uncomment to save the figure:
#plt.savefig('got_hist_1.eps');


In [None]:
## histogram of edge weights (total scene durations for each group of characters)
plt.hist([HG.edges[e].weight for e in HG.edges], bins=25, color='grey')
plt.xlabel("Edge weight",fontsize=14);
## uncomment to save the figure:
#plt.savefig('got_hist_2.eps');
## max edge weight
print('max = ',max([HG.edges[e].weight for e in HG.edges]))

In [None]:
## histogram of node degrees
plt.hist(hnx.degree_dist(HG),bins=20, color='grey')
plt.xlabel("Node degree",fontsize=14);
## uncomment to save the figure:
#plt.savefig('got_hist_3.eps');


In [None]:
## histogram of node strength (total appearance), as set by HNX_precompute
plt.hist([HG.nodes[n].strength for n in HG.nodes], bins=20, color='grey')
plt.xlabel("Node strength",fontsize=14);
## uncomment to save the figure:
#plt.savefig('got_hist_4.eps');


In [None]:
## build dataframe with node characteristics (one row per node, in HG.nodes() order)
dg = [HG.degree(v) for v in HG.nodes()]
st = [HG.nodes[v].strength for v in HG.nodes()]
nm = [HG.nodes[v].name for v in HG.nodes()]
## build the columns directly so that 'degree' and 'strength' keep their numeric
## dtype (stacking them with the names in a single numpy array turns every value
## into a string that must be parsed back)
D = pd.DataFrame({'name':nm,'degree':dg,'strength':st})
## show the nodes with highest strength
D.sort_values(by='strength',ascending=False).head()

In [None]:
## show the nodes with highest degree
D.sort_values(by='degree',ascending=False).head()

In [None]:
## scatter plot of strength against degree (one point per node)
plt.plot(D['degree'],D['strength'],'.')
plt.xlabel('degree',fontsize=14)
plt.ylabel('strength',fontsize=14);

## Build 2-section graph and compute a few centrality measures

In [None]:
## build the 2-section graph (igraph Graph) of the GoT hypergraph
G = HNX_2section(HG)

In [None]:
## sanity check -- node ordering
## (the cells below copy per-vertex values of G into the rows of D by position)

## ordering of nodes in HG
ord_HG = list(HG.nodes.elements.keys())

## ordering of nodes in G
ord_G = [v['name'] for v in G.vs]

## displays True when both orderings agree; the result is only shown, not enforced
ord_HG == ord_G

In [None]:
## betweenness and PageRank on the 2-section graph, stored in D by position
## (assumes the rows of D follow the vertex order of G)
## NOTE(review): igraph presumably interprets 'weight' as an edge length when
## computing betweenness, while here a larger weight means more shared scene
## time -- confirm this is intended
b = G.betweenness(directed=False,weights='weight')
n = G.vcount()
## normalize by (n-1)(n-2)/2
D['betweenness'] = [2*x/((n-1)*(n-2)) for x in b]
D['pagerank'] = G.pagerank(directed=False,weights='weight')
## show the 10 nodes with highest strength
D.sort_values(by='strength',ascending=False).head(10)

In [None]:
## show the nodes with highest betweenness
D.sort_values(by='betweenness',ascending=False).head()

## Hypergraph modularity and clustering

In [None]:
## visualize the 2-section graph
print('nodes:',G.vcount(),'edges:',G.ecount())
G.vs['size'] = 10
G.vs['color'] = 'lightgrey'
G.vs['label'] = [int(x) for x in G.vs['name']] ## use int(name) as label
## keep the full character name as a vertex attribute
G.vs['character'] = [HG.nodes[n].name for n in G.vs['name']]
G.vs['label_size'] = 5
## the layout is stored in 'ly' so that the cluster plot below can reuse it
ly = G.layout_fruchterman_reingold()
ig.plot(G, layout = ly, bbox=(0,0,600,400))

In [None]:
## we see a small clique: Braavosi theater troupe
## (prints the names of the nodes with the hard-coded ids '166' to '172')
print([HG.nodes[str(x)].name for x in np.arange(166,173)])


In [None]:
## Modularity (qH) on several random partitions with K parts for a range of K's
## This should be close to 0 and can be negative.
## K goes from 2 to 20 with 10 repeats each; no random seed is set, so the
## printed range varies between runs
h = []
for K in np.arange(2,21):
    for rep in range(10):
        V = list(HG.nodes)
        ## draw a part id in 0..K-1 for every node
        p = np.random.choice(K, size=len(V))
        RandPart = dict2part({V[i]:p[i] for i in range(len(V))})
        ## compute qH
        h.append(HNX_modularity(HG, RandPart))
print('range for qH:',min(h),'to',max(h))

In [None]:
## Cluster the 2-section graph (with Louvain) and compute qH
## We now see qH >> 0
G.vs['louvain'] = G.community_multilevel(weights='weight').membership
## stored by position: assumes the rows of D follow the vertex order of G
D['cluster'] = G.vs['louvain']
## same clustering as a list of sets of node names
ML = dict2part({v['name']:v['louvain'] for v in G.vs})
## Compute qH
print(HNX_modularity(HG, ML))


In [None]:
## plot 2-section w.r.t. the resulting clusters
cl = G.vs['louvain']
## greyscale palette
pal = ig.GradientPalette("white","black",max(cl)+2)
## color palette: this assignment overrides the greyscale palette above;
## comment out the line below for a greyscale plot
pal = ig.ClusterColoringPalette(max(cl)+1)
G.vs['color'] = [pal[x] for x in cl]
G.vs['label_size'] = 5
## reuse the layout 'ly' computed for the earlier plot
ig.plot(G, layout = ly, bbox=(0,0,500,400))
## uncomment to save the figure:
#ig.plot(G, target='GoT_clusters.eps', layout = ly, bbox=(0,0,400,400))

In [None]:
## ex: high strength nodes in same cluster with Daenerys Targaryen
## take the matching row explicitly with .iloc[0]: calling int() on a
## one-element Series is deprecated in recent pandas versions
dt = int(D[D['name']=='Daenerys Targaryen']['cluster'].iloc[0])
D[D['cluster']==dt].sort_values(by='strength',ascending=False).head()

# Motifs example 

Using HNX draw function to get patterns from Figure 7.1 in the book

In [None]:
## H1 pattern: five edges of size 2 on the nodes A, B, C, D
## (E and HG are re-bound here and replace the earlier objects)
E = [{'A','B'},{'A','C'},{'A','D'},{'B','D'},{'C','D'}]
HG = hnx.Hypergraph(dict(enumerate(E)))
hnx.draw(HG)

In [None]:
## H2 pattern: one edge of size 3 and two edges of size 2 on the nodes A, B, C, D
E = [{'A','B','C'},{'A','D'},{'C','D'}]
HG = hnx.Hypergraph(dict(enumerate(E)))
hnx.draw(HG)

In [None]:
## H3 pattern: two edges of size 3 sharing the nodes B and C
E = [{'A','B','C'},{'B','C','D'}]
HG = hnx.Hypergraph(dict(enumerate(E)))
hnx.draw(HG)


In [None]:
### Counting those patterns -- Table 7.2

## Experiment with simple community random hypergraphs

note: qH-based heuristics are still very experimental; we only provide this for illustration

* 16 hypergraphs each with 1000 nodes, 1400 edges of size 2 to 8 (200 each)
* 10 communities with 0%, 5%, 10% or 15% pure noise edges (mu)
* community edge homogeneity (tau) from 0.5 to 1
* 3 algorithms:
 * qG-based Louvain on 2-section
 * qH-based heuristic clustering algorithm on hypergraph
 * qH+: same but using true homogeneity (tau)
* Experiment results are stored in files taus_xx.pkl with xx in {00, 05, 10, 15}

In [None]:
## load results (here mu = .05)
## NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source
with open( datadir+"Hypergraph/taus_05.pkl", "rb" ) as f:
    results = pickle.load(f)

## average the score of each algorithm for every value of tau
R = pd.DataFrame(results,columns=['tau','Graph','Hypergraph','Hypergraph+']).groupby(by='tau').mean()
## x-axis values; presumably one per distinct tau in the results (the plot needs len(t) == len(R)) -- confirm
t = [x for x in np.arange(.501,1,.025)]
## 3-step greyscale palette, one colour per algorithm (re-binds 'pal')
pal = ig.GradientPalette("grey","black",3)
## alternative color palette:
#pal = ig.GradientPalette("red","blue",3)
plt.plot(t,R['Graph'],'o-',label='qG-based',color=pal[0])
plt.plot(t,R['Hypergraph'],'o-',label='qH-based',color=pal[1])
plt.plot(t,R['Hypergraph+'],'o-',label='qH-based (tuned)',color=pal[2])
plt.xlabel(r'homogeneity ($\tau$)',fontsize=14)
plt.ylabel('AMI',fontsize=14)
plt.legend();
## uncomment to save the figure:
#plt.savefig('taus_05.eps');

## Community hypergraphs

We have the hyperedge lists and the communities for 3 random hypergraphs with community structure, namely:

* edges65, comm65: hypergraphs with $\tau_e = \lceil(d*0.65)\rceil$ for all community edges of size $d$
* edges85, comm85: hypergraphs with $\tau_e = \lceil(d*0.85)\rceil$ for all community edges of size $d$
* edges65_unif, comm65_unif: hypergraphs with $\tau_e$ chosen uniformly from $\{\lceil(d*0.65)\rceil,...,d\}$ for all community edges of size $d$

All have 1000 nodes, 1400 edges of size 2 to 8 (200 each), 10 communities and noise parameter $\mu=0.1$.

In [None]:
## load hypergraphs: an edge list and a list of communities for each of the 3 hypergraphs
## NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source
with open(datadir+"Hypergraph/hypergraphs.pkl","rb") as f:
    (edges65, comm65, edges85, comm85, edges65_unif, comm65_unif) = pickle.load(f)

In [None]:
## estimating tau: for a grid of thresholds, plot the fraction of edges whose
## homogeneity exceeds the threshold, once w.r.t. the true communities and
## once w.r.t. the Louvain clusters of the 2-section graph

## pick one of the three hypergraphs
comm = comm65
L = edges65

## true communities
HG = hnx.Hypergraph(dict(enumerate(L)))
## homogeneity of an edge: largest fraction of its nodes lying in a single community
x = []
for e in L:
    x.append(max([len(e.intersection(k)) for k in comm])/len(e))
## fraction of edges with homogeneity above each threshold t
y = []
for t in np.arange(.501,1,.025):
    y.append(sum([i>t for i in x])/len(x))
plt.plot(np.arange(.501,1,.025),y,'.-',color='grey',label='true communities')

## Louvain
G = HNX_2section(HG)
G.vs['louvain'] = G.community_multilevel(weights='weight').membership
ML = dict2part({v['name']:v['louvain'] for v in G.vs})
## same two steps as above, with the Louvain clusters in place of the true communities
x = []
for e in L:
    x.append(max([len(e.intersection(k)) for k in ML])/len(e))
y = []
for t in np.arange(.501,1,.025):
    y.append(sum([i>t for i in x])/len(x))
plt.plot(np.arange(.501,1,.025),y,'.-',color='black',label='Louvain')

plt.grid()
#plt.title(r'Estimating $\tau$ from data',fontsize=14)
plt.ylabel(r'Pr(homogeneity > $\tau$)',fontsize=14)
plt.xlabel(r'$\tau$',fontsize=14)
plt.legend()
plt.ylim(0,1);
## uncomment to save the figure:
#plt.savefig('tau_65.eps');


In [None]:
## distribution of edge homogeneity -- single value for 'tau'
## (homogeneity: largest fraction of an edge's nodes lying in a single community)
x = []
for e in edges65:
    x.append(max([len(e.intersection(k)) for k in comm65])/len(e))
plt.hist(x,bins='rice',color='grey');
## uncomment to save the figure:
#plt.savefig('hist_65.eps');


In [None]:
## distribution of edge homogeneity -- range for 'tau'
## we see many more pure community edges
## (homogeneity: largest fraction of an edge's nodes lying in a single community)
x = []
for e in edges65_unif:
    x.append(max([len(e.intersection(k)) for k in comm65_unif])/len(e))
plt.hist(x, bins='rice',color='grey');
## uncomment to save the figure:
#plt.savefig('hist_65_unif.eps');
