In [None]:
import pandas as pd
import numpy as np
import igraph as ig
import xgi ## pip install xgi is required
import pickle
from collections import Counter
from functools import reduce
import itertools
from scipy.special import comb
import warnings
import random
from itertools import combinations
from itertools import combinations as combs

## Functions to compute various simpliciality measures are found in the included ```simpliciality.py``` file:
import simpliciality as spl 
## new estimate for simpliciality ratio
import sr  

## Set this to the data directory
## NOTE(review): no cell visible in this part of the notebook reads `datadir`
## (the hypergraphs are fetched with xgi.load_xgi_data) -- confirm it is still needed.
datadir='../Datasets/'


# XGI's hypergraphs



### get the data directly from XGI

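* Basic statistics: number of nodes, number of edges, average edge size
* FES (face edit simpliciality): reported first with each maximal face itself included, then with it excluded
* Simplicial ratio: reported first via sampling from the CL model, then with the new approximation code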

SR (sampling from CL):

* disgenenet: 28.0821
* diseasome: 6.6147
* contact-high-school: 6.5836
* email-eu: 5.1355
* email-enron: 4.8800
* congress-bills: 4.4154
* ndc-substances: 4.0749
* contact-primary-school: 2.7043
* hospital-lyon: 0.9464
* tags-ask-ubuntu: 0.6652


In [None]:
def compute_all(fn="", samples=1000, min_size=2, max_size=11):
    """
    Print basic statistics and simpliciality measures for one XGI dataset.

    The hypergraph is loaded with ``xgi.load_xgi_data``. Duplicate edges are
    merged and only edges whose size lies in ``[min_size, max_size]`` are kept.

    Parameters
    ----------
    fn : str
        Dataset name handed to ``xgi.load_xgi_data``.
    samples : int, default 1000
        Number of samples passed to ``spl.get_simplicial_ratio`` for the
        CL-model based simplicial ratio.
    min_size, max_size : int, default 2 and 11
        Inclusive bounds on the size of the edges that are kept
        (the defaults follow the original "as in Landry's" filter).

    Returns
    -------
    None
        Every result is printed rather than returned.
    """
    H = xgi.load_xgi_data(fn)
    ## merge duplicate edges (compared as sorted tuples) and apply the size filter
    E = list(set([tuple(sorted(e)) for e in H.edges.members() if min_size <= len(e) <= max_size]))
    E = [set(e) for e in E]
    ## node set restricted to the nodes that appear in a retained edge
    V = list(set([i for e in E for i in e]))
    print('\nResults for',fn,':')
    print('n =',len(V),'\nm =',len(E),'\n<s> =',np.mean([len(e) for e in E]))
    print('SF:',spl.get_simplicial_fraction(V,E),
          '\nES:',spl.get_edit_simpliciality(V,E),
          '\nFES with max face:',spl.get_face_edit_simpliciality(V,E,exclude_self=False),
          '\nFES excluding max face:',spl.get_face_edit_simpliciality(V,E,exclude_self=True))
    print('simplicial ratio with CL model:',spl.get_simplicial_ratio(V, E, samples=samples, multisets=False))
    ## sr.simplicial_ratio returns five values; only the first one is reported here
    r, _a, _b, _c, _d = sr.simplicial_ratio(V,E)
    print('simplicial ratio with estimation:',r)
    

In [None]:
## Names of the XGI datasets processed in this section
Datasets = [
    "contact-primary-school",
    "contact-high-school",
    "hospital-lyon",
    "email-enron",
    "email-eu",
    "diseasome",
    "disgenenet",
    "ndc-substances",
    "congress-bills",
    "tags-ask-ubuntu",
]

## Print the statistics and simpliciality measures of each dataset in turn
for fn in Datasets:
    compute_all(fn)

In [None]:
## Sanity check -- naive FES (as this is a tiny graph)
H = xgi.load_xgi_data("diseasome")
## merge duplicate edges and keep sizes 2..11, same filter as compute_all
E = list(set([tuple(sorted(e)) for e in H.edges.members() if len(e)<=11 and len(e)>=2]))
E = [set(e) for e in E]

## naive max set for sanity check: edges with more than 2 nodes that are
## not a proper subset of any other edge
L = [e for e in E if len(e)>2 and not any(e<f for f in E)]
max_E = spl.max_subsets(E)

## both counters should contain only True if spl.max_subsets agrees with the naive list
print('checking max set function:',Counter([e in L for e in max_E]),Counter([e in max_E for e in L]))

## hashed copy of the edges so that each face lookup is a set membership
## test instead of a scan through the list E
E_lookup = set(frozenset(e) for e in E)

fes = 0
for e in max_E:
    num=den=0
    for k in range(2,len(e)): ## excluding self
        for f in combs(e,k):
            den +=1
            if frozenset(f) in E_lookup:
                num += 1
    ## NOTE(review): assumes every edge in max_E has more than 2 nodes (as in L);
    ## otherwise den stays 0 and this division fails -- confirm
    fes += num/den
print('FES:',fes/len(max_E))


## With time info

In [None]:
def compute_all_time(fn, samples=100, min_size=2, max_size=11):
    """
    Print simplicial ratios for one timestamped XGI dataset.

    Edges are ordered by their ``'timestamp'`` attribute, only the first
    occurrence of each distinct edge is kept, and edges whose size lies
    outside ``[min_size, max_size]`` are dropped.

    Parameters
    ----------
    fn : str
        Dataset name handed to ``xgi.load_xgi_data``; every edge must carry
        a ``'timestamp'`` attribute.
    samples : int, default 100
        Number of samples passed to ``spl.get_simplicial_ratio``.
    min_size, max_size : int, default 2 and 11
        Inclusive bounds on the size of the edges that are kept
        (the defaults follow the original "as in Landry" filter).

    Returns
    -------
    None
        Every result is printed rather than returned.
    """

    ## load dataset
    H = xgi.load_xgi_data(fn)

    ## time-order the edges; a stable sort keeps edges with equal timestamps
    ## in their dataset order, so the ordering does not depend on the sort algorithm
    TS = [H.edges[e]['timestamp'] for e in H.edges]
    M = H.edges.members()
    L = [tuple(sorted(M[i])) for i in np.argsort(TS, kind="stable")]

    ## keep only first instance for each edge
    seen = set()
    uniq = []
    for x in L:
        if x not in seen:
            if min_size <= len(x) <= max_size: ## as in Landry
                uniq.append(x)
            seen.add(x)
    E = [set(e) for e in uniq]

    ## compute SR's (overall, up, down)
    V = list(set([i for e in E for i in e]))
    print(fn)
    s_ratio = spl.get_simplicial_ratio(V, E, samples=samples, multisets=False, edge_order=False)
    ## with edge_order=True the result is indexed; its first two entries are printed
    s_ratio_time = spl.get_simplicial_ratio(V, E, samples=samples, multisets=False, edge_order=True)
    print('simplicial ratio (CL):',s_ratio, s_ratio_time[0], s_ratio_time[1])
    ## sr.simplicial_ratio returns five values; the first three are reported
    r, _a, _b, _c, _d = sr.simplicial_ratio(V,E)
    print('simplicial ratios (estimate):',r,_a,_b)


In [None]:
## Datasets for the time-aware analysis; compute_all_time reads a
## 'timestamp' attribute on every edge
Datasets = [
    "contact-primary-school",
    "contact-high-school",
    "hospital-lyon",
    "email-enron",
    "email-eu",
    "congress-bills",
    "tags-ask-ubuntu",
]

## Print the simplicial ratios of each dataset in turn
for fn in Datasets:
    compute_all_time(fn)