In [1]:
import pandas as pd
import numpy as np
import igraph as ig
import xgi ## pip install xgi is required
import pickle
from collections import Counter
from functools import reduce
import itertools
from scipy.special import comb
import warnings
import random
from itertools import combinations
from itertools import combinations as combs

## Functions to compute various simpliciality measures are found in the included ```simpliciality.py``` file:
import simpliciality as spl 
## new estimate for simpliciality ratio
import sr  

## Set this to the data directory
## NOTE(review): the cells reviewed here load their data through xgi.load_xgi_data and
## do not read `datadir`; presumably it is used by other cells -- verify before removing.
datadir='../Datasets/'


# XGI's hypergraphs



### get the data directly from XGI

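For each dataset we report:

* the number of nodes, the number of edges, and the average edge size
* FES: first including the maximal face itself, then excluding it
* Simplicial ratio: first via sampling from the CL model, then with the new approximation code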

Simplicial ratio (SR) obtained by sampling from the CL model, in decreasing order:

* disgenenet: 28.0821
* diseasome: 6.6147
* contact-high-school: 6.5836
* email-eu: 5.1355
* email-enron: 4.8800
* congress-bills: 4.4154
* ndc-substances: 4.0749
* contact-primary-school: 2.7043
* hospital-lyon: 0.9464
* tags-ask-ubuntu: 0.6652


In [2]:
def compute_all(fn="", samples=1000, min_size=2, max_size=11):
    """Load an XGI dataset and print its size statistics and simpliciality measures.

    Parameters
    ----------
    fn : str
        Dataset name handed to ``xgi.load_xgi_data``.
    samples : int, optional
        Value forwarded as ``samples`` to ``spl.get_simplicial_ratio``
        (default 1000, the value previously hard-coded here).
    min_size, max_size : int, optional
        Inclusive bounds on the size of the edges that are kept
        (defaults 2 and 11, the values previously hard-coded here).

    Returns
    -------
    None
        Every result is printed; nothing is returned.
    """
    H = xgi.load_xgi_data(fn)

    ## keep a single copy of each distinct edge within the size bounds (as in Landry's)
    E = list(set([tuple(sorted(e)) for e in H.edges.members() if min_size <= len(e) <= max_size]))
    E = [set(e) for e in E]
    ## the vertex set is restricted to nodes that appear in at least one kept edge
    V = list(set([i for e in E for i in e]))

    print('\nResults for',fn,':')
    print('n =',len(V),'\nm =',len(E),'\n<s> =',np.mean([len(e) for e in E]))
    print('SF:',spl.get_simplicial_fraction(V,E),
          '\nES:',spl.get_edit_simpliciality(V,E),
          '\nFES with max face:',spl.get_face_edit_simpliciality(V,E,exclude_self=False),
          '\nFES excluding max face:',spl.get_face_edit_simpliciality(V,E,exclude_self=True))

    ## sampling-based value; the sample size is controlled by the `samples` argument
    print('simplicial ratio with CL model:',spl.get_simplicial_ratio(V, E, samples=samples, multisets=False))

    ## sr.simplicial_ratio returns five values; only the first one is reported here
    r, _a, _b, _c, _d = sr.simplicial_ratio(V,E)
    print('simplicial ratio with estimation:',r)
    

In [3]:
## Names of the datasets to analyse; each one is passed to compute_all, which loads it
## with xgi.load_xgi_data.
Datasets = ["contact-primary-school", "contact-high-school", "hospital-lyon", 
            "email-enron", "email-eu", "diseasome", "disgenenet", 
            "ndc-substances", "congress-bills", "tags-ask-ubuntu"]

## Print the size statistics and simpliciality measures of every dataset in turn.
for fn in Datasets:
    compute_all(fn)


Results for contact-primary-school :
n = 242 
m = 12704 
<s> = 2.4188444584382873
SF: 0.8470540758676351 
ES: 0.9175213057922866 
FES with max face: 0.9506445570463496 
FES excluding max face: 0.9372632240823688
simplicial ratio with CL model: 2.700526468460238
simplicial ratio with estimation: 2.7146791484268467

Results for contact-high-school :
n = 327 
m = 7818 
<s> = 2.3269378357636223
SF: 0.8064655172413793 
ES: 0.9270722162931341 
FES with max face: 0.9393321276864209 
FES excluding max face: 0.9230435574995467
simplicial ratio with CL model: 6.5749169649746175
simplicial ratio with estimation: 6.516760322905618

Results for hospital-lyon :
n = 75 
m = 1824 
<s> = 2.4270833333333335
SF: 0.9121338912133892 
ES: 0.9544740973312402 
FES with max face: 0.9774536866642135 
FES excluding max face: 0.9718947368421053
simplicial ratio with CL model: 0.9459274371403484
simplicial ratio with estimation: 0.9656569856027547

Results for email-enron :
n = 143 
m = 1442 
<s> = 2.974341192787

In [4]:
## Sanity check -- naive FES (as this is a tiny graph)
H = xgi.load_xgi_data("diseasome")
E = list(set([tuple(sorted(e)) for e in H.edges.members() if len(e)<=11 and len(e)>=2]))
E = [set(e) for e in E]

## naive max set for sanity check: edges with more than 2 nodes that are not a
## proper subset of any other edge (same order as E)
L = [e for e in E if len(e)>2 and not any(e<f for f in E)]
max_E = spl.max_subsets(E)

## both counters should contain only True if spl.max_subsets agrees with the naive list
print('checking max set function:',Counter([e in L for e in max_E]),Counter([e in max_E for e in L]))

## frozenset copies of the edges give constant-time membership tests in the loop below
E_lookup = set(frozenset(e) for e in E)

## naive FES: for every edge in max_E, the fraction of its faces of size 2..len(e)-1
## that are themselves edges, averaged over max_E
## NOTE(review): an edge of size 2 in max_E would give den == 0 below; the check above
## suggests max_E only holds edges of size > 2 -- confirm against spl.max_subsets
fes = 0
for e in max_E:
    num=den=0
    for k in range(2,len(e)): ## excluding self
        for f in combs(e,k):
            den +=1
            if frozenset(f) in E_lookup:
                num += 1
    fes += num/den
print('FES:',fes/len(max_E))


checking max set function: Counter({True: 159}) Counter({True: 159})
FES: 0.04065214786039885


## With time info

In [5]:
def compute_all_time(fn, samples=100, min_size=2, max_size=11):
    """Load a timestamped XGI dataset and print its simplicial ratios.

    Edges are sorted by their ``'timestamp'`` attribute, only the first occurrence
    of each distinct edge is kept, and the simplicial ratio is printed both from
    ``spl.get_simplicial_ratio`` (with and without ``edge_order``) and from
    ``sr.simplicial_ratio``.

    Parameters
    ----------
    fn : str
        Dataset name handed to ``xgi.load_xgi_data``; every edge must carry a
        ``'timestamp'`` attribute.
    samples : int, optional
        Value forwarded as ``samples`` to both ``spl.get_simplicial_ratio`` calls
        (default 100, the value previously hard-coded here).
    min_size, max_size : int, optional
        Inclusive bounds on the size of the edges that are kept
        (defaults 2 and 11, the values previously hard-coded here).

    Returns
    -------
    None
        Every result is printed; nothing is returned.
    """

    ## load dataset
    H = xgi.load_xgi_data(fn)

    ## time-order the edges
    ## NOTE(review): assumes H.edges.members() lists the edges in the same order as
    ## iterating over H.edges -- TODO confirm against the XGI documentation
    TS = [H.edges[e]['timestamp'] for e in H.edges]
    M = H.edges.members()
    L = [tuple(sorted(M[i])) for i in np.argsort(TS)]

    ## keep only first instance for each edge (dict keys preserve first-seen order),
    ## then drop the edges outside the size bounds (as in Landry)
    E = [set(x) for x in dict.fromkeys(L) if min_size <= len(x) <= max_size]

    ## compute SR's (overall, up, down)
    V = list(set([i for e in E for i in e]))
    print(fn)
    ## the sample size of both sampling-based calls is controlled by `samples`
    s_ratio = spl.get_simplicial_ratio(V, E, samples=samples, multisets=False, edge_order=False)
    s_ratio_time = spl.get_simplicial_ratio(V, E, samples=samples, multisets=False, edge_order=True)
    print('simplicial ratio (CL):',s_ratio, s_ratio_time[0], s_ratio_time[1])
    ## sr.simplicial_ratio returns five values; the first three are reported here
    r, _a, _b, _c, _d = sr.simplicial_ratio(V,E)
    print('simplicial ratios (estimate):',r,_a,_b)


In [6]:
## Names of the datasets to analyse with time information; each one is passed to
## compute_all_time, which reads a 'timestamp' attribute on every edge.
Datasets = ["contact-primary-school", "contact-high-school", "hospital-lyon", 
            "email-enron", "email-eu",
            "congress-bills", "tags-ask-ubuntu"]

## Print the simplicial ratios of every dataset in turn.
for fn in Datasets:
    compute_all_time(fn)

contact-primary-school
simplicial ratio (CL): 2.7019228203143766 4.74447142102518 0.654214484368673
simplicial ratios (estimate): 2.7273665096684905 4.793726727778336 0.6610062915586452
contact-high-school
simplicial ratio (CL): 6.553400061061647 10.991534173033244 2.1276068552550074
simplicial ratios (estimate): 6.50905490909669 10.906887777670455 2.1112220405229256
hospital-lyon
simplicial ratio (CL): 0.9429207600535587 1.7228503971456803 0.16821175068453884
simplicial ratios (estimate): 0.944196337154723 1.7204183757968785 0.16797429851256737
email-enron
simplicial ratio (CL): 4.871424609587927 6.855382613770078 2.8910084089887214
simplicial ratios (estimate): 4.9080740748348 6.904448139039297 2.9117000106303035
email-eu
simplicial ratio (CL): 5.132514837922883 7.679424688976842 2.575633937415726
simplicial ratios (estimate): 5.2594715285852285 7.877032590265377 2.64191046690508
congress-bills
simplicial ratio (CL): 4.408676655770777 5.1565584543541 3.636166691617646
simplicial rati