In [1]:
import numpy as np
import networkx as nx
import xgi
import json
from os import listdir
from itertools import combinations
from scipy.spatial import distance
from multiprocess import Pool

from netsimile import *
from portrait_divergence import *
from hypergraphs_distances import *

# Load datasets

In [3]:
# Load every dataset and build two parallel lists: the hypergraphs (H_list)
# and a short human-readable label for each of them (labels).
data_dir = ['SocioPatterns', 'Conferences', 'Utah', 'CopNS', 'APS', 'Online', 'Congress']
# File-name fragments stripped, in this order, to shorten the dataset labels.
label_noise = ('.json', 'aggr_15min_cliques_thr1_', '_hypergraph',
               'hyperedges_', '_simplices', '_aggr_5min')
H_list, labels = [], []

for folder in data_dir:
    path = f'../data/{folder}'
    files = sorted(listdir(path))

    for f in files:
        # skip dot-files
        if f.startswith('.'):
            continue

        with open(f'{path}/{f}') as file:
            data_ = json.load(file)

        # keep only hyperedges with more than one member
        data = [edge for edge in data_ if len(edge) > 1]
        H = xgi.from_hyperedge_list(data)
        # relabel and remove possible multiple edges
        # NOTE(review): relies on xgi's cleanup defaults for relabel/multiedges -- confirm
        H.cleanup(isolates=True, singletons=True, connected=False)

        # label = "<folder>_<file name>" with the noisy fragments removed
        l = f'{folder}_{f}'
        for fragment in label_noise:
            l = l.replace(fragment, '')
        labels.append(l)
        H_list.append(H)

# one summary line per dataset: size and connectivity
for l, H in zip(labels, H_list):
    print(f'{l}:  nodes={len(H.nodes)}, edges={len(H.edges)}, connected={xgi.is_connected(H)}')

SocioPatterns_InVS13:  nodes=92, edges=603, connected=True
SocioPatterns_InVS15:  nodes=217, edges=3279, connected=True
SocioPatterns_LH10:  nodes=76, edges=1102, connected=True
SocioPatterns_LyonSchool:  nodes=242, edges=10848, connected=True
SocioPatterns_SFHH:  nodes=403, edges=6398, connected=True
SocioPatterns_Thiers13:  nodes=327, edges=4795, connected=True
Conferences_ECIR19:  nodes=172, edges=14068, connected=True
Conferences_ECSS18:  nodes=164, edges=12614, connected=True
Utah_elem_day2_0h-12h:  nodes=321, edges=5884, connected=True
Utah_elem_day2_12h-24h:  nodes=314, edges=9875, connected=True
CopNS_day2:  nodes=471, edges=11113, connected=False
CopNS_day4:  nodes=471, edges=11060, connected=False
APS_PRA_1992_1996:  nodes=10329, edges=5339, connected=False
APS_PRB_1992_1996:  nodes=31818, edges=18572, connected=False
APS_PRC_1992_1996:  nodes=7763, edges=3167, connected=False
APS_PRD_1992_1996:  nodes=6465, edges=4135, connected=False
APS_PRD_1997_2001:  nodes=7762, edges=58

In [5]:
# For each dataset, tally how many hyperedges there are of each size:
# s_dict[label] maps hyperedge size -> number of hyperedges of that size.
s_dict = {}
for k, h in zip(labels, H_list):
    sizes = list(map(len, h.edges.members()))
    s_dict[k] = dict((s, sizes.count(s)) for s in set(sizes))

s_dict

{'SocioPatterns_InVS13': {2: 477, 3: 119, 4: 7},
 'SocioPatterns_InVS15': {2: 1610,
  3: 1179,
  4: 290,
  5: 109,
  6: 53,
  7: 17,
  8: 17,
  9: 3,
  10: 1},
 'SocioPatterns_LH10': {2: 197, 3: 437, 4: 308, 5: 109, 6: 39, 7: 12},
 'SocioPatterns_LyonSchool': {2: 529,
  3: 3340,
  4: 3743,
  5: 2046,
  6: 809,
  7: 298,
  8: 77,
  9: 5,
  10: 1},
 'SocioPatterns_SFHH': {2: 3130,
  3: 2266,
  4: 749,
  5: 191,
  6: 45,
  7: 8,
  8: 2,
  9: 6,
  10: 1},
 'SocioPatterns_Thiers13': {2: 1308, 3: 2150, 4: 1003, 5: 265, 6: 58, 7: 11},
 'Conferences_ECIR19': {2: 3404,
  3: 4787,
  4: 3412,
  5: 1579,
  6: 679,
  7: 196,
  8: 10,
  9: 1},
 'Conferences_ECSS18': {2: 2229,
  3: 3558,
  4: 3278,
  5: 1886,
  6: 1070,
  7: 478,
  8: 101,
  9: 13,
  10: 1},
 'Utah_elem_day2_0h-12h': {2: 373,
  3: 1162,
  4: 1624,
  5: 1338,
  6: 881,
  7: 306,
  8: 114,
  9: 26,
  10: 15,
  11: 41,
  15: 2,
  16: 2},
 'Utah_elem_day2_12h-24h': {2: 420,
  3: 1562,
  4: 2494,
  5: 2501,
  6: 1618,
  7: 686,
  8: 323,


# Compute distances

In [9]:
%%time
G_list = [xgi.to_graph(h) for h in H_list]

# parallelize computations
p = Pool(processes=5)
FVs = p.map(feature_vec, H_list)
print('Feature vectors done.')
HPs = p.map(edge_portrait, H_list)
print('Hyperedge portraits done.')
fvs = p.map(graph_signature, G_list)
print('Graph signatures done.')
nps = p.map(portrait, G_list)
    
# compute distances between all pairs of hypergraphs
ds = distance.pdist(np.array(FVs), metric='canberra')
HNS_dists = list(ds / len(FVs[0]))
HPD_dists = [hyper_portrait_divergence(HPs[i],HPs[j]) 
             for i,j in combinations(range(len(HPs)), 2) ]

# compute distances between all pairs of networks
ds = distance.pdist(np.array(fvs), metric='canberra')
ns_dists = list(ds / len(fvs[0]))
pd_dists = [portrait_divergence(nps[i],nps[j]) 
            for i,j in combinations(range(len(nps)), 2) ]
    
# save results
results = (labels, HNS_dists, HPD_dists, ns_dists, pd_dists)
with open('../results/HNS_HPD_NS_PD_distances_data.json', 'w') as res_file:
    json.dump(results, res_file)

Feature vectors done.
Hyperedge portraits done.
Graph signatures done.
CPU times: user 6min 9s, sys: 4min 9s, total: 10min 19s
Wall time: 21h 35min 49s
