In [1]:
import numpy as np
import networkx as nx
import xgi
import json
from tqdm import tqdm
from os import listdir
from itertools import combinations
from scipy.spatial import distance
from multiprocess import Pool

In [2]:
from netsimile import *
from portrait_divergence import *
from hypergraphs_distances import *
from hypergraphs_null_models import *

# Reshuffling methods

I create several realizations of each null model and compute the similarity matrix with all measures (HNS, HPD, NS, PD). I also include the original hypergraph. The aim is to see if different reshuffling methods are correctly clustered by the similarity measures.

Reshuffling methods: 
1) Random edge-shuffling (RS): we keep the number and sizes of hyperedges and place them randomly over the nodes.

2) Degree-proportional edge-shuffling (PS): as before, but the probability for each node to be chosen is proportional to its hyperdegree (i.e. number of hyperedges it is part of) in the original hypergraph.

3) Degree-preserving edge-shuffling (configuration model) (DS): we preserve the number of hyperedges at each order and the degree of every node at each order. This should only affect the community structure of the original hypergraph.

Note: the reshuffling routine might leave the hypergraph disconnected.

In [6]:
# Load every dataset file and build a dictionary of hypergraphs keyed by a short name
data_dir = ['SocioPatterns', 'Conferences', 'Utah', 'CopNS', 'APS', 'Online', 'Congress']
# filename fragments stripped, in this order, to obtain the short dataset key
name_noise = ('.json', 'aggr_15min_cliques_thr1_', '_hypergraph',
              'hyperedges_', '_simplices', '_aggr_5min')
H_dict = {}
labels = []

for folder in data_dir:
    path = f'../data/{folder}'
    # hidden files (names starting with '.') are skipped; the rest are visited in sorted order
    visible = [fname for fname in sorted(listdir(path)) if not fname.startswith('.')]

    for fname in visible:
        with open(f'{path}/{fname}') as handle:
            raw_edges = json.load(handle)

        # keep only hyperedges with at least two members
        H = xgi.from_hyperedge_list([edge for edge in raw_edges if len(edge) > 1])
        # in-place cleanup (original note: relabels nodes and removes repeated hyperedges)
        H.cleanup(isolates=True, singletons=True, connected=False)

        key = f'{folder}_{fname}'
        for fragment in name_noise:
            key = key.replace(fragment, '')

        H_dict[key] = H
        labels.append(key)

# one summary line per loaded hypergraph
for name, hyper in H_dict.items():
    print(f'{name}:  nodes={len(hyper.nodes)}, edges={len(hyper.edges)}, connected={xgi.is_connected(hyper)}')

SocioPatterns_InVS13:  nodes=92, edges=603, connected=True
SocioPatterns_InVS15:  nodes=217, edges=3279, connected=True
SocioPatterns_LH10:  nodes=76, edges=1102, connected=True
SocioPatterns_LyonSchool:  nodes=242, edges=10848, connected=True
SocioPatterns_SFHH:  nodes=403, edges=6398, connected=True
SocioPatterns_Thiers13:  nodes=327, edges=4795, connected=True
Conferences_ECIR19:  nodes=172, edges=14068, connected=True
Conferences_ECSS18:  nodes=164, edges=12614, connected=True
Utah_elem_day2_0h-12h:  nodes=321, edges=5884, connected=True
Utah_elem_day2_12h-24h:  nodes=314, edges=9875, connected=True
CopNS_day2:  nodes=471, edges=11113, connected=False
CopNS_day4:  nodes=471, edges=11060, connected=False
APS_PRA_1992_1996:  nodes=10329, edges=5339, connected=False
APS_PRB_1992_1996:  nodes=31818, edges=18572, connected=False
APS_PRC_1992_1996:  nodes=7763, edges=3167, connected=False
APS_PRD_1992_1996:  nodes=6465, edges=4135, connected=False
APS_PRD_1997_2001:  nodes=7762, edges=58

## Compute similarity matrices for various datasets

In [9]:
# number of samples of each null model
n_samp = 50
# number of worker processes used to compute features/portraits of the samples
n_proc = 5
to_reshuff = ['SocioPatterns_LH10']

# Null models and their short labels, kept side by side so the saved labels
# always follow the order in which the samples are generated.
shuff_labels = ['RS', 'PS', 'DS']
shuff_methods = [edge_shuffled_hypergraph, dp_edge_shuffled_hypergraph, configuration_model_hypergraph]

for tag in to_reshuff:

    H = H_dict[tag]
    G = xgi.to_graph(H)
    # Entry 0 of every list belongs to the original hypergraph (H) or to the
    # graph obtained from it (G); the null-model samples are appended after it.
    FVs = [feature_vec(H)]
    HPs = [edge_portrait(H)]
    fvs = [graph_signature(G)]
    nps = [portrait(G)]
    labels = ['original']

    # A single pool serves all null models. The `with` block shuts the workers
    # down on exit; previously a new pool was created per method and never closed,
    # leaving worker processes alive after the cell finished.
    with Pool(processes=n_proc) as p:
        for l, F in tqdm(zip(shuff_labels, shuff_methods), total=len(shuff_methods)):
            # seeds 0..n_samp-1 make the samples reproducible
            H_null = [F(H, seed=i) for i in range(n_samp)]
            G_null = [xgi.to_graph(h) for h in H_null]
            # parallelize computations (p.map blocks and preserves input order)
            FVs += p.map(feature_vec, H_null)
            HPs += p.map(edge_portrait, H_null)
            fvs += p.map(graph_signature, G_null)
            nps += p.map(portrait, G_null)
            labels += [f'{l}{i}' for i in range(n_samp)]

    # compute distances between all pairs of hypergraphs
    # pdist returns the condensed upper triangle, i.e. the same (i, j) order
    # as combinations(range(n), 2) used for the portrait divergences below
    ds = distance.pdist(np.array(FVs), metric='canberra')
    norm = len(FVs[0])  # divide by the feature-vector length
    HNS_dists = list(ds / norm)
    HPD_dists = [hyper_portrait_divergence(HPs[i], HPs[j])
                 for i, j in combinations(range(len(HPs)), 2)]

    # compute distances between all pairs of networks
    ds = distance.pdist(np.array(fvs), metric='canberra')
    norm = len(fvs[0])
    ns_dists = list(ds / norm)
    pd_dists = [portrait_divergence(nps[i], nps[j])
                for i, j in combinations(range(len(nps)), 2)]

    results = (labels, HNS_dists, HPD_dists, ns_dists, pd_dists)

    # save results
    with open(f'../results/reshuffling_HNS_HPD_NS_PD_{tag}.json', 'w') as res_file:
        json.dump(results, res_file)

100%|████████████████████████████████████████████| 3/3 [23:04<00:00, 461.35s/it]
