In [1]:
#!pip install networkx==3.1
#!pip install scipy==1.10.1

In [2]:
import networkx as nx
import time
import random
from tqdm import tqdm
import networkx.algorithms.community as nx_comm
import pickle
import pandas as pd

import src.riondato as riondato
import src.sweg as sweg
import src.summary_utils as summary_utils

import src.mykcore as k_core

In [3]:
import numpy
import sklearn
import scipy
# Report the library and interpreter versions this notebook is executed with.
from importlib.metadata import version
from platform import python_version

version_report = [
    ('numpy', numpy.__version__),
    ('sklearn', sklearn.__version__),
    ('scipy', scipy.__version__),
    ('pandas', pd.__version__),
    ('networkx', nx.__version__),
    ('tqdm', version('tqdm')),
    ('python', python_version()),
]
for package_name, package_version in version_report:
    print(package_name + ' version: ' + str(package_version))

numpy version: 1.20.3
sklearn version: 0.24.2
scipy version: 1.10.1
pandas version: 1.2.4
networkx version: 3.1
tqdm version: 4.65.0
python version: 3.9.17


In [None]:
# Toggles read by the query cells below to decide which summary families
# (S2L pickles and/or SWeG result folders) are evaluated.
RUN_S2L = True
RUN_sweg = True
# Graph class handed to the summary builders and the possible-world sampler.
graph_type = nx.Graph

In [None]:
sweg_result_path = 'sweg/GSCIS_TBUDS-main-noeclipse/'


def load_sweg_summary(summary_folder_path):
    """Load a SWeG summary stored in ``summary_folder_path``.

    The folder is expected to hold two files that are parsed with the
    ``sweg`` helper module: ``G.txt`` (the node partition) and ``P.txt``
    (the summary edges).

    Returns a ``(partition, node2supernode, s)`` triple, where ``s`` is the
    summary graph built by ``sweg.get_sweg_summary``.
    """
    supernode_partition, node_to_supernode = sweg.parse_sweg_partition(summary_folder_path + '/G.txt')
    summary_edges = sweg.parse_sweg_edges(summary_folder_path + '/P.txt')
    summary_graph = sweg.get_sweg_summary(supernode_partition, summary_edges)
    return supernode_partition, node_to_supernode, summary_graph

In [None]:
def load_S2L_summary(summary_path):
    """Load a pickled S2L summary and build its two summary graphs.

    The pickle must contain the 5-tuple
    ``(S, S_prob, partition, node2supernode, S_avgweight)``. Two graphs are
    derived through ``riondato.get_sl2_summary`` using the notebook-level
    ``graph_type``: ``s`` (from ``S``) and ``s_aw`` (from ``S_avgweight``).

    Returns ``(S, S_prob, partition, node2supernode, S_avgweight, s, s_aw)``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted summary files.
    with open(summary_path, 'rb') as pickled_summary:
        stored = pickle.load(pickled_summary)
    S, S_prob, partition, node2supernode, S_avgweight = stored

    s, s_aw = [riondato.get_sl2_summary(source, graph_type) for source in (S, S_avgweight)]
    return S, S_prob, partition, node2supernode, S_avgweight, s, s_aw

In [None]:
# One descriptor per dataset: input file and its separator, short name used in
# file names, the S2L summary sizes ('ks') and, where available, the SWeG
# result folders ('sweg_summaries'). gnutella and ubuntu have no SWeG entry.
facebook = dict(
    file='data/fb.csv',
    name='fb',
    isDirected=False,
    isWeighted=False,
    sep=',',
    ks=[350, 500, 750],
    sweg_summaries=['compressed_fb_300_0.54', 'compressed_fb_100_0.72', 'compressed_fb_80_0.18'],
)
lastfm = dict(
    file='data/lastfm-GCC.csv',
    name='lastfmGCC',
    isDirected=False,
    isWeighted=False,
    sep=',',
    ks=[500, 750, 1000],
    sweg_summaries=['compressed_lastfm-GCC_300_0.72', 'compressed_lastfm-GCC_200_0.36', 'compressed_lastfm-GCC_80_0.54'],
)
enron = dict(
    file='data/en-GCC.csv',
    name='enGCC',
    isDirected=False,
    isWeighted=False,
    sep=',',
    ks=[1000, 1500, 2000],
    sweg_summaries=['compressed_en-GCC_300_0.9', 'compressed_en-GCC_100_0.9', 'compressed_en-GCC_80_0.18'],
)
gnutella = dict(
    file='data/gnutella.csv',
    name='gnutella',
    isDirected=True,
    isWeighted=False,
    sep=',',
    ks=[500, 750, 1000],
)
ubuntu = dict(
    file='data/ubuntu.csv',
    name='ubuntu',
    isDirected=False,
    isWeighted=True,
    sep=',',
    ks=[350, 500, 750],
)
#asskitter = {'file':'data/as-skitter.txt', 'name':'as-skitter', 'isDirected':False, 'isWeighted':False, 'sep':' ', 'ks':[1000,10000,25000], 'sweg_summaries':['compressed_as-skitter_100_0.72','compressed_as-skitter_100_0.18','compressed_as-skitter_80_0.18']}

#all_datasets_and_parameters = [facebook, lastfm, enron, gnutella, ubuntu, asskitter]
all_datasets_and_parameters = [facebook, lastfm, enron, gnutella, ubuntu]

In [None]:
# Print, for every dataset and every S2L summary size, the size of the input
# graph next to basic statistics of the summary graph.
graph_type = nx.Graph
for dataset in all_datasets_and_parameters:
    input_graph_path, input_graph_name = dataset['file'], dataset['name']
    sep, ks = dataset['sep'], dataset['ks']
    G = summary_utils.load_original_graph(input_graph_path, sep)
    print('--------------------')
    for ik, k in enumerate(ks):
        summary_path = 'data/' + input_graph_name + '_summary_S2L_' + str(k) + '.pickle'
        with open(summary_path, 'rb') as summary_file:
            (S, S_prob, partition, node2supernode, S_avgweight) = pickle.load(summary_file)
        s = riondato.get_sl2_summary(S, graph_type)
        s_aw = riondato.get_sl2_summary(S_avgweight, graph_type)
        lcc_size, ccs = summary_utils.get_size_largest_cc_and_number_cc(s)
        selfloop_nodes = len(list(nx.nodes_with_selfloops(s)))
        lcc_fraction = round(lcc_size/s.number_of_nodes(),3)
        print(
            input_graph_name,
            G.number_of_nodes(),
            G.number_of_edges(),
            s.number_of_nodes(),
            s.number_of_edges(),
            selfloop_nodes,
            ccs,
            lcc_size,
            lcc_fraction,
        )

In [None]:
# Print one tab-separated statistics line per S2L summary file.
graph_type = nx.Graph
for dataset in all_datasets_and_parameters:
    input_graph_name = dataset['name']
    sep = dataset['sep']
    ks = dataset['ks']
    for ik, k in enumerate(ks):
        summary_path = 'data/' + input_graph_name + '_summary_S2L_' + str(k) + '.pickle'
        with open(summary_path, 'rb') as summary_file:
            (S, S_prob, partition, node2supernode, S_avgweight) = pickle.load(summary_file)
        s = riondato.get_sl2_summary(S, graph_type)
        s_aw = riondato.get_sl2_summary(S_avgweight, graph_type)

        # NOTE(review): the weights are summed over the edges of `s` but the
        # total is divided by the edge count of `s_aw`; the value is only used
        # by the commented-out print below. Confirm which graph is intended.
        avgw = 0
        for source, target in s.edges:
            u, v = int(source), int(target)
            avgw += float(s.edges[u,v]['weight'])
        avgw /= len(s_aw.edges)

        lcc_size, ccs = summary_utils.get_size_largest_cc_and_number_cc(s)

        fields = [
            summary_path,
            s.number_of_nodes(),
            s.number_of_edges(),
            len(list(nx.nodes_with_selfloops(s))),
            ccs,
            lcc_size,
            round(lcc_size/s.number_of_nodes(),3),
        ]
        print('\t'.join(str(field) for field in fields))
        #print(summary_path,avgw,len(s.edges),len(s_aw.edges),len(list(nx.nodes_with_selfloops(s))), ccs, lcc_size, round(lcc_size/s.number_of_nodes(),3))

In [None]:
# Print one tab-separated statistics line per SWeG summary; datasets without
# a 'sweg_summaries' entry are skipped.
graph_type = nx.Graph
for dataset in all_datasets_and_parameters:
    for sweg_summary in dataset.get('sweg_summaries', []):
        summary_path = sweg_result_path + sweg_summary
        partition, node2supernode, s = load_sweg_summary(sweg_result_path + sweg_summary)
        lcc_size, ccs = summary_utils.get_size_largest_cc_and_number_cc(s)
        fields = [
            summary_path,
            s.number_of_nodes(),
            s.number_of_edges(),
            len(list(nx.nodes_with_selfloops(s))),
            ccs,
            lcc_size,
            round(lcc_size/s.number_of_nodes(),3),
        ]
        print('\t'.join(str(field) for field in fields))
        #print(summary_path,s.number_of_nodes(), s.number_of_edges(), len(list(nx.nodes_with_selfloops(s))), ccs, lcc_size, round(lcc_size/s.number_of_nodes(),3))

# Numerical (global) & partitioning queries

In [None]:
def _start_result_row(results, dataset_label, supernodes_label, method, out_sep):
    """Append the identifying cells of a new result row; return the row's printable prefix."""
    results['DATASET'].append(dataset_label)
    results['METHOD'].append(method)
    results['#SUPERNODES'].append(supernodes_label)
    return dataset_label + out_sep + supernodes_label + out_sep + method + out_sep


def _append_score(results, score_column, time_column, score, runtime, rounding, out_sep):
    """Append a rounded score/runtime pair to `results`; return both joined by `out_sep`."""
    score_text = str(round(score, rounding))
    runtime_text = str(round(runtime, rounding))
    results[score_column].append(score_text)
    results[time_column].append(runtime_text)
    return score_text + out_sep + runtime_text


def numerical_queries(input_graph_path, sep, input_graph_name, summary_path, summary_type, k, graph_type, n_worlds, max_iter, seed, out_sep, rounding, results, use_tqdm=False, compute_on_input_graph=True):
    """Evaluate average clustering coefficient and modularity on a graph and one of its summaries.

    For each evaluated method a row is appended to ``results`` and, once the
    modularity value is known, printed as an ``out_sep``-separated line.
    The methods are 'Actual' (optional), 'NaiveGPQPS - unweighted' and, for
    S2L summaries only, 'NaiveGPQPS - weighted' plus the three
    'ProbabilisticGPQPS' variants.

    Parameters
    ----------
    input_graph_path, sep : path and column separator passed to
        ``summary_utils.load_original_graph``.
    input_graph_name : str, dataset label written in the 'DATASET' column.
    summary_path : S2L pickle path or SWeG result folder.
    summary_type : ``'S2L'`` or ``'sweg'``.
    k : not used by this function (kept for call compatibility).
    graph_type : graph class passed to ``summary_utils.world``.
    n_worlds : number of possible worlds sampled for the probabilistic methods.
    max_iter, seed : forwarded to ``summary_utils.modularity_probabilistic_agg``;
        ``seed`` also seeds the Louvain community detection so that the
        modularity rows are reproducible.
    out_sep : separator of the printed rows.
    rounding : number of decimals kept for scores and runtimes.
    results : dict of lists, mutated in place. Must provide the keys
        'DATASET', 'METHOD', '#SUPERNODES', 'CLUST COEFF',
        'CLUST COEFF - TIME (S)', 'MODULARITY', 'MODULARITY - TIME (S)'.
    use_tqdm : show progress bars when True.
    compute_on_input_graph : when True, also compute the 'Actual' row on the
        input graph; when False, the dataset name is written on the first
        summary row instead.

    Raises
    ------
    ValueError
        If ``summary_type`` is neither ``'S2L'`` nor ``'sweg'``.
    """
    if summary_type not in ('S2L', 'sweg'):
        raise ValueError("summary_type must be 'S2L' or 'sweg', got " + repr(summary_type))
    is_s2l = summary_type == 'S2L'

    #loading input graph
    print("loading input graph")
    G = summary_utils.load_original_graph(input_graph_path, sep)
    G_weight = None if not nx.is_weighted(G) else 'weight'

    #loading summary
    print("loading summary")
    if is_s2l:
        S, S_prob, partition, node2supernode, S_avgweight, s, s_aw = load_S2L_summary(summary_path)
    else:
        partition, node2supernode, s = load_sweg_summary(summary_path)
    summary_node_edges = str(s.number_of_nodes()) + '-' + str(s.number_of_edges())

    #generating worlds of ProbabilisticGPQPS method
    print("generating worlds of ProbabilisticGPQPS method")
    possible_worlds = []
    possible_worlds_aw = []
    if is_s2l:
        for _ in tqdm(range(n_worlds), disable=not use_tqdm):
            W, W_aw = summary_utils.world(graph_type, S, S_avgweight, S_prob)
            possible_worlds.append(W)
            possible_worlds_aw.append(W_aw)

    # Methods evaluated on the summary: (METHOD cell, progress tag, weight attribute)
    naive_methods = [('NaiveGPQPS - unweighted', 'NaiveGPQPS-unweighted', None)]
    # ... and on the sampled worlds: (METHOD cell, progress tag, worlds, weight attribute)
    probabilistic_methods = []
    if is_s2l:
        naive_methods.append(('NaiveGPQPS - weighted', 'NaiveGPQPS-weighted', 'weight'))
        probabilistic_methods = [
            ('ProbabilisticGPQPS - unweighted', 'ProbabilisticGPQPS-unweighted', possible_worlds, None),
            ('ProbabilisticGPQPS - weighted, avg', 'ProbabilisticGPQPS-weighted-avg', possible_worlds_aw, 'weight'),
            ('ProbabilisticGPQPS - weighted, exp', 'ProbabilisticGPQPS-weighted-exp', possible_worlds, 'weight'),
        ]

    clust_columns = ('CLUST COEFF', 'CLUST COEFF - TIME (S)')
    modularity_columns = ('MODULARITY', 'MODULARITY - TIME (S)')
    rows = {}  # METHOD cell -> printable row, completed in the modularity phase

    #average clustering coefficient
    #input graph
    if compute_on_input_graph:
        print("computing avg clustering coefficient on input graph")
        rows['Actual'] = _start_result_row(results, input_graph_name, '-', 'Actual', out_sep)
        t0 = time.time()
        score = nx.average_clustering(G, weight=G_weight)
        runtime = time.time() - t0
        rows['Actual'] += _append_score(results, *clust_columns, score, runtime, rounding, out_sep)

    #NaiveGPQPS
    for position, (method, tag, weight) in enumerate(naive_methods):
        print("computing avg clustering coefficient for " + tag)
        # Without an 'Actual' row, the first summary row carries the dataset name.
        dataset_label = input_graph_name if (position == 0 and not compute_on_input_graph) else ''
        rows[method] = _start_result_row(results, dataset_label, summary_node_edges, method, out_sep)
        t0 = time.time()
        score = nx.average_clustering(s, weight=weight)
        runtime = time.time() - t0
        rows[method] += _append_score(results, *clust_columns, score, runtime, rounding, out_sep)

    #ProbabilisticGPQPS
    for method, tag, worlds, weight in probabilistic_methods:
        print("computing avg clustering coefficient for " + tag)
        rows[method] = _start_result_row(results, '', summary_node_edges, method, out_sep)
        t0 = time.time()
        score = summary_utils.avg_cluster_coefficient_probabilistic(worlds, weight, use_tqdm=use_tqdm)
        runtime = time.time() - t0
        rows[method] += _append_score(results, *clust_columns, score, runtime, rounding, out_sep)

    #modularity
    #input graph
    if compute_on_input_graph:
        print("computing communities on input graph")
        t0 = time.time()
        community_G = nx_comm.louvain_communities(G, weight=G_weight, seed=seed)
        runtime = time.time() - t0
        print("communities on input graph computed; runtime: " + str(runtime))
        print("computing modularity of communities on input graph")
        score = nx_comm.modularity(G, community_G, weight=G_weight)
        rows['Actual'] += out_sep + _append_score(results, *modularity_columns, score, runtime, rounding, out_sep)
        print(rows['Actual'])

    #NaiveGPQPS
    for method, tag, weight in naive_methods:
        print("computing modularity for " + tag)
        # The runtime covers community detection on the summary and the mapping
        # back to input-graph nodes; the modularity evaluation is not timed.
        t0 = time.time()
        community_s = nx_comm.louvain_communities(s, weight=weight, seed=seed)
        community_sG = summary_utils.assign_community(community_s, partition)
        runtime = time.time() - t0
        score = nx_comm.modularity(G, community_sG, weight=weight)
        rows[method] += out_sep + _append_score(results, *modularity_columns, score, runtime, rounding, out_sep)
        print(rows[method])

    #ProbabilisticGPQPS
    for method, tag, worlds, weight in probabilistic_methods:
        print("computing modularity for " + tag)
        t0 = time.time()
        score = summary_utils.modularity_probabilistic_agg(G, worlds, partition, weight, G.number_of_nodes(), max_iter, seed, use_tqdm=use_tqdm)
        runtime = time.time() - t0
        rows[method] += out_sep + _append_score(results, *modularity_columns, score, runtime, rounding, out_sep)
        print(rows[method])

In [None]:
#RUN NUMERICAL & PARTITIONING QUERIES
use_tqdm = True
out_sep = '\t'
column_names = ['DATASET', '#SUPERNODES', 'METHOD', 'CLUST COEFF', 'CLUST COEFF - TIME (S)', 'MODULARITY', 'MODULARITY - TIME (S)']
header = out_sep.join(column_names)
print(header)
results_numerical = {}
for col in column_names:
    results_numerical[col] = []

graph_type = nx.Graph
n_worlds = 100
max_iter = 20
seed = 123
out_sep = '\t'
rounding = 5

for dataset in all_datasets_and_parameters:
    input_graph_path = dataset['file']
    input_graph_name = dataset['name']
    sep = dataset['sep']
    if RUN_S2L:
        ks = dataset['ks']
        for ik in range(len(ks)):
            k = ks[ik]
            summary_path = 'data/' + input_graph_name + '_summary_S2L_' + str(k) + '.pickle'
            compute_on_input_graph = True if ik == 0 else False
            numerical_queries(input_graph_path, sep, input_graph_name, summary_path, 'S2L', k, graph_type, max_iter, seed, n_worlds, out_sep, rounding, results_numerical, use_tqdm=use_tqdm, compute_on_input_graph=compute_on_input_graph)
    if RUN_sweg:
        sweg_summaries = dataset['sweg_summaries']
        for iss in range(len(sweg_summaries)):
            sweg_summary = sweg_summaries[iss]
            summary_path = sweg_result_path + sweg_summary
            #compute_on_input_graph = True if iss == 0 else False
            compute_on_input_graph = False
            numerical_queries(input_graph_path, sep, input_graph_name, summary_path, 'sweg', -1, graph_type, max_iter, seed, n_worlds, out_sep, rounding, results_numerical, use_tqdm=use_tqdm, compute_on_input_graph=compute_on_input_graph)    
    

In [None]:
# Render the numerical/partitioning results as a table without row truncation.
pd.set_option('display.max_rows', None)
df = pd.DataFrame(data=results_numerical)
df

# Numerical queries - Centrality

In [None]:
# When True, centrality_queries draws a random node sample and passes it as
# `node_set`, so closeness on the input graph is computed for those nodes only.
sample_nodes_for_closeness = False
# Number of nodes drawn when the sampling above is enabled.
nodes_tobe_sampled = 1000

In [None]:
def centrality_queries_graph(G, G_weight, sep, input_graph_name, out_sep, rounding, centrality_type, results, node_set=None):
    """Compute a centrality measure on the input graph and record the 'Actual' row.

    ``centrality_type`` is called on ``G``. For ``nx.closeness_centrality``
    the edge attribute is passed as ``distance`` and, if ``node_set`` is
    non-empty, the measure is computed node by node for that set only; any
    other measure is called with ``weight=G_weight``.

    The row appended to ``results`` holds the dataset name, '-' as number of
    supernodes, 'Actual' as method and the rounded runtime; every other
    column of ``results`` receives '-'. The row is also printed, joined by
    ``out_sep``. ``sep`` is not used.

    Returns the centrality scores computed on ``G``.
    """
    results['DATASET'].append(input_graph_name)
    results['METHOD'].append('Actual')
    results['#SUPERNODES'].append('-')

    started_at = time.time()
    if centrality_type != nx.closeness_centrality:
        scores_G = centrality_type(G, weight=G_weight)
    elif node_set:
        print("COMPUTING CLOSENESS CENTRALITY FOR SAMPLED NODES")
        scores_G = {}
        for sampled_node in node_set:
            scores_G[sampled_node] = centrality_type(G, u=sampled_node, distance=G_weight)
    else:
        scores_G = centrality_type(G, distance=G_weight)
    runtime_text = str(round(time.time() - started_at, rounding))

    results['TIME (S)'].append(runtime_text)
    # Quality columns are meaningless for the ground truth itself.
    described_columns = {'DATASET', 'METHOD', '#SUPERNODES', 'TIME (S)'}
    for column, cells in results.items():
        if column not in described_columns:
            cells.append('-')

    print(input_graph_name + out_sep + '-' + out_sep + 'Actual' + out_sep + runtime_text)
    return scores_G

In [None]:
def _record_prf_row(results, method, summary_node_edges, runtime, scores, tot_relevant, tot_retrieved_list, out_sep, rounding):
    """Append one method's runtime and precision/recall/F cells to `results`; return the printable row."""
    runtime_text = str(round(runtime, rounding))
    row = '' + out_sep + summary_node_edges + out_sep + method + out_sep + runtime_text
    results['DATASET'].append('')
    results['METHOD'].append(method)
    results['#SUPERNODES'].append(summary_node_edges)
    results['TIME (S)'].append(runtime_text)
    for tot_retrieved in tot_retrieved_list:
        prf = scores[(tot_relevant, tot_retrieved)]
        prefix = str(tot_relevant) + '-' + str(tot_retrieved)
        for suffix, value in ((' P', prf[0]), (' R', prf[1]), (' F', prf[2])):
            cell = str(round(value, 2))
            row += out_sep + cell
            results[prefix + suffix].append(cell)
    return row


def centrality_queries_summary(summary_path, summary_type, k, graph_type, n_worlds, out_sep, rounding, n_nodes, centrality_type, tot_relevant, tot_retrieved_list, centralities_G, results, use_tqdm=False, node_set=None):
    """Evaluate a centrality measure on a summary and compare it with the input-graph scores.

    One row per method is appended to ``results`` and printed:
    'NaiveGPQPS - unweighted' for every summary and, for S2L summaries only,
    'NaiveGPQPS - weighted' and the three 'ProbabilisticGPQPS' variants
    computed on ``n_worlds`` sampled worlds.

    Parameters
    ----------
    summary_path : S2L pickle path or SWeG result folder.
    summary_type : ``'S2L'`` or ``'sweg'``.
    k : not used by this function (kept for call compatibility).
    graph_type : graph class passed to ``summary_utils.world``.
    n_worlds : number of possible worlds sampled (S2L only).
    out_sep : separator of the printed rows.
    rounding : number of decimals kept for runtimes (scores use 2 decimals).
    n_nodes : forwarded to ``summary_utils.centrality_summary``.
    centrality_type : centrality function forwarded to the ``summary_utils`` helpers.
    tot_relevant, tot_retrieved_list : evaluation cut-offs; ``scores`` is read
        at every ``(tot_relevant, tot_retrieved)`` key.
    centralities_G : centrality scores of the input graph (the reference).
    results : dict of lists, mutated in place. Must provide 'DATASET',
        'METHOD', '#SUPERNODES', 'TIME (S)' and, per cut-off pair,
        '<relevant>-<retrieved> P', '... R' and '... F'.
    use_tqdm : show a progress bar while sampling worlds.
    node_set : optional node subset forwarded to the ``summary_utils`` helpers.

    Raises
    ------
    ValueError
        If ``summary_type`` is neither ``'S2L'`` nor ``'sweg'``.
    """
    if summary_type not in ('S2L', 'sweg'):
        raise ValueError("summary_type must be 'S2L' or 'sweg', got " + repr(summary_type))
    is_s2l = summary_type == 'S2L'

    #loading summary
    if is_s2l:
        S, S_prob, partition, node2supernode, S_avgweight, s, s_aw = load_S2L_summary(summary_path)
    else:
        partition, node2supernode, s = load_sweg_summary(summary_path)
    summary_node_edges = str(s.number_of_nodes()) + '-' + str(s.number_of_edges())

    #generating worlds of ProbabilisticGPQPS method
    possible_worlds = []
    possible_worlds_aw = []
    if is_s2l:
        for _ in tqdm(range(n_worlds), disable=not use_tqdm):
            W, W_aw = summary_utils.world(graph_type, S, S_avgweight, S_prob)
            possible_worlds.append(W)
            possible_worlds_aw.append(W_aw)

    #NaiveGPQPS: (METHOD cell, weight attribute); the weighted variant needs S2L weights
    naive_methods = [('NaiveGPQPS - unweighted', None)]
    if is_s2l:
        naive_methods.append(('NaiveGPQPS - weighted', 'weight'))
    for method, weight in naive_methods:
        # Only the centrality computation is timed here, not the metric evaluation.
        t0 = time.time()
        centralities_sG = summary_utils.centrality_summary(s, partition, node2supernode, centrality_type, weight, n_nodes, node_set, print_output=False)
        runtime = time.time() - t0
        scores = summary_utils.compute_centrality_metrics(centralities_G, centralities_sG, tot_relevant, tot_retrieved_list, print_output=False)
        print(_record_prf_row(results, method, summary_node_edges, runtime, scores, tot_relevant, tot_retrieved_list, out_sep, rounding))

    #ProbabilisticGPQPS: (variant label, worlds, weight attribute)
    if is_s2l:
        # NOTE(review): the unweighted variant reads possible_worlds_aw, while
        # numerical_queries uses possible_worlds for its unweighted variant —
        # confirm this is intended.
        probabilistic_methods = [
            ('unweighted', possible_worlds_aw, None),
            ('weighted, avg', possible_worlds_aw, 'weight'),
            ('weighted, exp', possible_worlds, 'weight'),
        ]
        for variant, worlds, weight in probabilistic_methods:
            # Unlike the naive methods, the metric evaluation is part of the timed span.
            t0 = time.time()
            centralities_sG = summary_utils.centrality_summary_probabilistic(worlds, partition, node2supernode, centrality_type, weight, variant, node_set, print_output=False)
            scores = summary_utils.compute_centrality_metrics_probabilistic(centralities_G, centralities_sG, tot_relevant, tot_retrieved_list, print_output=False)
            runtime = time.time() - t0
            print(_record_prf_row(results, 'ProbabilisticGPQPS - ' + variant, summary_node_edges, runtime, scores, tot_relevant, tot_retrieved_list, out_sep, rounding))

In [None]:
# Evaluation cut-offs read by centrality_queries: every
# (tot_relevant, tot_retrieved) pair yields one P/R/F column triple.
tot_relevant = 100
tot_retrieved_list = [100, 200, 500]

In [None]:
def centrality_queries(centrality_type, results_centrality_overall):
    """Run one centrality measure on every dataset and on all of its summaries.

    For each entry of ``all_datasets_and_parameters`` the input graph is
    loaded, the measure is computed on it (``centrality_queries_graph``) and
    then on every S2L summary (if ``RUN_S2L``) and every SWeG summary (if
    ``RUN_sweg``; datasets without a 'sweg_summaries' entry are skipped).

    Besides its arguments the function reads the notebook-level settings
    ``tot_relevant``, ``tot_retrieved_list``, ``sample_nodes_for_closeness``,
    ``nodes_tobe_sampled`` and ``sweg_result_path``.

    Parameters
    ----------
    centrality_type : centrality function, e.g. ``nx.pagerank``.
    results_centrality_overall : dict, mutated in place; the column-wise
        results table is stored under the key ``centrality_type``.
    """
    use_tqdm = True
    out_sep = '\t'
    graph_type = nx.Graph
    n_worlds = 100
    rounding = 5

    column_names = ['DATASET', '#SUPERNODES', 'METHOD', 'TIME (S)']
    for tot_retrieved in tot_retrieved_list:
        prefix = str(tot_relevant)+'-'+str(tot_retrieved)
        column_names.extend([prefix+' P', prefix+' R', prefix+' F'])
    header = out_sep.join(column_names)

    print(centrality_type)
    print(header)
    results_centrality = {col: [] for col in column_names}

    for dataset in all_datasets_and_parameters:
        input_graph_path = dataset['file']
        input_graph_name = dataset['name']
        sep = dataset['sep']
        ks = dataset['ks']

        #loading input graph
        G = summary_utils.load_original_graph(input_graph_path, sep)
        G_weight = None if not nx.is_weighted(G) else 'weight'

        nodes_for_closeness = None
        if sample_nodes_for_closeness:
            #compute closeness for a bunch of nodes (sampled at random) only; larger graphs
            seed = 123
            random.seed(seed)
            # random.sample needs a sequence (node views are rejected from
            # Python 3.11 on) and a sample size not larger than the population.
            candidate_nodes = list(G.nodes)
            nodes_for_closeness = random.sample(candidate_nodes, min(nodes_tobe_sampled, len(candidate_nodes)))

        centralities_G = centrality_queries_graph(G, G_weight, sep, input_graph_name, out_sep, rounding, centrality_type, results_centrality, node_set=nodes_for_closeness)
        if RUN_S2L:
            for k in ks:
                summary_path = 'data/' + input_graph_name + '_summary_S2L_' + str(k) + '.pickle'
                centrality_queries_summary(summary_path, 'S2L', k, graph_type, n_worlds, out_sep, rounding, G.number_of_nodes(), centrality_type, tot_relevant, tot_retrieved_list, centralities_G, results_centrality, use_tqdm=use_tqdm, node_set=nodes_for_closeness)
        if RUN_sweg:
            # Not every dataset has SWeG summaries (e.g. gnutella, ubuntu).
            for sweg_summary in dataset.get('sweg_summaries', []):
                summary_path = sweg_result_path + sweg_summary
                centrality_queries_summary(summary_path, 'sweg', -1, graph_type, n_worlds, out_sep, rounding, G.number_of_nodes(), centrality_type, tot_relevant, tot_retrieved_list, centralities_G, results_centrality, use_tqdm=use_tqdm, node_set=nodes_for_closeness)

    results_centrality_overall[centrality_type] = results_centrality

In [None]:
# Filled by centrality_queries: centrality function -> its column-wise results table.
results_centrality_overall = {}

### PageRank

In [None]:
# Evaluate PageRank on all datasets; results are stored under the key nx.pagerank.
centrality_queries(nx.pagerank, results_centrality_overall)

In [None]:
# Render the PageRank results as a table without row truncation.
pd.set_option('display.max_rows', None)
df = pd.DataFrame(data=results_centrality_overall[nx.pagerank])
df

### Closeness

In [None]:
# Evaluate closeness centrality on all datasets; results are stored under the key nx.closeness_centrality.
centrality_queries(nx.closeness_centrality, results_centrality_overall)

In [None]:
# Render the closeness-centrality results as a table without row truncation.
pd.set_option('display.max_rows', None)
df = pd.DataFrame(data=results_centrality_overall[nx.closeness_centrality])
df

# Vertex-set queries - Core decomposition

In [None]:
def core_queries_graph(G, G_weight, sep, input_graph_name, out_sep, rounding, results):
    """Compute the core numbers of the input graph and record the 'Actual' row.

    ``k_core.core_number(G, G_weight)`` is timed. The row appended to
    ``results`` holds the dataset name, '-' as number of supernodes, 'Actual'
    as method and the rounded runtime; every other column of ``results``
    receives '-'. The row is also printed, joined by ``out_sep``. ``sep`` is
    not used.

    Returns the value produced by ``k_core.core_number``.
    """
    results['DATASET'].append(input_graph_name)
    results['METHOD'].append('Actual')
    results['#SUPERNODES'].append('-')

    started_at = time.time()
    core_G = k_core.core_number(G, G_weight)
    runtime_text = str(round(time.time() - started_at, rounding))

    results['TIME (S)'].append(runtime_text)
    # Quality columns are meaningless for the ground truth itself.
    described_columns = {'DATASET', 'METHOD', '#SUPERNODES', 'TIME (S)'}
    for column, cells in results.items():
        if column not in described_columns:
            cells.append('-')

    print(input_graph_name + out_sep + '-' + out_sep + 'Actual' + out_sep + runtime_text)
    return core_G

In [None]:
def _record_core_result(method, runtime, scores, summary_node_edges, tot_core_relevant, tot_core_retrieved_list, out_sep, rounding, results):
    """Append one method's row to ``results`` and print it as a separated line.

    ``scores`` is indexed with ``(tot_core_relevant, tot_core_retrieved)`` for
    every entry of ``tot_core_retrieved_list``; the first three items of each
    value are rounded to two decimals and stored in the '<relevant>-<retrieved>
    P', '... R' and '... F' columns.
    """
    runtime_text = str(round(runtime, rounding))
    row = ['', summary_node_edges, method, runtime_text]
    results['DATASET'].append('')
    results['METHOD'].append(method)
    results['#SUPERNODES'].append(summary_node_edges)
    results['TIME (S)'].append(runtime_text)
    for tot_core_retrieved in tot_core_retrieved_list:
        prf = scores[(tot_core_relevant, tot_core_retrieved)]
        prefix = str(tot_core_relevant) + '-' + str(tot_core_retrieved)
        for position, suffix in enumerate((' P', ' R', ' F')):
            value_text = str(round(prf[position], 2))
            row.append(value_text)
            results[prefix + suffix].append(value_text)
    print(out_sep.join(row))


def core_queries_summary(summary_path, summary_type, k, graph_type, n_worlds, out_sep, rounding, n_nodes, tot_core_relevant, tot_core_retrieved_list, cores_G, results, use_tqdm=False):
    """Answer core-number queries on a graph summary and record their scores.

    For every summary the 'NaiveGPQPS - unweighted' method is evaluated. For
    'S2L' summaries the 'NaiveGPQPS - weighted' method and three
    'ProbabilisticGPQPS' variants (computed over sampled worlds) are evaluated
    as well. Each evaluated method appends one row to ``results`` and prints
    the same row as an ``out_sep``-separated line.

    Parameters
    ----------
    summary_path : path handed to ``load_S2L_summary`` / ``load_sweg_summary``.
    summary_type : 'S2L' or 'sweg'.
    k : not used by this function.
    graph_type : forwarded to ``summary_utils.world``.
    n_worlds : number of worlds sampled for the probabilistic methods (S2L only).
    out_sep : str, separator of the printed rows.
    rounding : number of decimals kept for the runtimes.
    n_nodes : not used by this function.
    tot_core_relevant, tot_core_retrieved_list : forwarded to the metric
        helpers; each pair (tot_core_relevant, tot_core_retrieved) keys one
        score triple and names one group of result columns.
    cores_G : reference core numbers the summary-based answers are scored
        against (forwarded to the metric helpers).
    results : dict mapping column name to list; mutated in place.
    use_tqdm : show a progress bar while sampling worlds.

    Raises
    ------
    ValueError
        If ``summary_type`` is neither 'S2L' nor 'sweg'.
    """
    #loading summary
    if summary_type == 'S2L':
        S, S_prob, _partition, node2supernode, S_avgweight, s, _s_aw = load_S2L_summary(summary_path)
    elif summary_type == 'sweg':
        _partition, node2supernode, s = load_sweg_summary(summary_path)
    else:
        # Fail early with a clear message instead of an unbound-variable error
        # on `s` further down.
        raise ValueError("unknown summary_type %r (expected 'S2L' or 'sweg')" % (summary_type,))
    is_s2l = summary_type == 'S2L'
    summary_node_edges = str(s.number_of_nodes()) + '-' + str(s.number_of_edges())

    def record(method, runtime, scores):
        _record_core_result(method, runtime, scores, summary_node_edges, tot_core_relevant, tot_core_retrieved_list, out_sep, rounding, results)

    def run_naive(method, weight):
        # Timed part: core numbers on the summary plus their assignment to the
        # original nodes; scoring happens outside the timed region.
        t0 = time.time()
        core_s = k_core.core_number(s, weight)
        core_sG = summary_utils.assign_core_number(core_s, node2supernode)
        runtime = time.time() - t0
        scores = summary_utils.compute_core_metrics(cores_G, core_sG, tot_core_relevant, tot_core_retrieved_list, print_output=False)
        record(method, runtime, scores)

    def run_probabilistic(method, worlds, weight):
        t0 = time.time()
        core_worlds_S = k_core.core_number_worlds(worlds, weight)
        core_worlds = summary_utils.assign_core_number_probabilistic(core_worlds_S, node2supernode)
        runtime = time.time() - t0
        scores = summary_utils.compute_core_metrics_probabilistic(cores_G, core_worlds, tot_core_relevant, tot_core_retrieved_list, print_output=False)
        record(method, runtime, scores)

    #generating worlds of ProbabilisticGPQPS method
    # Sampling happens before any timer starts, so it is not part of the
    # runtimes reported below.
    possible_worlds = []
    possible_worlds_aw = []
    if is_s2l:
        for _ in tqdm(range(n_worlds), disable=not use_tqdm):
            W, W_aw = summary_utils.world(graph_type, S, S_avgweight, S_prob)
            possible_worlds.append(W)
            possible_worlds_aw.append(W_aw)

    #NaiveGPQPS
    run_naive('NaiveGPQPS - unweighted', None)
    if not is_s2l:
        # Only the unweighted naive method is evaluated for non-S2L summaries.
        return
    run_naive('NaiveGPQPS - weighted', 'weight')

    #ProbabilisticGPQPS
    # The unweighted and 'avg' variants use the second world returned by
    # summary_utils.world, the 'exp' variant uses the first one.
    run_probabilistic('ProbabilisticGPQPS - unweighted', possible_worlds_aw, None)
    run_probabilistic('ProbabilisticGPQPS - weighted, avg', possible_worlds_aw, 'weight')
    run_probabilistic('ProbabilisticGPQPS - weighted, exp', possible_worlds, 'weight')

In [None]:
# Evaluation parameters of the core-decomposition queries. Every pair
# (tot_core_relevant, tot_core_retrieved) selects one precision/recall/F-score
# triple and names one group of result columns ("<relevant>-<retrieved> P/R/F").
tot_core_relevant, tot_core_retrieved_list = 1, [1, 2, 5, 10]

In [None]:
# Settings of the core-decomposition experiment.
use_tqdm = True            # show a progress bar while worlds are sampled
out_sep = '\t'             # separator of the printed result rows
graph_type = nx.Graph
n_worlds = 100             # worlds sampled per S2L summary
rounding = 5               # decimals kept for the reported runtimes

# Result columns: four fixed ones followed by a P/R/F triple for every
# (tot_core_relevant, tot_core_retrieved) pair.
column_names = ['DATASET', '#SUPERNODES', 'METHOD', 'TIME (S)']
for tot_core_retrieved in tot_core_retrieved_list:
    prefix = '-'.join((str(tot_core_relevant), str(tot_core_retrieved)))
    column_names.extend(prefix + suffix for suffix in (' P', ' R', ' F'))
header = out_sep.join(column_names)

print(header)
results_core = {col: [] for col in column_names}

for dataset in all_datasets_and_parameters:
    input_graph_path, input_graph_name = dataset['file'], dataset['name']
    sep, ks = dataset['sep'], dataset['ks']

    # Load the input graph and compute the reference core numbers on it.
    G = summary_utils.load_original_graph(input_graph_path, sep)
    G_weight = 'weight' if nx.is_weighted(G) else None
    cores_G = core_queries_graph(G, G_weight, sep, input_graph_name, out_sep, rounding, results_core)

    if RUN_S2L:
        # One pickled S2L summary per value of k.
        for k in ks:
            summary_path = f'data/{input_graph_name}_summary_S2L_{k}.pickle'
            core_queries_summary(summary_path, 'S2L', k, graph_type, n_worlds, out_sep, rounding, G.number_of_nodes(), tot_core_relevant, tot_core_retrieved_list, cores_G, results_core, use_tqdm=use_tqdm)

    if RUN_sweg:
        # One result folder per SWeG summary, located under sweg_result_path.
        for sweg_summary in dataset['sweg_summaries']:
            summary_path = sweg_result_path + sweg_summary
            core_queries_summary(summary_path, 'sweg', -1, graph_type, n_worlds, out_sep, rounding, G.number_of_nodes(), tot_core_relevant, tot_core_retrieved_list, cores_G, results_core, use_tqdm=use_tqdm)

In [None]:
# Lift the row limit so the complete core-decomposition results table is rendered.
pd.options.display.max_rows = None
df = pd.DataFrame(results_core)
df