In [1]:
from scipy.stats import permutation_test
import pandas as pd
import numpy as np
import os
from ogb.nodeproppred import NodePropPredDataset
import networkx as nx
import pandas as pd
import random
import seaborn as sns
import matplotlib.pyplot as plt




In [2]:
# Download (if not already cached) and load the OGB dataset.
# The files are stored under <parent dir>/data/<dataset name>.
d_name = 'arxiv'
root = os.path.join(
    os.path.realpath('../'),
    'data',
    d_name,
)
dataset = NodePropPredDataset(name=f'ogbn-{d_name}', root=root)

Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 19508.39it/s]

Saving...





## Calculate centrality scores

In [3]:
# Build a directed networkx graph from the first element of the dataset entry.
graph_data = dataset[0][0]
num_nodes = graph_data['num_nodes']

G = nx.DiGraph()
# Add every node id explicitly so nodes without edges are still in the graph.
G.add_nodes_from(range(num_nodes))
# edge_index is transposed and converted to a list of rows; each row is handed
# to networkx as one edge.
G.add_edges_from(graph_data['edge_index'].T.tolist())

depth_list = ['shallow', 'deep', 'undecided']
rand_depths = [random.choice(depth_list) for _ in range(num_nodes)]
score_funcs_list = [(nx.centrality.degree_centrality, 'degree_centrality'), (nx.centrality.eigenvector_centrality, 'eigenvector_centrality'), 
                    (nx.centrality.katz_centrality, 'katz_centrality'), (nx.centrality.closeness_centrality, 'closeness_centrality')]

# Compute each centrality measure exactly once and collect the results as one
# column per measure, keyed by node_id. (This loop used to be pasted twice,
# which recomputed every measure and threw the first result away.)
scores_df = pd.DataFrame(list(G.nodes()), columns=['node_id'])
for cent_func, cent_name in score_funcs_list:
    print(cent_name)  # progress marker: these computations can be slow
    cent_func_stats = cent_func(G)  # dict-like mapping node -> score
    temp_scores_df = pd.DataFrame(cent_func_stats.items(), columns=['node_id', cent_name])
    scores_df = scores_df.merge(temp_scores_df, on='node_id')

degree_centrality
eigenvector_centrality
katz_centrality
closeness_centrality


In [None]:
# Load the per-node depth results of each model and attach the centrality
# scores to them (inner join: scores_df.node_id == depths.id).
# Paths are built with os.path.join so the cell also runs on non-Windows hosts.
results_dir = 'results'
GCN_depths_df = pd.read_csv(os.path.join(results_dir, 'ids_to_pvalue_two_tailed_GCN.csv'))
# NOTE(review): this previously read the GAT file, making the SAGE results a
# copy of GAT. The SAGE file name is assumed to follow the same naming
# pattern as the other two; confirm it exists under results/.
SAGE_depths_df = pd.read_csv(os.path.join(results_dir, 'ids_to_pvalue_two_tailed_SAGE.csv'))
GAT_depths_df = pd.read_csv(os.path.join(results_dir, 'ids_to_pvalue_two_tailed_GAT.csv'))

GCN_scores_df = scores_df.merge(GCN_depths_df, left_on='node_id', right_on='id')
SAGE_scores_df = scores_df.merge(SAGE_depths_df, left_on='node_id', right_on='id')
GAT_scores_df = scores_df.merge(GAT_depths_df, left_on='node_id', right_on='id')

## Centrality hypothesis testing

In [14]:
def statistic(x, y, axis):
    """Return mean(x) - mean(y), with both means taken along `axis`.

    Used as the test statistic passed to `permutation_test`.
    """
    mean_x = np.mean(x, axis=axis)
    mean_y = np.mean(y, axis=axis)
    return mean_x - mean_y

def get_pvalue(left, right, n_permutations=10000):
    """Run a two-sided permutation test on two samples and return its p-value.

    Parameters
    ----------
    left, right : the two samples to compare.
    n_permutations : value passed to `permutation_test` as `n_resamples`.

    Returns
    -------
    The `pvalue` attribute of the `permutation_test` result, using
    `statistic` (difference of means) as the test statistic.
    """
    samples = (left, right)
    return permutation_test(
        samples,
        statistic,
        n_resamples=n_permutations,
        alternative='two-sided',
        vectorized=True,
    ).pvalue

# Pairwise permutation tests: for every model and every centrality measure,
# compare the scores of each ordered pair of distinct vertex classes and print
# the resulting p-value.
model_scores_df = {'GAT':GAT_scores_df, 'GCN':GCN_scores_df, 'SAGE':SAGE_scores_df}
centrality_measures = ['eigenvector_centrality', 'degree_centrality', 'closeness_centrality', 'katz_centrality']
vertrex_classes = ['deep','shallow','undecided']
for model, score_df in model_scores_df.items():
    print("")
    for centrality_measure in centrality_measures:
        print("")
        # Slice each class's scores once per measure instead of once per pair.
        scores_by_class = {
            vertex_class: score_df.loc[score_df['vertex_class'] == vertex_class, centrality_measure].values
            for vertex_class in vertrex_classes
        }
        for vertex_class_left in vertrex_classes:
            for vertex_class_right in vertrex_classes:
                if vertex_class_left == vertex_class_right:
                    continue
                scores_left = scores_by_class[vertex_class_left]
                scores_right = scores_by_class[vertex_class_right]
                # get_pvalue takes (left, right[, n_permutations]); passing the
                # model and measure names as extra positionals raised TypeError.
                pvalue = get_pvalue(scores_left, scores_right)
                print(f'{model} {centrality_measure} {vertex_class_left} vs {vertex_class_right} pvalue = {pvalue:.4f}')




GAT eigenvector_centrality deep vs shallow pvalue = 0.0884
GAT eigenvector_centrality deep vs undecided pvalue = 0.7655
GAT eigenvector_centrality shallow vs deep pvalue = 0.0978
GAT eigenvector_centrality shallow vs undecided pvalue = 0.0306
GAT eigenvector_centrality undecided vs deep pvalue = 0.7641
GAT eigenvector_centrality undecided vs shallow pvalue = 0.0322

GAT degree_centrality deep vs shallow pvalue = 0.0002
GAT degree_centrality deep vs undecided pvalue = 0.0002
GAT degree_centrality shallow vs deep pvalue = 0.0004
GAT degree_centrality shallow vs undecided pvalue = 0.0002
GAT degree_centrality undecided vs deep pvalue = 0.0002
GAT degree_centrality undecided vs shallow pvalue = 0.0002

GAT closeness_centrality deep vs shallow pvalue = 0.6267
GAT closeness_centrality deep vs undecided pvalue = 0.0002
GAT closeness_centrality shallow vs deep pvalue = 0.6331
GAT closeness_centrality shallow vs undecided pvalue = 0.0002
GAT closeness_centrality undecided vs deep pvalue = 0.0