In [1]:
import torch
import random 
import numpy as np 
from tqdm import tqdm 
from scipy.spatial.distance import cdist, cosine
from scipy.optimize import linear_sum_assignment
from utils.corrupt_graph import remove_edge, remove_node, add_edge, add_node
from utils.query_machine import get_candidates
from python_emb import *
import torch.nn.functional as F
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr, spearmanr
from collections import defaultdict

In [4]:
# Experiment: for every dataset and embedding dimension, measure the squared
# distance between the mean node embedding of a sampled graph and the mean node
# embedding of the graph returned by `remove_edge(graph, dist)`, average that
# distance per `dist`, and report how it correlates with `dist`.
#
# The graphs are sampled once per dataset (with the first entry of DIMS) and
# reused for the remaining dimensions so all dimensions are scored on the same
# graph pairs.
#
# NOTE(review): no RNG seed is set, so the sampled graphs differ between runs.
# NOTE(review): `nx` is not imported explicitly in this notebook; presumably it
# is provided by `from python_emb import *` - confirm.

DIMS = [4, 8, 16, 32, 64, 128, 512]  # DIMS[0] also drives the graph sampling
GRAPH_SIZE = 20          # node count required of every sampled graph
N_REMOVE = 3             # not used in this cell; kept in case other cells read it
N_CORE = 1000            # number of core nodes drawn per dataset
MAX_TRIES = 10           # attempts at obtaining a connected perturbed graph
DISTANCES = range(1, 8)  # values passed as second argument to `remove_edge`


def get_embedding_subgraph(subgraph):
    """Embed `subgraph` with the current module-level `query_machine`.

    Returns:
        (embedding, sub_id_map): the embedding as a NumPy array and the id map
        produced by `query_machine.create_subgraph_map`.
    """
    sub_id_map, sub_raw_feats, all_sub_adj, sub_degree = query_machine.create_subgraph_map(subgraph)
    embedding_subgraph = query_machine.embedding_subgraph(sub_raw_feats, all_sub_adj, sub_degree)
    return embedding_subgraph.detach().cpu().numpy(), sub_id_map


def mean_embedding_sq_dist(emb_a, emb_b):
    """Squared Euclidean distance between the axis-0 means of two arrays."""
    return ((emb_a.mean(0) - emb_b.mean(0)) ** 2).sum()


def sample_connected_subgraph(biggraph, dist, max_tries=MAX_TRIES):
    """Call `remove_edge(biggraph, dist)` until the result is usable.

    A result is usable when it is non-empty and connected. Returns the first
    usable graph, or None when all `max_tries` attempts fail.
    """
    for _ in range(max_tries):
        candidate = remove_edge(biggraph, dist)
        # Check emptiness first: networkx's is_connected raises on a graph
        # that has no nodes.
        if len(candidate) > 0 and nx.is_connected(candidate):
            return candidate
    return None


for dataset in ['yeast']:
    # List of [biggraph, {dist: subgraph}] pairs; filled while processing
    # DIMS[0] and reused unchanged for every later dimension.
    graphs = None

    for dim in DIMS:
        # --- Load data, restore the trained model, embed the full graph ---
        ori_graph_data = load_data(
            './dataspace/graph/{}/{}'.format(dataset, dataset),
            supervised=False,
            max_degree=5,
            multiclass=False,
            use_random_walks=False,
        )

        ori_emb_model = SupervisedGraphSage(ori_graph_data.raw_feats.shape[1], dim, ori_graph_data.num_class)
        ori_emb_model = ori_emb_model.cuda()
        ori_emb_model.load_state_dict(torch.load('model_dim/{}_sup_50_dim{}.pt'.format(dataset, dim)))
        ori_emb_model.set_params(ori_graph_data.full_adj, ori_graph_data.deg, ori_graph_data.feats)
        ori_emb_model.eval()

        all_node_ids = list(range(ori_graph_data.raw_feats.shape[0]))
        ori_graph_emb = F.normalize(ori_emb_model.aggregator(all_node_ids), dim=1)
        ori_graph_emb = ori_graph_emb.detach().cpu().numpy()

        # Replace each node's label with its entry in `multi2single_label`.
        for node in ori_graph_data.G.nodes:
            node_attrs = ori_graph_data.G.nodes[node]
            node_attrs['label'] = ori_graph_data.multi2single_label[tuple(node_attrs['label'])]

        query_machine = GraphQuery(
            ori_emb_model,
            ori_graph_emb,
            ori_graph_data.G,
            ori_graph_data.id_map,
            ori_graph_data.feats,
            ori_graph_data.raw_feats,
            ori_graph_data.full_adj,
            ori_graph_data.deg,
        )

        # dist -> list of squared mean-embedding distances observed for it
        mcs2emb = defaultdict(list)

        if graphs is None:
            # --- First dimension: sample the graph pairs and score them ---
            graphs = []
            all_nodes = list(query_machine.ori_graph)
            if N_CORE <= len(all_nodes):
                core_nodes = random.sample(all_nodes, N_CORE)
            else:
                # Fewer nodes than requested cores: sample with replacement.
                core_nodes = random.choices(all_nodes, k=N_CORE)

            for core_node in tqdm(core_nodes):
                biggraph = query_machine.create_subgraph_from_core(core_node, GRAPH_SIZE)
                if len(biggraph) != GRAPH_SIZE:
                    continue
                embedding_biggraph, sub_id_map = get_embedding_subgraph(biggraph)

                subgraphs = {}
                for dist in DISTANCES:
                    subgraph = sample_connected_subgraph(biggraph, dist)
                    if subgraph is None:
                        continue
                    embedding_subgraph, sub_id_map = get_embedding_subgraph(subgraph)
                    emb_sim = mean_embedding_sq_dist(embedding_biggraph, embedding_subgraph)
                    # Pairs whose distance is NaN here are dropped for all dims.
                    if not np.isnan(emb_sim):
                        mcs2emb[dist].append(emb_sim)
                        subgraphs[dist] = subgraph
                graphs.append([biggraph, subgraphs])
        else:
            # --- Later dimensions: score the already-sampled graph pairs ---
            for biggraph, subgraphs in graphs:
                embedding_biggraph, sub_id_map = get_embedding_subgraph(biggraph)
                for dist, subgraph in subgraphs.items():
                    embedding_subgraph, sub_id_map = get_embedding_subgraph(subgraph)
                    emb_sim = mean_embedding_sq_dist(embedding_biggraph, embedding_subgraph)
                    if not np.isnan(emb_sim):
                        mcs2emb[dist].append(emb_sim)

        # Correlate `dist` with the mean distance observed for that `dist`.
        X = list(mcs2emb)
        Y = [sum(values) / len(values) for values in mcs2emb.values()]
        print(dataset, dim, spearmanr(X, Y), pearsonr(X,Y))


Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/yeast/yeast-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/yeast/yeast-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/yeast/yeast-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 12519
Preprocessing finished, graph info:
Name: yeast
Type: Graph
Number of nodes: 3101
Number of edges: 12519
Average degree:   8.0742


1000it [00:27, 36.91it/s]


yeast 4 SpearmanrResult(correlation=1.0, pvalue=0.0) (0.9847552195799616, 5.466391093349874e-05)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/yeast/yeast-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/yeast/yeast-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/yeast/yeast-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 12519
Preprocessing finished, graph info:
Name: yeast
Type: Graph
Number of nodes: 3101
Number of edges: 12519
Average degree:   8.0742
yeast 8 SpearmanrResult(correlation=1.0, pvalue=0.0) (0.9639369348829978, 0.0004652303520267607)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/yeast/yeast-G.jso

In [5]:
# Experiment: for every dataset and embedding dimension, measure the squared
# distance between the mean node embedding of a sampled graph and the mean node
# embedding of the graph returned by `remove_edge(graph, dist)`, average that
# distance per `dist`, and report how it correlates with `dist`.
#
# The graphs are sampled once per dataset (with the first entry of DIMS) and
# reused for the remaining dimensions so all dimensions are scored on the same
# graph pairs.
#
# The helpers are defined inside this cell so that it can be run on its own.
#
# NOTE(review): no RNG seed is set, so the sampled graphs differ between runs.
# NOTE(review): `nx` is not imported explicitly in this notebook; presumably it
# is provided by `from python_emb import *` - confirm.

DIMS = [4, 8, 16, 32, 64, 128, 512]  # DIMS[0] also drives the graph sampling
GRAPH_SIZE = 20          # node count required of every sampled graph
N_REMOVE = 3             # not used in this cell; kept in case other cells read it
N_CORE = 1000            # number of core nodes drawn per dataset
MAX_TRIES = 10           # attempts at obtaining a connected perturbed graph
DISTANCES = range(1, 8)  # values passed as second argument to `remove_edge`


def get_embedding_subgraph(subgraph):
    """Embed `subgraph` with the current module-level `query_machine`.

    Returns:
        (embedding, sub_id_map): the embedding as a NumPy array and the id map
        produced by `query_machine.create_subgraph_map`.
    """
    sub_id_map, sub_raw_feats, all_sub_adj, sub_degree = query_machine.create_subgraph_map(subgraph)
    embedding_subgraph = query_machine.embedding_subgraph(sub_raw_feats, all_sub_adj, sub_degree)
    return embedding_subgraph.detach().cpu().numpy(), sub_id_map


def mean_embedding_sq_dist(emb_a, emb_b):
    """Squared Euclidean distance between the axis-0 means of two arrays."""
    return ((emb_a.mean(0) - emb_b.mean(0)) ** 2).sum()


def sample_connected_subgraph(biggraph, dist, max_tries=MAX_TRIES):
    """Call `remove_edge(biggraph, dist)` until the result is usable.

    A result is usable when it is non-empty and connected. Returns the first
    usable graph, or None when all `max_tries` attempts fail.
    """
    for _ in range(max_tries):
        candidate = remove_edge(biggraph, dist)
        # Check emptiness first: networkx's is_connected raises on a graph
        # that has no nodes.
        if len(candidate) > 0 and nx.is_connected(candidate):
            return candidate
    return None


for dataset in ['human', 'cora', 'citeseer', 'pubmed', 'wordnet']:
    # List of [biggraph, {dist: subgraph}] pairs; filled while processing
    # DIMS[0] and reused unchanged for every later dimension.
    graphs = None

    for dim in DIMS:
        # --- Load data, restore the trained model, embed the full graph ---
        ori_graph_data = load_data(
            './dataspace/graph/{}/{}'.format(dataset, dataset),
            supervised=False,
            max_degree=5,
            multiclass=False,
            use_random_walks=False,
        )

        ori_emb_model = SupervisedGraphSage(ori_graph_data.raw_feats.shape[1], dim, ori_graph_data.num_class)
        ori_emb_model = ori_emb_model.cuda()
        ori_emb_model.load_state_dict(torch.load('model_dim/{}_sup_50_dim{}.pt'.format(dataset, dim)))
        ori_emb_model.set_params(ori_graph_data.full_adj, ori_graph_data.deg, ori_graph_data.feats)
        ori_emb_model.eval()

        all_node_ids = list(range(ori_graph_data.raw_feats.shape[0]))
        ori_graph_emb = F.normalize(ori_emb_model.aggregator(all_node_ids), dim=1)
        ori_graph_emb = ori_graph_emb.detach().cpu().numpy()

        # Replace each node's label with its entry in `multi2single_label`.
        for node in ori_graph_data.G.nodes:
            node_attrs = ori_graph_data.G.nodes[node]
            node_attrs['label'] = ori_graph_data.multi2single_label[tuple(node_attrs['label'])]

        query_machine = GraphQuery(
            ori_emb_model,
            ori_graph_emb,
            ori_graph_data.G,
            ori_graph_data.id_map,
            ori_graph_data.feats,
            ori_graph_data.raw_feats,
            ori_graph_data.full_adj,
            ori_graph_data.deg,
        )

        # dist -> list of squared mean-embedding distances observed for it
        mcs2emb = defaultdict(list)

        if graphs is None:
            # --- First dimension: sample the graph pairs and score them ---
            graphs = []
            all_nodes = list(query_machine.ori_graph)
            if N_CORE <= len(all_nodes):
                core_nodes = random.sample(all_nodes, N_CORE)
            else:
                # Fewer nodes than requested cores: sample with replacement.
                core_nodes = random.choices(all_nodes, k=N_CORE)

            for core_node in tqdm(core_nodes):
                biggraph = query_machine.create_subgraph_from_core(core_node, GRAPH_SIZE)
                if len(biggraph) != GRAPH_SIZE:
                    continue
                embedding_biggraph, sub_id_map = get_embedding_subgraph(biggraph)

                subgraphs = {}
                for dist in DISTANCES:
                    subgraph = sample_connected_subgraph(biggraph, dist)
                    if subgraph is None:
                        continue
                    embedding_subgraph, sub_id_map = get_embedding_subgraph(subgraph)
                    emb_sim = mean_embedding_sq_dist(embedding_biggraph, embedding_subgraph)
                    # Pairs whose distance is NaN here are dropped for all dims.
                    if not np.isnan(emb_sim):
                        mcs2emb[dist].append(emb_sim)
                        subgraphs[dist] = subgraph
                graphs.append([biggraph, subgraphs])
        else:
            # --- Later dimensions: score the already-sampled graph pairs ---
            for biggraph, subgraphs in graphs:
                embedding_biggraph, sub_id_map = get_embedding_subgraph(biggraph)
                for dist, subgraph in subgraphs.items():
                    embedding_subgraph, sub_id_map = get_embedding_subgraph(subgraph)
                    emb_sim = mean_embedding_sq_dist(embedding_biggraph, embedding_subgraph)
                    if not np.isnan(emb_sim):
                        mcs2emb[dist].append(emb_sim)

        # Correlate `dist` with the mean distance observed for that `dist`.
        X = list(mcs2emb)
        Y = [sum(values) / len(values) for values in mcs2emb.values()]
        print(dataset, dim, spearmanr(X, Y), pearsonr(X,Y))


Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/human/human-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/human/human-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/human/human-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 86282
Preprocessing finished, graph info:
Name: human
Type: Graph
Number of nodes: 4674
Number of edges: 86282
Average degree:  36.9200


1000it [00:22, 43.81it/s]


human 4 SpearmanrResult(correlation=1.0, pvalue=0.0) (0.9947450820173536, 3.833937247924874e-06)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/human/human-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/human/human-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/human/human-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 86282
Preprocessing finished, graph info:
Name: human
Type: Graph
Number of nodes: 4674
Number of edges: 86282
Average degree:  36.9200
human 8 SpearmanrResult(correlation=1.0, pvalue=0.0) (0.9950602292602331, 3.285266291171457e-06)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/human/human-G.jso

1000it [00:19, 50.41it/s]


cora 4 SpearmanrResult(correlation=0.7500000000000002, pvalue=0.05218140045705776) (0.8740133771640446, 0.010099626727467757)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/cora/cora-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/cora/cora-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/cora/cora-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 5278
Preprocessing finished, graph info:
Name: cora
Type: Graph
Number of nodes: 2708
Number of edges: 5278
Average degree:   3.8981
cora 8 SpearmanrResult(correlation=0.9642857142857145, pvalue=0.0004541491691941689) (0.889944305993234, 0.007267536371408224)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading g

1000it [00:13, 76.89it/s]


citeseer 4 SpearmanrResult(correlation=0.9642857142857145, pvalue=0.0004541491691941689) (0.9062420403082333, 0.004912521580759013)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/citeseer/citeseer-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/citeseer/citeseer-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/citeseer/citeseer-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 4600
Preprocessing finished, graph info:
Name: citeseer
Type: Graph
Number of nodes: 3327
Number of edges: 4600
Average degree:   2.7653
citeseer 8 SpearmanrResult(correlation=0.9642857142857145, pvalue=0.0004541491691941689) (0.9653085089805733, 0.00042256339220102143)
Set max degree to 5
------------------------------

1000it [00:22, 44.33it/s]


pubmed 4 SpearmanrResult(correlation=0.9642857142857145, pvalue=0.0004541491691941689) (0.9550700980227959, 0.0008021550064754211)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/pubmed/pubmed-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/pubmed/pubmed-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/pubmed/pubmed-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 44324
Preprocessing finished, graph info:
Name: pubmed
Type: Graph
Number of nodes: 19717
Number of edges: 44324
Average degree:   4.4960
pubmed 8 SpearmanrResult(correlation=0.8214285714285715, pvalue=0.023448808345691505) (0.8317541818201694, 0.020323780616571564)
Set max degree to 5
-----------------------------------------------

1000it [00:21, 47.02it/s]


wordnet 4 SpearmanrResult(correlation=-0.28571428571428575, pvalue=0.5345092286010406) (-0.35658597165650857, 0.4323820305103962)
Set max degree to 5
-----------------------------------------------
Loading data:
Loading graph data from ./dataspace/graph/wordnet/wordnet-G.json
Removed 0 nodes that lacked proper annotations due to networkx versioning issues
File loaded successfully
Loading feature from ./dataspace/graph/wordnet/wordnet-G.json
File loaded successfully
Loading classmap data from ./dataspace/graph/wordnet/wordnet-class_map.json
File loaded successfully
Loaded data.. now preprocessing..
Use original edges
Generate train edges
Number of training edges: 127124
Preprocessing finished, graph info:
Name: wordnet
Type: Graph
Number of nodes: 82670
Number of edges: 127124
Average degree:   3.0755
wordnet 8 SpearmanrResult(correlation=-0.8214285714285715, pvalue=0.023448808345691505) (-0.7312901263471525, 0.06181604915325591)
Set max degree to 5
-------------------------------------