In [11]:
import os
import sys

import pandas as pd
import numpy as np

import torch 
import torch.nn as nn
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.datasets import AMiner, Yelp
from torch_geometric.utils import negative_sampling, structured_negative_sampling

from datasets import DBLP_MAGNN

# Creating a PyG Dataset from the DBLP extract of the MAGNN paper

In [12]:
# NOTE: scratch experiment kept commented out -- nothing in this cell executes.
# It pairs each author id with its feature vector, sorts the pairs by author id,
# stacks the reordered feature vectors into one array, and wraps that array in a
# scipy CSR sparse matrix.
# import numpy as np
# from scipy.sparse import csr_matrix
# labeled_authors = [1, 10, 2, 4, 15]
# feats = [[1,2], [3,1], [5,4], [0,6], [10, 5]]
# feats = [np.array(feat) for feat in feats]
# lol = np.array(list(zip(*sorted(zip(labeled_authors, feats), key=lambda tup: tup[0])))[1])
# print(lol)
# sm = csr_matrix(lol)

In [13]:
# Build the DBLP_MAGNN dataset (project-local class imported from `datasets`).
# The data root used to be a machine-specific absolute path. It can now be
# relocated by setting PYG_PLAYGROUND_ROOT; when the variable is unset the
# original location is used, so existing setups behave exactly as before.
playground_root = os.environ.get(
    "PYG_PLAYGROUND_ROOT", "/home/ubuntu/msandal_code/PyG_playground"
)
# NOTE(review): use_MAGNN_init_feats presumably selects the initial node features
# shipped with the MAGNN extract -- confirm against the DBLP_MAGNN implementation.
dataset = DBLP_MAGNN(
    root=os.path.join(playground_root, "dblp"),
    use_MAGNN_init_feats=True,
)

In [14]:
# Element count of the array stored at initial_embeddings['term'][1][1].
term_embeddings = dataset[0].initial_embeddings['term']
sample_term_vector = term_embeddings[1][1]
sample_term_vector.size

50

In [4]:
# List the keys of node_id_bag_of_words, then show the first rows of its 'author' entry.
node_tables = dataset[0].node_id_bag_of_words
print(node_tables.keys())
node_tables['author'].head()

dict_keys(['author', 'paper', 'term', 'conf'])


Unnamed: 0,author_id,author_name
0,192,David Hogg
1,226,Martial Hebert
2,234,Gady Agam
3,435,Takeo Kanade
4,444,Hong Zhang


In [5]:
# List the available edge types, then show the first (paper, author) pairs.
edge_tables = dataset[0].edge_index_dict
print(edge_tables.keys())
paper_author_key = ('paper', 'author')
edge_tables[paper_author_key].head()

dict_keys([('paper', 'author'), ('paper', 'term'), ('paper', 'conf')])


Unnamed: 0,paper_id,author_id
0,7601,15135
1,7604,15138
2,7605,15138
3,7605,15142
4,7610,15151


In [6]:
# First rows of the 'author' entry of id_label.
author_labels = dataset[0].id_label['author']
author_labels.head()

Unnamed: 0,author_id,label
0,192,2
1,226,2
2,234,3
3,435,2
4,444,1


In [18]:
# Show the full array stored at initial_embeddings['term'][1][1].
term_embeddings = dataset[0].initial_embeddings['term']
term_embeddings[1][1]

array([ 1.2031   , -0.40028  ,  0.073991 ,  1.0415   ,  0.051753 ,
        0.41166  , -0.98656  , -0.79466  ,  0.36033  ,  0.54428  ,
        0.29395  ,  0.5747   , -0.5576   , -0.61278  , -0.087423 ,
        0.5456   , -0.22013  , -0.0081278, -0.58155  , -0.016229 ,
        1.1811   , -0.42891  , -1.0388   , -0.87459  , -0.96912  ,
       -0.66649  , -0.23569  , -0.40309  ,  0.36778  , -0.031145 ,
        2.1525   ,  0.014425 ,  0.064602 , -0.011762 ,  0.17265  ,
       -0.89641  , -0.7655   ,  0.16825  ,  0.04137  , -0.71456  ,
        0.38339  , -0.57219  , -0.16915  ,  0.13984  , -0.7743   ,
       -0.061819 ,  0.21887  ,  1.3262   , -0.33245  ,  0.8198   ])

# Playing with the built-in AMiner dataset

Because, apparently, it is the only built-in heterogeneous graph dataset in PyG. Oh, life.

In [2]:
# Load the AMiner dataset from torch_geometric.datasets and report how many graphs it holds.
# The data root used to be a machine-specific absolute path. It can now be
# relocated by setting PYG_PLAYGROUND_ROOT; when the variable is unset the
# original location is used, so existing setups behave exactly as before.
playground_root = os.environ.get(
    "PYG_PLAYGROUND_ROOT", "/home/ubuntu/msandal_code/PyG_playground"
)
aminer = AMiner(root=os.path.join(playground_root, "aminer_pyg"))
print(f"#graphs : {len(aminer)}")

#graphs : 1


In [3]:
# Take the first entry of the AMiner dataset and display it as the cell output.
graph = aminer[0]
graph

Data(
  edge_index_dict={
    ('paper', 'written by', 'author')=[2, 9323605],
    ('author', 'wrote', 'paper')=[2, 9323605],
    ('paper', 'published in', 'venue')=[2, 3194405],
    ('venue', 'published', 'paper')=[2, 3194405]
  },
  num_nodes_dict={
    paper=3194405,
    author=1693531,
    venue=3883
  },
  y_dict={
    author=[246678],
    venue=[134]
  },
  y_index_dict={
    author=[246678],
    venue=[134]
  }
)

It is worth noting that this structure is not standard in PyG :) But then again, it's heterogeneous, so what did I expect...

In [4]:
# Edge index for the (author, wrote, paper) relation.
author_wrote_paper = ('author', 'wrote', 'paper')
graph.edge_index_dict[author_wrote_paper]

tensor([[      0,       0,       0,  ..., 1693528, 1693529, 1693530],
        [      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]])

In [5]:
# The negatives are drawn at random; seed torch's global RNG so that re-running
# this cell yields the same sample every time.
torch.manual_seed(0)

# Per the PyG docs, structured_negative_sampling returns a tuple (i, j, k): the
# positive edges (i, j) plus one sampled negative target k per edge.
# num_nodes is set to the number of papers because the targets of this relation
# are paper ids.
author_paper_edges = graph.edge_index_dict[('author', 'wrote', 'paper')]
negative_author_paper = structured_negative_sampling(
    author_paper_edges,
    num_nodes=graph.num_nodes_dict['paper'],
)
negative_author_paper

(tensor([      0,       0,       0,  ..., 1693528, 1693529, 1693530]),
 tensor([      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]),
 tensor([ 328474, 1564601, 1994989,  ...,  492042, 2681395, 2529168]))

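!! The function is unaware that the source and target nodes are of different types !! But I don't really think it is an issue here: `num_nodes` is set to the number of papers, so the sampled negatives (the third tensor above) should fall in the paper id range, and the values shown are consistent with that.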

In [1]:
# Explore whether NMI and ARI are invariant to permuting the cluster label ids
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
import numpy as np

# Two labelings of the same partition: labels2 is labels1 with the cluster ids
# relabelled (0 -> 3, 1 -> 2, 2 -> 1, 3 -> 0).
labels1 = np.array([0, 0, 1, 1, 3, 3, 2, 2, 2])
permutation = np.array([3, 2, 1, 0])
labels2 = permutation[labels1]

# Print each clustering score on its own line, NMI first and then ARI.
for score_fn in (normalized_mutual_info_score, adjusted_rand_score):
    print(score_fn(labels1, labels2))

1.0
1.0
