In [2]:
import os
import sys

import pandas as pd
import numpy as np

import torch 
import torch.nn as nn
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.datasets import AMiner, Yelp
from torch_geometric.utils import negative_sampling, structured_negative_sampling



# NSHE DBLP: do the labels actually correspond?

In [3]:
# List the entries of the current working directory (where the kernel is running).
os.listdir(os.getcwd())

['.ipynb_checkpoints', 'playground.ipynb']

In [5]:
# Put the parent directory first on the import path so that `datasets` resolves
# from there (presumably the project-local datasets module — confirm).
sys.path.insert(0, '../')
from datasets import DBLP_ACM_from_NSHE

Using backend: pytorch


In [6]:
# Load the 'dblp' dataset from the NSHE data directory and keep its first entry.
nshe_dblp = DBLP_ACM_from_NSHE(
    root='/home/ubuntu/msandal_code/PyG_playground/data/NSHE',
    name='dblp',
)
ds = nshe_dblp[0]

In [7]:
# Display the loaded graph. Node types appear as string ids in the edge keys:
# '0' = author, '1' = paper, '2' = conference.
ds

Data(
  edge_index_dict={
    ('0', '1')=[2, 18304],
    ('1', '0')=[2, 18304],
    ('1', '2')=[2, 9556],
    ('2', '1')=[2, 9556]
  },
  node_features=[11576, 128],
  node_id_node_label=[2, 1263],
  node_type_mask=[11576]
)

In [10]:
# First 104 entries of row 0 of 'node_id_node_label' (row 0 is used as node ids
# and row 1 as labels further down in this notebook).
ds['node_id_node_label'][0][:104]

tensor([11260, 10002, 11442,  8954,  2860,  8174,  5854,  7202, 10412,  5367,
         4761,  8152,  6876,  5244,  3437,  9703,  7388,  6395,  3059,  6388,
         5682,  9466,  8738,  4977,  4751,  5908,  5924,  8216, 11405,  4143,
        11232,  8440,  3615, 11088, 11233,  2518, 10221,  5489,  8741,  6465,
         3483,  2746,  5736,  9933,  9523,  2191, 11234, 11251, 10846,  7336,
         4718,  6575,  6507,  3469,  7208,  8165,  2066,  5902,  2722,  5002,
         3279,  4750,  4009, 10195,  4907,  4942,  9521,  4141,  8935,  3349,
         9158,  9899, 10099,  2283,  7353,  4905,  6380,  6638,  8100,  7570,
         5279,  9702, 10464, 11163,  5812,  4206,  3492,  7012,  9154, 10665,
         6319,  2733,  8655,  3861,  3311, 11453,  9541,  9706,  7357,  2147,
         7848,  3888,  4984,  3408])

In [21]:
# Conferences have to be labeled; the idea is to propagate labels from papers.
# This version of DBLP has 104 labeled papers, stored first in 'node_id_node_label'.
labeled_nodes = ds['node_id_node_label']
paper_label = [labeled_nodes[row][:104].tolist() for row in (0, 1)]  # [paper ids, paper labels]

# paper -> conference edges as a two-column table, restricted to the labeled papers
paper_conf_edges = ds['edge_index_dict'][('1', '2')].numpy().T
paper_conf = pd.DataFrame(data=paper_conf_edges, columns=['paper', 'conf'])
is_labeled_paper = paper_conf['paper'].isin(paper_label[0])
paper_conf = paper_conf.loc[is_labeled_paper]

# shape, a preview, and the number of distinct conferences reached by labeled papers
for summary in (paper_conf.shape, paper_conf.head(), paper_conf.conf.nunique()):
    print(summary)

(104, 2)
     paper   conf
4     5736  11562
11    8100  11565
103   2518  11571
111   2191  11572
198   3483  11569
19


Damn. Only 19 distinct conferences show up among the labeled papers, so 1 conference will be left without a label. Well, we'll do what we can, eh?

In [22]:
# Table of (paper_id, paper_label) for the first 104 labeled papers.
# It is built straight from `ds` instead of from the previous value of `paper_label`:
# this cell rebinds `paper_label` from a list to a DataFrame, so feeding the old value
# through np.array(...).T made a second run of the cell fail with a shape mismatch.
paper_label = pd.DataFrame({
    'paper_id': ds['node_id_node_label'][0][:104].tolist(),
    'paper_label': ds['node_id_node_label'][1][:104].tolist(),
})
paper_label.head(2)

Unnamed: 0,paper_id,paper_label
0,11260,0
1,10002,0


In [23]:
# Attach each labeled paper's label to its (paper, conf) edge.
# Only the two edge columns go into the merge so the cell is safe to re-run: merging the
# already-merged frame again would produce suffixed duplicates (paper_label_x / paper_label_y)
# and the plain 'paper_label' column used by the following cell would disappear.
paper_conf = paper_conf[['paper', 'conf']].merge(
    paper_label, how='inner', left_on='paper', right_on='paper_id'
)
print(paper_conf.head())

   paper   conf  paper_id  paper_label
0   5736  11562      5736            1
1   8100  11565      8100            2
2   2518  11571      2518            2
3   2191  11572      2191            1
4   3483  11569      3483            0


In [26]:
# Distinct (conference, label) pairs implied by the labeled papers.
# A conference shows up once per distinct label found among its labeled papers.
conf_label = (
    paper_conf.loc[:, ['conf', 'paper_label']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'conf': 'conf_id', 'paper_label': 'conf_label'})
)
print(conf_label)

    conf_id  conf_label
0     11562           1
1     11565           2
2     11571           2
3     11572           1
4     11569           0
5     11562           2
6     11557           2
7     11570           0
8     11568           3
9     11568           0
10    11561           2
11    11566           1
12    11564           3
13    11567           0
14    11556           3
15    11560           0
16    11573           0
17    11574           2
18    11561           3
19    11563           1
20    11558           3
21    11570           2
22    11559           3


In [27]:
# Gather every label seen per conference; a list with more than one entry means
# the labeled papers of that conference disagree.
conf_label.groupby('conf_id')['conf_label'].agg(list).to_frame()

Unnamed: 0_level_0,conf_label
conf_id,Unnamed: 1_level_1
11556,[3]
11557,[2]
11558,[3]
11559,[3]
11560,[0]
11561,"[2, 3]"
11562,"[1, 2]"
11563,[1]
11564,[3]
11565,[2]


# Creating a PyG Dataset from DBLP extract from MAGNN paper

In [12]:
# import numpy as np
# from scipy.sparse import csr_matrix
# labeled_authors = [1, 10, 2, 4, 15]
# feats = [[1,2], [3,1], [5,4], [0,6], [10, 5]]
# feats = [np.array(feat) for feat in feats]
# lol = np.array(list(zip(*sorted(zip(labeled_authors, feats), key=lambda tup: tup[0])))[1])
# print(lol)
# sm = csr_matrix(lol)

In [13]:
# DBLP_MAGNN is not imported by any earlier cell, so on a fresh kernel this cell
# failed with NameError; import it here, next to its only use.
# NOTE(review): assumes DBLP_MAGNN lives in the same `datasets` module as
# DBLP_ACM_from_NSHE (reachable through the '../' sys.path entry added earlier) — confirm.
from datasets import DBLP_MAGNN

# Build the DBLP dataset (MAGNN extract) with the MAGNN initial features enabled.
dataset = DBLP_MAGNN(root="/home/ubuntu/msandal_code/PyG_playground/dblp", use_MAGNN_init_feats=True)

In [14]:
# Number of elements in the entry at [1][1] of the 'term' initial embeddings.
dataset[0].initial_embeddings['term'][1][1].size

50

In [4]:
# Which node types have an id table, followed by a preview of the author table.
bag_of_words_by_type = dataset[0].node_id_bag_of_words
print(bag_of_words_by_type.keys())
bag_of_words_by_type['author'].head()

dict_keys(['author', 'paper', 'term', 'conf'])


Unnamed: 0,author_id,author_name
0,192,David Hogg
1,226,Martial Hebert
2,234,Gady Agam
3,435,Takeo Kanade
4,444,Hong Zhang


In [5]:
# Which relations are stored, followed by a preview of the paper-author edge table.
edges_by_relation = dataset[0].edge_index_dict
print(edges_by_relation.keys())
edges_by_relation[('paper', 'author')].head()

dict_keys([('paper', 'author'), ('paper', 'term'), ('paper', 'conf')])


Unnamed: 0,paper_id,author_id
0,7601,15135
1,7604,15138
2,7605,15138
3,7605,15142
4,7610,15151


In [6]:
# Preview of the id -> label table for the 'author' node type.
dataset[0].id_label['author'].head()

Unnamed: 0,author_id,label
0,192,2
1,226,2
2,234,3
3,435,2
4,444,1


In [18]:
# Show the values of the entry at [1][1] of the 'term' initial embeddings.
dataset[0].initial_embeddings['term'][1][1]

array([ 1.2031   , -0.40028  ,  0.073991 ,  1.0415   ,  0.051753 ,
        0.41166  , -0.98656  , -0.79466  ,  0.36033  ,  0.54428  ,
        0.29395  ,  0.5747   , -0.5576   , -0.61278  , -0.087423 ,
        0.5456   , -0.22013  , -0.0081278, -0.58155  , -0.016229 ,
        1.1811   , -0.42891  , -1.0388   , -0.87459  , -0.96912  ,
       -0.66649  , -0.23569  , -0.40309  ,  0.36778  , -0.031145 ,
        2.1525   ,  0.014425 ,  0.064602 , -0.011762 ,  0.17265  ,
       -0.89641  , -0.7655   ,  0.16825  ,  0.04137  , -0.71456  ,
        0.38339  , -0.57219  , -0.16915  ,  0.13984  , -0.7743   ,
       -0.061819 ,  0.21887  ,  1.3262   , -0.33245  ,  0.8198   ])

# Playing with built-in AMiner dataset

because apparently it is the only built-in heterogeneous graph in PyG. Oh, life.

In [2]:
# Load PyG's built-in AMiner dataset and report how many graphs it holds.
aminer = AMiner(
    root="/home/ubuntu/msandal_code/PyG_playground/aminer_pyg",
)
num_graphs = len(aminer)
print("#graphs : " + str(num_graphs))

#graphs : 1


In [3]:
# Take the first graph of the dataset and display its structure.
graph = aminer[0]
graph

Data(
  edge_index_dict={
    ('paper', 'written by', 'author')=[2, 9323605],
    ('author', 'wrote', 'paper')=[2, 9323605],
    ('paper', 'published in', 'venue')=[2, 3194405],
    ('venue', 'published', 'paper')=[2, 3194405]
  },
  num_nodes_dict={
    paper=3194405,
    author=1693531,
    venue=3883
  },
  y_dict={
    author=[246678],
    venue=[134]
  },
  y_index_dict={
    author=[246678],
    venue=[134]
  }
)

It is worth noting here that this structure is not standard in PyG :) But then again, it's heterogeneous, so what did I expect...

In [4]:
# Edge index of the ('author', 'wrote', 'paper') relation.
# Presumably row 0 holds author ids and row 1 paper ids (PyG source/target convention) — confirm.
graph.edge_index_dict[('author', 'wrote', 'paper')]

tensor([[      0,       0,       0,  ..., 1693528, 1693529, 1693530],
        [      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]])

In [5]:
# Sample one negative per author -> paper edge; num_nodes is the number of papers.
# The result is displayed as a tuple of three tensors.
author_paper_edges = graph.edge_index_dict[('author', 'wrote', 'paper')]
num_papers = graph.num_nodes_dict['paper']
negative_author_paper = structured_negative_sampling(author_paper_edges, num_nodes=num_papers)
negative_author_paper

(tensor([      0,       0,       0,  ..., 1693528, 1693529, 1693530]),
 tensor([      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]),
 tensor([ 328474, 1564601, 1994989,  ...,  492042, 2681395, 2529168]))

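!! The function is unaware that the source and target nodes are of different types !! But I don't really think it is an issue here: `num_nodes` is set to the number of papers, so the sampled negatives (the third tensor) are drawn from the valid range of paper ids.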

In [1]:
# Explore NMI / ARI permutation invariance: labels2 is labels1 with the cluster ids
# renamed (0->3, 1->2, 3->0, 2->1), so the grouping itself is identical.
import numpy as np
from sklearn.metrics.cluster import adjusted_rand_score, normalized_mutual_info_score

labels1 = np.array([0, 0, 1, 1, 3, 3, 2, 2, 2])
labels2 = np.array([3, 3, 2, 2, 0, 0, 1, 1, 1])
for score_fn in (normalized_mutual_info_score, adjusted_rand_score):
    print(score_fn(labels1, labels2))

1.0
1.0
