In [1]:
import os
import sys
import re
import pickle

import pandas as pd
import numpy as np

import torch 
import torch.nn as nn
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.datasets import AMiner, Yelp
from torch_geometric.utils import negative_sampling, structured_negative_sampling

sys.path.insert(0, '../')
from datasets import IMDB_ACM_DBLP
sys.path.remove('../')

# IMDB, DBLP, ACM - from GTN paper

12772 nodes, 37288 edges, 4 edge types, 1256 feats, 300:300:2339 train-val-test

In [8]:
ds = IMDB_ACM_DBLP(root='/home/ubuntu/msandal_code/PyG_playground/data/dblp', name='DBLP')

Downloading https://drive.google.com/uc?export=download&id=1qOZ3QjqWMIIvWjzrIdRe3EA4iKzPi6S5
Extracting /home/ubuntu/msandal_code/PyG_playground/data/dblp/raw/uc?export=download&id=1qOZ3QjqWMIIvWjzrIdRe3EA4iKzPi6S5
Processing...
Done!


In [9]:
ds[0]

Data(
  edge_index_dict={
    ('1', '0')=[2, 19645],
    ('0', '1')=[2, 19645],
    ('1', '2')=[2, 14328],
    ('2', '1')=[2, 14328]
  },
  node_features=[18405, 334],
  node_type_mask=[18405],
  test_id_label=[2, 2857],
  train_id_label=[2, 800],
  valid_id_label=[2, 400]
)

In [2]:
data = dict()
for file in os.listdir('../data/acm/raw/'):
    with open(os.path.join('../data/acm/raw', file), 'rb') as f:
        data[re.sub(".pkl", "", file)] = pickle.load(f)

In [8]:
np.nonzero(np.asarray(data['edges'][0].todense()))

(array([   0,    0,    0, ..., 3024, 3024, 3024]),
 array([3025, 3026, 3027, ..., 8934, 8935, 8936]))

In [11]:
np.asarray(data['edges'][0].todense())[0,3027]

1

In [4]:
print(type(data['node_features']))
print(data['node_features'].shape)

<class 'numpy.ndarray'>
(8994, 1902)


In [5]:
print(type(data['edges']))
print(len(data['edges']))
print(type(data['edges'][0]))
for i in range(len(data['edges'])):
    print(data['edges'][i].shape)

<class 'list'>
4
<class 'scipy.sparse.csr.csr_matrix'>
(8994, 8994)
(8994, 8994)
(8994, 8994)
(8994, 8994)


In [25]:
print(len(data['labels']))
print(len(data['labels'][2]))

train_idx_label = torch.tensor(np.array(data['labels'][0]).T)
train_idx_label

3
2125


tensor([[ 826, 1823, 1382,  ..., 2480, 2596, 2958],
        [   0,    0,    0,  ...,    2,    2,    2]])

In [30]:
import numpy as np
tups = [(0, 90), (0, 100), (101, 204), (205, 1000)]

merged = list()
finished = False
while not finished:
    for tup in tups:
        print('tups:',tups)
        mask = [check_interval_overlap(tup, elem) for elem in tups]
        merged.append(merge_tups(list(np.array(tups)[mask])))
    merged = list(set(merged))
    for tup in merged:
        mask = [check_interval_overlap(tup, elem) for elem in merged]
        true_counts = len([elem for elem in mask if elem == True])
        if true_counts == 1:
            finished=True
            continue
        else:
            finished=False
            tups=merged
            merged=list()
            break
print(merged)

tups: [(0, 90), (0, 100), (101, 204), (205, 1000)]
tups: [(0, 90), (0, 100), (101, 204), (205, 1000)]
tups: [(0, 90), (0, 100), (101, 204), (205, 1000)]
tups: [(0, 90), (0, 100), (101, 204), (205, 1000)]
[(101, 204), (205, 1000), (0, 100)]


In [8]:
def check_interval_overlap(int1, int2):
    cond1 = (int1[0] >= int2[0] and int1[0] <= int2[1])
    cond2 = (int1[1] >= int2[0] and int1[1] <= int2[1])
    cond3 = (int1[0] <= int2[0] and int1[1] >= int2[1])
    return cond1 or cond2 or cond3

def merge_tups(tups):
    return (min([x[0] for x in tups]), max([x[1] for x in tups]))

In [18]:
num_nodes = data['edges'][1].todense().shape[0]
edge_type_ids_dict = dict()
tups = list()
for edge_type in range(len(data['edges'])):
    non_zero_idx1=list()
    non_zero_idx2=list()
    matrix = np.asarray(data['edges'][edge_type].todense())
    for i in range(num_nodes):
        local_non0 = np.nonzero(matrix[i])[0]
        if local_non0.size > 0:
            non_zero_idx2.append(i)
        non_zero_idx1 = non_zero_idx1 + local_non0.tolist()
    ids1=sorted(list(set(non_zero_idx1)))
    ids2=sorted(non_zero_idx2)
    edge_type_ids_dict[edge_type] = ((min(ids1), max(ids1), len(ids1)),
                                     (min(ids2), max(ids2), len(ids2)))
    tups.append((min(ids1), max(ids1), len(ids1)))
    tups.append((min(ids2), max(ids2), len(ids2)))
tups = list(set(tups))
tups = sorted([(elem[0], elem[1]) for elem in list(set(tups))], key=lambda tup: tup[0])

print(tups)
node_type_mask = list()
for i in range(len(tups)):
    node_type_mask = node_type_mask + [i]*(tups[i][1]-tups[i][0]+1)
node_type_mask = torch.tensor(node_type_mask)

[(3025, 8936, 5912), (0, 3024, 3020), (0, 3024, 3020), (3025, 8936, 5912), (8937, 8993, 57), (0, 3024, 3025), (0, 3024, 3025), (8937, 8993, 57)]
[(3025, 8936, 5912), (8937, 8993, 57), (0, 3024, 3025), (0, 3024, 3020)]
[(0, 3024), (0, 3024), (3025, 8936), (8937, 8993)]


In [16]:
num_nodes

8994

In [15]:
node_type_mask.shape

torch.Size([12019])

# Creating a PyG Dataset from DBLP extract from MAGNN paper

In [12]:
# import numpy as np
# from scipy.sparse import csr_matrix
# labeled_authors = [1, 10, 2, 4, 15]
# feats = [[1,2], [3,1], [5,4], [0,6], [10, 5]]
# feats = [np.array(feat) for feat in feats]
# lol = np.array(list(zip(*sorted(zip(labeled_authors, feats), key=lambda tup: tup[0])))[1])
# print(lol)
# sm = csr_matrix(lol)

In [13]:
dataset=DBLP_MAGNN(root="/home/ubuntu/msandal_code/PyG_playground/dblp", use_MAGNN_init_feats=True)

In [14]:
dataset[0].initial_embeddings['term'][1][1].size

50

In [4]:
print(dataset[0].node_id_bag_of_words.keys())
dataset[0].node_id_bag_of_words['author'].head()

dict_keys(['author', 'paper', 'term', 'conf'])


Unnamed: 0,author_id,author_name
0,192,David Hogg
1,226,Martial Hebert
2,234,Gady Agam
3,435,Takeo Kanade
4,444,Hong Zhang


In [5]:
print(dataset[0].edge_index_dict.keys())
dataset[0].edge_index_dict[('paper', 'author')].head()

dict_keys([('paper', 'author'), ('paper', 'term'), ('paper', 'conf')])


Unnamed: 0,paper_id,author_id
0,7601,15135
1,7604,15138
2,7605,15138
3,7605,15142
4,7610,15151


In [6]:
dataset[0].id_label['author'].head()

Unnamed: 0,author_id,label
0,192,2
1,226,2
2,234,3
3,435,2
4,444,1


In [18]:
dataset[0].initial_embeddings['term'][1][1]

array([ 1.2031   , -0.40028  ,  0.073991 ,  1.0415   ,  0.051753 ,
        0.41166  , -0.98656  , -0.79466  ,  0.36033  ,  0.54428  ,
        0.29395  ,  0.5747   , -0.5576   , -0.61278  , -0.087423 ,
        0.5456   , -0.22013  , -0.0081278, -0.58155  , -0.016229 ,
        1.1811   , -0.42891  , -1.0388   , -0.87459  , -0.96912  ,
       -0.66649  , -0.23569  , -0.40309  ,  0.36778  , -0.031145 ,
        2.1525   ,  0.014425 ,  0.064602 , -0.011762 ,  0.17265  ,
       -0.89641  , -0.7655   ,  0.16825  ,  0.04137  , -0.71456  ,
        0.38339  , -0.57219  , -0.16915  ,  0.13984  , -0.7743   ,
       -0.061819 ,  0.21887  ,  1.3262   , -0.33245  ,  0.8198   ])

# Playing with built-in AMiner dataset

because apparently it is the only built-in heterogeneous graph in PyG. Oh, life.

In [2]:
aminer = AMiner(root="/home/ubuntu/msandal_code/PyG_playground/aminer_pyg")
print("#graphs : " + str(len(aminer)))

#graphs : 1


In [3]:
graph = aminer[0]
graph

Data(
  edge_index_dict={
    ('paper', 'written by', 'author')=[2, 9323605],
    ('author', 'wrote', 'paper')=[2, 9323605],
    ('paper', 'published in', 'venue')=[2, 3194405],
    ('venue', 'published', 'paper')=[2, 3194405]
  },
  num_nodes_dict={
    paper=3194405,
    author=1693531,
    venue=3883
  },
  y_dict={
    author=[246678],
    venue=[134]
  },
  y_index_dict={
    author=[246678],
    venue=[134]
  }
)

It is worth noting here that this structure is no standard in PyG :) But then again, its heterogeneous, so what did I expect...

In [4]:
graph.edge_index_dict[('author', 'wrote', 'paper')]

tensor([[      0,       0,       0,  ..., 1693528, 1693529, 1693530],
        [      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]])

In [5]:
negative_author_paper = structured_negative_sampling(graph.edge_index_dict[('author', 'wrote', 'paper')],
                                                     num_nodes = graph.num_nodes_dict['paper'])
negative_author_paper

(tensor([      0,       0,       0,  ..., 1693528, 1693529, 1693530]),
 tensor([      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]),
 tensor([ 328474, 1564601, 1994989,  ...,  492042, 2681395, 2529168]))

!!The function is unaware that the nodes are of different types!! But I don't really think it is an issue...