In [36]:
import torch_geometric
from collections import defaultdict
import numpy as np

In [37]:
# Parse the node table into:
#   attributes[node_id][attr_name] -> float value (only attributes declared in the header)
#   labels[node_id]                -> raw label string
#   attribute_to_idx[attr_name]    -> column index, in header order
#   label_to_idx[label]            -> class index over the distinct labels
attributes = defaultdict(dict)
labels = dict()
with open('Pubmed-Diabetes/data/Pubmed-Diabetes.NODE.paper.tab') as f:
    for num, line in enumerate(f.read().splitlines()):
        tokens = line.split('\t')
        # Line 0 matches neither branch below and is therefore ignored.
        if num == 1: # Parse attribute header
            # Header fields are ':'-separated; the attribute name is the second component.
            tokens = [token.split(':') for token in tokens]
            assert tokens[0][1] == 'label', f'Expected label field first in PubMed attributes but have {tokens[0][1]}'
            # Drop the first (label) and the last header field; everything in between is a feature.
            # NOTE(review): the last field is presumably the 'summary' column skipped below -- confirm.
            attribute_to_idx = {attr : idx for idx, attr in enumerate(token[1] for token in tokens[1:-1])}
            assert len(attribute_to_idx) == 500, f'Expected 500 attributes in PubMed attributes, but have {len(attribute_to_idx)}'
        elif num > 1:
            # Data row: node id in the first column, then `attr=value` fields.
            for text in tokens[1:]:
                # Split on the first '=' only, so a value containing '=' stays intact.
                attr, value = text.split('=', 1)
                if attr in attribute_to_idx:
                    attributes[tokens[0]][attr] = float(value)
                elif attr == 'label':
                    labels[tokens[0]] = value
                elif attr == 'summary':
                    continue
                else:
                    raise RuntimeError(f'Unexpected node attribute {attr}')
# Index the *distinct* labels. Enumerating labels.values() directly would count
# every node, leaving each label mapped to the position of its last occurrence
# instead of a compact class index in 0..num_classes-1. Sorting makes the
# assignment deterministic regardless of the order nodes appear in the file.
label_to_idx = {label : idx for idx, label in enumerate(sorted(set(labels.values())))}
                    

In [41]:

# Assign each vertex a row (in the iteration order of `attributes`) and fill a
# dense (num_vertices, num_attributes) matrix; entries for attributes a vertex
# does not list stay at 0.
vertex_to_idx = {vertex: row for row, vertex in enumerate(attributes)}
X = np.zeros((len(vertex_to_idx), len(attribute_to_idx)))
for vertex, row in vertex_to_idx.items():
    for attr, value in attributes[vertex].items():
        X[row, attribute_to_idx[attr]] = value
    

In [54]:
# Read the citation table and turn every data row into a pair of vertex indices.
edge_idxs = []
# Integer parsed from the first column of each data row; its meaning is not
# evident from this cell (hence the name).
unknown = []
with open('Pubmed-Diabetes/data/Pubmed-Diabetes.DIRECTED.cites.tab') as f:
    for num, line in enumerate(f.read().splitlines()):
        tokens = line.split('\t')
        if num <= 1:
            # The first two lines are not edge rows.
            continue
        # Expected row layout: <int> \t paper:<u> \t | \t paper:<v>
        assert tokens[2] == '|', f"Expected '|' character in line {num}, but got '{tokens[2]}'"
        unknown.append(int(tokens[0]))
        u = tokens[1]
        v = tokens[3]
        assert u.startswith('paper:') and v.startswith('paper:')
        # Strip the 'paper:' prefix so the ids match the keys of vertex_to_idx.
        u = u.replace('paper:', '')
        v = v.replace('paper:', '')
        assert u in vertex_to_idx, f"Unknown node id {u} in line {num}"
        assert v in vertex_to_idx, f"Unknown node id {v} in line {num}"
        edge_idxs.append([vertex_to_idx[u], vertex_to_idx[v]])
# Transpose the list of [u, v] pairs so each column is one edge.
edge_idxs = np.array(edge_idxs, dtype=int).T

In [56]:
# Shape check: one column per parsed edge, so this should be (2, number_of_edges).
edge_idxs.shape

(2, 44338)

In [63]:
# Distinct first-column ids, and the integers in 0..max(id) that never occur
# among them (ascending order).
sunknown = set(unknown)
not_in_unknown = sorted(set(range(max(sunknown) + 1)) - sunknown)

In [65]:
# (number of distinct ids, total number of ids, number of integers in 0..max id that are absent, max id)
len(set(unknown)), len(unknown), len(not_in_unknown), max(sunknown)

(44338, 44338, 8015, 52352)

In [21]:
# Display the attribute-name -> feature-column mapping built from the node-table header.
attribute_to_idx

{'w-rat': 0,
 'w-common': 1,
 'w-use': 2,
 'w-examin': 3,
 'w-pathogenesi': 4,
 'w-retinopathi': 5,
 'w-mous': 6,
 'w-studi': 7,
 'w-anim': 8,
 'w-model': 9,
 'w-metabol': 10,
 'w-abnorm': 11,
 'w-contribut': 12,
 'w-develop': 13,
 'w-investig': 14,
 'w-mice': 15,
 'w-2': 16,
 'w-month': 17,
 'w-compar': 18,
 'w-obtain': 19,
 'w-method': 20,
 'w-induc': 21,
 'w-6': 22,
 'w-inject': 23,
 'w-experiment': 24,
 'w-normal': 25,
 'w-diet': 26,
 'w-30': 27,
 'w-hyperglycemia': 28,
 'w-level': 29,
 'w-lipid': 30,
 'w-oxid': 31,
 'w-activ': 32,
 'w-protein': 33,
 'w-kinas': 34,
 'w-c': 35,
 'w-measur': 36,
 'w-result': 37,
 'w-increas': 38,
 'w-retin': 39,
 'w-stress': 40,
 'w-3': 41,
 'w-similar': 42,
 'w-observ': 43,
 'w-conclus': 44,
 'w-play': 45,
 'w-import': 46,
 'w-role': 47,
 'w-present': 48,
 'w-p': 49,
 'w-m': 50,
 'w-r': 51,
 'w-muscl': 52,
 'w-control': 53,
 'w-chang': 54,
 'w-dure': 55,
 'w-lower': 56,
 'w-higher': 57,
 'w-mass': 58,
 'w-correl': 59,
 'w-decreas': 60,
 'w-determin'