## Quick-and-dirty notebook to create the ogbn-arxiv dataset with text information

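- Download the graph data from http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip and extract it to ~/data/ogbn-arxiv, so that the `raw/`, `mapping/` and `split/` folders sit directly inside that directory.
- Download the text data from https://snap.stanford.edu/ogb/data/misc/ogbn_arxiv/titleabs.tsv.gz and decompress it to ~/data/ogbn-arxiv/titleabs.tsv.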

In [1]:
import numpy as np
import scipy.sparse as sp

In [4]:
import gzip
import os.path as osp

In [9]:
# Root folder of the extracted OGB download: ~/data/ogbn-arxiv.
base_dir = osp.join(osp.expanduser('~'), 'data', 'ogbn-arxiv')
# Cell result: whether that folder exists on this machine.
osp.exists(base_dir)

True

In [84]:
# Parse raw/edge.csv.gz: every line becomes a list of comma-separated integers.
with gzip.open(osp.join(base_dir, 'raw', 'edge.csv.gz')) as fh:
    edges = []
    for raw_line in fh.read().splitlines():
        edges.append([int(field) for field in raw_line.decode('utf-8').split(',')])

In [44]:
# Parse raw/node-label.csv.gz: one integer per line.
with gzip.open(osp.join(base_dir, 'raw', 'node-label.csv.gz')) as fh:
    y = list(map(int, (raw_line.decode('utf-8') for raw_line in fh.read().splitlines())))

In [34]:
# Parse raw/node-feat.csv.gz: every line becomes a list of comma-separated floats.
with gzip.open(osp.join(base_dir, 'raw', 'node-feat.csv.gz')) as fh:
    x = [
        list(map(float, raw_line.decode('utf-8').split(',')))
        for raw_line in fh.read().splitlines()
    ]

In [45]:
# Parse raw/node_year.csv.gz: one integer per line.
with gzip.open(osp.join(base_dir, 'raw', 'node_year.csv.gz')) as fh:
    year = list(map(int, (raw_line.decode('utf-8') for raw_line in fh.read().splitlines())))

In [48]:
# Turn the parsed Python lists into NumPy arrays (features, labels, years).
x, y, year = map(np.array, (x, y, year))

In [50]:
# Display the shapes of year, labels and features side by side.
tuple(arr.shape for arr in (year, y, x))

((169343,), (169343,), (169343, 128))

In [75]:
# Build the label-index -> category-name mapping.
# The first line of the file is skipped (presumably a header row).
with gzip.open(osp.join(base_dir, 'mapping', 'labelidx2arxivcategeory.csv.gz')) as fh:
    label_rows = [raw_line.decode('utf-8') for raw_line in fh.read().splitlines()]

# Each remaining row must consist of exactly two comma-separated fields.
idx_to_label = {
    int(label_idx): name
    for label_idx, name in (row.split(',') for row in label_rows[1:])
}
    
    

In [76]:
# Sanity check: the label indices are exactly 0 .. len(idx_to_label) - 1.
set(idx_to_label) == set(range(len(idx_to_label)))

True

In [78]:
# nodeidx2paperid.csv.gz
# Build the node-index -> paper-id mapping; paper ids are kept as strings.
# The first line of the file is skipped (presumably a header row).
with gzip.open(osp.join(base_dir, 'mapping', 'nodeidx2paperid.csv.gz')) as fh:
    vertex_rows = [raw_line.decode('utf-8') for raw_line in fh.read().splitlines()]

# Each remaining row must consist of exactly two comma-separated fields.
idx_to_vertex = {
    int(node_idx): paper_id
    for node_idx, paper_id in (row.split(',') for row in vertex_rows[1:])
}

In [79]:
# Sanity check: the node indices are exactly 0 .. len(idx_to_vertex) - 1.
set(idx_to_vertex) == set(range(len(idx_to_vertex)))

True

In [87]:
def _read_split(filename):
    """Return the integers listed one per line in split/time/<filename>."""
    with gzip.open(osp.join(base_dir, 'split', 'time', filename)) as fh:
        return [int(raw_line.decode('utf-8')) for raw_line in fh.read().splitlines()]


# Node indices of the train / validation / test partitions of the 'time' split.
idx_train = _read_split('train.csv.gz')
idx_val = _read_split('valid.csv.gz')
idx_test = _read_split('test.csv.gz')

In [90]:
# Pairwise overlaps between the three splits; each should be the empty set.
set(idx_train) & set(idx_val), set(idx_train) & set(idx_test), set(idx_val) & set(idx_test)

(set(), set(), set())

In [91]:
# Total number of node indices across the three splits.
len(idx_train) + len(idx_val) + len(idx_test)

169343

In [99]:
# Load the title/abstract table: one record per line, fields separated by tabs.
# - The encoding is given explicitly so the result does not depend on the
#   platform default (the gzip loaders in this notebook also decode UTF-8).
# - The file is iterated line by line instead of using str.splitlines(), which
#   would additionally break records at characters such as \x0b, \x0c, \x1c-\x1e,
#   \x85, \u2028 and \u2029 if they occur inside the free text.
with open(osp.join(base_dir, 'titleabs.tsv'), encoding='utf-8') as f:
    next(f, None)  # the first line is skipped, exactly as before
    tokens = [line.rstrip('\n').split('\t') for line in f]

In [109]:
from tqdm import tqdm

# Map paper id -> "<title>. <abstract>" for every well-formed row.
# Rows that do not have exactly three fields are skipped and their row index is printed.
texts = {}
for row_idx, fields in tqdm(enumerate(tokens)):
    if len(fields) == 3:
        paper_id, title, abstract = fields
        texts[paper_id] = title + '. ' + abstract
    else:
        print(row_idx)

80252it [00:00, 394959.60it/s]

16891
16892
16893
63262


179723it [00:00, 440035.74it/s]

179722





In [116]:
# Print every paper id of the graph that has no entry in `texts`.
for paper_id in idx_to_vertex.values():
    if paper_id in texts:
        continue
    print(paper_id)

In [118]:
# Convert the parsed edge rows into a 2-D NumPy array (one row per edge).
edges = np.array(edges, copy=True)

In [121]:
# Number of nodes = number of entries in the node-index mapping.
n = len(idx_to_vertex)
# Sparse n x n adjacency in CSR form with a weight of 1.0 per listed edge:
# the first column of `edges` gives the row index, the second the column index.
edge_weights = np.ones(len(edges))
A = sp.coo_matrix((edge_weights, (edges[:, 0], edges[:, 1])), shape=(n, n)).tocsr()

In [125]:
# Display the shapes of adjacency, features, labels and years side by side.
tuple(obj.shape for obj in (A, x, y, year))

((169343, 169343), (169343, 128), (169343,), (169343,))

In [129]:
# One boolean mask of length n per split, initially all False ...
mask_train, mask_val, mask_test = (np.zeros(n, dtype=bool) for _ in range(3))
# ... then switch on the positions listed in the corresponding index list.
mask_train[idx_train] = True
mask_val[idx_val] = True
mask_test[idx_test] = True

In [134]:
# Stack the masks as columns: masks[i] = (in_train, in_val, in_test) for node i.
masks = np.stack([mask_train, mask_val, mask_test], axis=1)
membership = masks.astype(int)
# Cell result: nodes per split, and whether every node lies in exactly one split.
membership.sum(axis=0), (membership.sum(axis=1) == 1).all()

(array([90941, 29799, 48603]), True)

In [136]:
# Order the texts by node index: texts_arr[i] is the text of the paper behind node i.
texts_arr = [texts[idx_to_vertex[node_idx]] for node_idx in range(n)]
    

In [137]:
# Number of collected texts; texts_arr was built with one entry per node index in range(n).
len(texts_arr)

169343

In [141]:
# Double-check the ordering: entry i must equal the text looked up via node i's paper id.
if any(texts_arr[node_idx] != texts[idx_to_vertex[node_idx]] for node_idx in range(n)):
    raise RuntimeError

In [151]:
# Scratch experiment, unrelated to the dataset export: `or` needs the truth value
# of its left operand, and NumPy refuses to reduce a multi-element array to a
# single bool, so this cell raises the ValueError recorded in its output.
np.arange(4) or np.arange(3)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [143]:
# IPython magic: change to the parent directory, so that the relative output
# path 'data/raw/ogbn-arxiv.npz' used by the save cell resolves from there.
%cd ..

/nfs/homedirs/fuchsgru/MastersThesis


In [146]:
# Inspect the three arrays of the CSR matrix A (values, row pointers, column
# indices); these are the components written to the archive as adj_data,
# adj_indptr and adj_indices.
A.data, A.indptr, A.indices

(array([1., 1., 1., ..., 1., 1., 1.]),
 array([      0,       2,       3, ..., 1166224, 1166241, 1166243],
       dtype=int32),
 array([ 52893,  93487, 141692, ..., 163274,  27824, 158981], dtype=int32))

In [149]:
# Everything that goes into the archive, keyed by the name it is stored under.
# The adjacency matrix is stored through its CSR components plus its shape.
# NOTE(review): idx_to_label and idx_to_vertex are plain dicts; NumPy stores such
# non-array objects as pickled object arrays, so readers presumably have to call
# np.load(..., allow_pickle=True) -- confirm against the loading code.
payload = {
    'adj_data': A.data,
    'adj_indices': A.indices,
    'adj_indptr': A.indptr,
    'adj_shape': A.shape,
    'attr_text': texts_arr,
    'features': x,
    'labels': y,
    'year': year,
    'idx_to_class': idx_to_label,
    'idx_to_node': idx_to_vertex,
    'mask_train': mask_train,
    'mask_val': mask_val,
    'mask_test': mask_test,
}
# The path is relative to the current working directory.
np.savez_compressed('data/raw/ogbn-arxiv.npz', **payload)