### We're going to create a [PyTorch Geometric](https://pytorch-geometric.readthedocs.io/en/latest/) dataset from scratch. <br> <br> For this purpose we use the <font color='Blue'> Cora </font> dataset: 
https://github.com/kimiyoung/planetoid/tree/master/data

#### Our goal is to classify graph nodes into one of 7 classes. <br><br> Each node represents one of 2708 publications. Each publication is described by words from a dictionary (containing 1433 words in total). <br>  So, each node in the dataset is described by a (0/1-valued) feature vector that encodes the presence of the corresponding dictionary words.

#### Data contains several <font color='Green'> pickle </font> files with the following extensions: <font color='Green'> [.x, .y, .tx, .ty, .allx, .ally, .graph] </font>.

#### <font color='Green'> .x </font> - contains features of training nodes (140 nodes)
#### <font color='Green'> .y </font>- contains one-hot encoded class labels of training nodes
#### <font color='Green'> .tx </font> - contains features of test nodes (last 1000 nodes)
#### <font color='Green'> .ty </font> - contains labels of test nodes
#### <font color='Green'> .allx </font> - contains features of all non-test nodes
#### <font color='Green'> .ally </font> - contains labels of all non-test nodes
#### <font color='Green'> .graph </font> - contains a dictionary of lists <font color='Grey'> {index: [index_of_neighbor_nodes]} </font>, mapping each node to its neighboring nodes.
#### <font color='Green'> .test.index </font> - contains indices of test nodes.

#### To sum up, the full dataset contains len(.tx) + len(.allx) = <font color= Blue> 2708 </font> nodes. <br> We use len(.x) = <font color= Blue> 140 </font> nodes for training, len(.tx) = <font color= Blue> 1000 </font> for testing. For validation we use <font color= Blue> 500 </font> nodes from .allx that follow 140 training nodes.

#### <u>Load and preprocess dataset </u>

In [1]:
import os
import pickle as pkl
import sys
from itertools import repeat

import numpy as np
import scipy.sparse as sp
import torch
from torch_geometric.data import Data


# Location of the raw Planetoid files, relative to the current working directory.
# The path is assembled from its components so that it resolves on every OS:
# a literal 'Cora\raw' is only a two-level path on Windows, elsewhere the
# backslash is an ordinary character of a single file name.
DATA_PATH = os.path.join(os.getcwd(), 'Cora', 'raw')

# Dataset name as it appears in the raw file names: ind.<dataset_str>.<extension>
dataset_str = 'cora'

In [2]:
# Read every pickled part of the dataset, keeping the order given by `names`.
# NOTE(review): unpickling can execute arbitrary code - only load files
# obtained from a trusted source.

names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']

# On Python 3 the pickles are decoded with latin1 (presumably because they were
# written by Python 2 - confirm); Python 2's loader takes no `encoding` argument.
load_kwargs = {'encoding': 'latin1'} if sys.version_info > (3, 0) else {}

objects = []
for suffix in names:
    part_path = os.path.join(DATA_PATH, "ind.{}.{}".format(dataset_str, suffix))
    with open(part_path, 'rb') as f:
        objects.append(pkl.load(f, **load_kwargs))

In [3]:
# Summarise what was loaded. Row counts are taken from `.shape` directly, so
# the sparse matrices are not converted to dense arrays just to be measured.
# Positions follow `names`: 0 -> x (train), 2 -> tx (test), 4 -> allx (non-test).
n_train_rows = objects[0].shape[0]
n_test_rows = objects[2].shape[0]
n_other_rows = objects[4].shape[0]

print('There are ' + str(len(objects)) + ' objects')
print('Type of objects:', type(objects[0]), end='\n\n')

print('There are ' + str(n_train_rows) + ' train objects', end='\n\n')

print('There are ' + str(n_test_rows) + ' test objects', end='\n\n')

print('There are ' + str(n_other_rows) + ' other objects', end='\n\n')

print('.tx + .allx shape is:', n_test_rows + n_other_rows, end=',\n')
print('which equals total number of instances')


There are 7 objects
Type of objects: <class 'scipy.sparse.csr.csr_matrix'>

There are 140 train objects

There are 1000 test objects

There are 1708 other objects

.tx + .allx shape is: 2708,
which equals total number of instances


In [4]:
# let us check indices of test nodes
index_file = os.path.join(DATA_PATH, "ind.{}.test.index".format(dataset_str))

# One integer node index per line. The file is opened in a `with` block so the
# handle is closed deterministically; blank lines (e.g. a trailing newline at
# the end of the file) are skipped instead of crashing int().
with open(index_file) as index_fh:
    test_index = [int(line) for line in index_fh if line.strip()]

print(len(test_index))
print('first 10 test inds: ', test_index[:10])
print('last 10 test inds: ', test_index[-10:])

1000
first 10 test inds:  [2692, 2532, 2050, 1715, 2362, 2609, 2622, 1975, 2081, 1767]
last 10 test inds:  [1885, 2305, 2354, 2135, 2601, 1770, 1995, 2504, 1749, 2157]


In [5]:
# The indices come in file order, which is not ascending.
# Keep `test_index` untouched and build an ascending copy next to it.
sorted_test_index = np.array(test_index)
sorted_test_index.sort()

head, tail = sorted_test_index[:10], sorted_test_index[-10:]
print('first 10 test inds (ordered):', head)
print('last 10 test inds (ordered):', tail)

first 10 test inds (ordered): [1708 1709 1710 1711 1712 1713 1714 1715 1716 1717]
last 10 test inds (ordered): [2698 2699 2700 2701 2702 2703 2704 2705 2706 2707]


In [6]:
# Give each loaded object its own name; the order matches the `names` list
# used when the files were read.
(x, y, tx, ty, allx, ally, graph) = objects

In [7]:
# merge all features data X in one matrix
# (shapes are read from the sparse matrices directly - no need to densify
# them just for printing)
print('allx shape:', allx.shape)
print('tx shape:', tx.shape, end='\n\n')

# non-test rows first, test rows after them; densified once, here
features = sp.vstack((allx, tx)).toarray()
print('merged features shape:', features.shape, end='\n\n')

# do the same for labels Y: stack the label rows in the same order and
# collapse each row to the position of its largest entry (the class id)
labels = np.vstack((ally, ty)).argmax(axis=-1)
print('merged labels shape:', labels.shape)

# sanity check: exactly one label per feature row
assert features.shape[0] == labels.shape[0], 'features and labels disagree on the number of nodes'


allx shape: (1708, 1433)
tx shape: (1000, 1433)

merged features shape: (2708, 1433)

merged labels shape: (2708,)


In [8]:
# The test rows were appended in index-file order, so row k of the test part is
# not yet at the position named by its node index. Move the row that currently
# sits at the k-th smallest test index to position test_index[k].
# `features` is a dense NumPy array here, so the right-hand side is an
# independent copy and overlapping source/target rows are safe.
# NOTE: this cell is NOT idempotent - running it a second time permutes the
# already reordered rows again. Re-run the merge cell first if needed.
target_rows, source_rows = test_index, sorted_test_index
features[target_rows] = features[source_rows]
labels[target_rows] = labels[source_rows]

#### <u>Create PyTorch Geometric Dataset </u>

In [9]:
# convert dicts of lists to edge indices
# (`torch` is imported in the import cell at the top of the notebook)

# Flatten {node: [neighbours]} into two parallel lists: row[i] -> col[i]
row, col = [], []
for node, neighbours in graph.items():
    row.extend(repeat(node, len(neighbours)))
    col.extend(neighbours)

edge_index = np.vstack([row, col])
print('edge_index shape (duplicated edges):', edge_index.shape, sep='\n')

# there are some duplicated edges in the original dataset. Lets delete them
# (np.unique over the last axis drops repeated columns and sorts the rest)
edge_index = np.unique(edge_index, axis=-1)
print('edge_index shape:', edge_index.shape, sep='\n')

# explicit dtype instead of the legacy typed constructor torch.LongTensor
edge_index = torch.as_tensor(edge_index, dtype=torch.long)
edge_index

edge_index shape (duplicated edges):
(2, 10858)
edge_index shape:
(2, 10556)


tensor([[   0,    0,    0,  ..., 2707, 2707, 2707],
        [ 633, 1862, 2582,  ...,  598, 1473, 2706]])

In [10]:
# convert to Torch tensors and create masks

# torch.as_tensor with an explicit dtype replaces the legacy typed constructors;
# it accepts NumPy arrays, lists and tensors, so this cell can be re-run.
features = torch.as_tensor(features, dtype=torch.float)
labels = torch.as_tensor(labels, dtype=torch.long)

num_nodes = labels.shape[0]
num_train = y.shape[0]   # size of the training split (.y)
NUM_VAL = 500            # number of validation nodes following the training ones

# use first len(y) elements for training
train_index = torch.arange(num_train, dtype=torch.long)
train_mask = torch.zeros(num_nodes, dtype=torch.bool)
train_mask[train_index] = True

# use next NUM_VAL elements for validation
val_index = torch.arange(num_train, num_train + NUM_VAL, dtype=torch.long)
val_mask = torch.zeros(num_nodes, dtype=torch.bool)
val_mask[val_index] = True

# use the elements listed in test_index for testing
# (note: `test_index` is rebound from a Python list to a tensor here)
test_index = torch.as_tensor(test_index, dtype=torch.long)
test_mask = torch.zeros(num_nodes, dtype=torch.bool)
test_mask[test_index] = True

In [11]:
# create PyTorch Geometric dataset
# (`Data` and `torch` are imported in the import cell at the top of the notebook)

# Bundle the edge list, node features, labels and the three split masks
# into a single graph object.
data = Data(edge_index=edge_index,
            x=features,
            y=labels,
            train_mask=train_mask,
            val_mask=val_mask,
            test_mask=test_mask)

# Persist the assembled graph in the current working directory.
torch.save(data, 'cora_dataset.pt')

data


Data(edge_index=[2, 10556], test_mask=[2708], train_mask=[2708], val_mask=[2708], x=[2708, 1433], y=[2708])