# Advance PyTorch Geometric Tutorial
## Tutorial 1
#### Antonio Longa, 15 Nov 2021

# Open Graph Benchmark and PyG
original [code](https://github.com/snap-stanford/ogb/tree/master/examples/nodeproppred/arxiv) made by Matthias Fey

In [1]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [86]:
# import libraries 
import torch
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, SAGEConv


In [87]:
# define GCN
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 dropout):
        super(GCN, self).__init__()

        self.convs = torch.nn.ModuleList()
        self.convs.append(GCNConv(in_channels, hidden_channels, cached=True))
        self.bns = torch.nn.ModuleList()
        self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(
                GCNConv(hidden_channels, hidden_channels, cached=True))
            self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        self.convs.append(GCNConv(hidden_channels, out_channels, cached=True))

        self.dropout = dropout

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t):
        for i, conv in enumerate(self.convs[:-1]):
            x = conv(x, adj_t)
            x = self.bns[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, adj_t)
        return x.log_softmax(dim=-1)

In [114]:
#Define train and test
def train(model, data, train_idx, optimizer):
    model.train()

    optimizer.zero_grad()
    out = model(data.x, data.adj_t)[train_idx]
    loss = F.nll_loss(out, data.y.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()

    return loss.item()




@torch.no_grad()
def test(model, data, split_idx, evaluator):
    model.eval()

    out = model(data.x, data.adj_t)
    y_pred = out.argmax(dim=-1, keepdim=True)
    
    # update the evaluator
    train_acc = evaluator.eval({'y_true': data.y[split_idx['train']],
                                'y_pred': y_pred[split_idx['train']],
                               })['acc']
    valid_acc = evaluator.eval({'y_true': data.y[split_idx['valid']],
                                'y_pred': y_pred[split_idx['valid']],
                               })['acc']
    test_acc = evaluator.eval({'y_true': data.y[split_idx['test']],
                               'y_pred': y_pred[split_idx['test']],
                              })['acc']

    return train_acc, valid_acc, test_acc

In [89]:
# define the device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device)

### OGB get the dataset

In [102]:
#From node property prediction import :
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator


# download the dataset
dataset = PygNodePropPredDataset(name='ogbn-arxiv',transform=T.ToSparseTensor())
# I have already downloaded the dataset, so it is loaded from my pc

Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [00:14<00:00,  5.48it/s]


Extracting dataset/arxiv.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 8035.07it/s]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 2880.70it/s]

Saving...



Done!


In [103]:
data = dataset[0]
data.adj_t = data.adj_t.to_symmetric()
data = data.to(device) # move the data to the device

In [104]:
split_idx = dataset.get_idx_split()

In [105]:
# it is a Data object for PyG
data

Data(num_nodes=169343, x=[169343, 128], node_year=[169343, 1], y=[169343, 1], adj_t=[169343, 169343, nnz=2315598])

In [106]:
import pandas as pd

In [107]:
data.num_nodes

169343

In [108]:
data_xr=pd.DataFrame((data.x).numpy())
data_x

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,-0.057943,-0.052530,-0.072603,-0.026555,0.130435,-0.241386,-0.449242,-0.018443,-0.087218,0.112320,...,0.211490,-0.226118,-0.185603,0.053230,0.332873,0.104175,0.007408,0.173364,-0.172796,-0.140059
1,-0.124500,-0.070665,-0.325202,0.007779,-0.001559,0.074189,-0.191013,0.049689,0.026369,0.099364,...,0.106316,0.052926,-0.258378,0.021567,0.281503,-0.173423,0.202082,0.068524,-0.372111,-0.301036
2,-0.080242,-0.023328,-0.183787,-0.180707,0.075765,-0.125818,-0.394573,-0.219078,-0.108931,0.056966,...,0.019453,-0.070291,-0.177562,-0.214012,0.182186,-0.121589,-0.073642,0.109919,0.117589,-0.139883
3,-0.145044,0.054915,-0.126666,0.039971,-0.055909,-0.101278,-0.339202,-0.115801,-0.080058,-0.001633,...,-0.065752,0.042735,0.066338,-0.226921,0.188418,-0.017295,0.063449,0.017816,0.085364,-0.081804
4,-0.071154,0.070766,-0.281432,-0.161892,-0.165246,-0.029116,-0.338593,-0.138727,0.100015,0.132794,...,-0.056130,0.047475,-0.263795,0.026462,0.376349,-0.253772,0.084472,0.098033,-0.075347,-0.111687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169338,-0.321351,-0.039335,-0.011121,0.024434,-0.010934,-0.125384,-0.490196,0.170707,0.065013,0.136422,...,0.189792,-0.138938,-0.281634,-0.031904,0.435017,0.036357,-0.126336,0.069767,-0.003338,-0.242013
169339,-0.151212,-0.124702,-0.221386,-0.089370,-0.013997,-0.067091,-0.234838,-0.063793,0.208427,0.134944,...,-0.119714,-0.107352,-0.213279,-0.178972,0.307967,0.008814,0.104214,0.120340,-0.062759,-0.316333
169340,-0.220530,-0.036568,-0.402199,0.023286,-0.073210,-0.039768,-0.302267,0.170705,0.145221,0.063539,...,0.066331,-0.163570,-0.023279,-0.027070,0.369248,0.131152,0.333832,0.113360,-0.161393,-0.145171
169341,-0.138236,0.040885,-0.251811,-0.225074,0.250449,-0.216719,-0.342307,0.174997,0.169612,0.370719,...,0.074614,-0.054579,0.075444,0.084619,0.481700,0.004581,0.096515,-0.089290,-0.041253,-0.376132


In [111]:
data_node_year=pd.DataFrame((data.node_year).numpy())
data_node_year

Unnamed: 0,0
0,2013
1,2015
2,2014
3,2014
4,2014
...,...
169338,2020
169339,2020
169340,2020
169341,2020


In [112]:
data_y=pd.DataFrame((data.y).numpy())
data_y

Unnamed: 0,0
0,4
1,5
2,28
3,8
4,27
...,...
169338,4
169339,24
169340,10
169341,4


In [113]:
#data_adj_t=pd.DataFrame((data.adj_t).numpy())
data.adj_t

SparseTensor(row=tensor([     0,      0,      0,  ..., 169341, 169342, 169342]),
             col=tensor([   411,    640,   1162,  ..., 163274,  27824, 158981]),
             size=(169343, 169343), nnz=2315598, density=0.01%)

### PygNodePropPredDataset allows to get datasets.
### Are there other datasets? (for node property predictions)

In [8]:
dataset2 = PygNodePropPredDataset(name='ogbn')

ValueError: Invalid dataset name ogbn.
Available datasets are as follows:
ogbn-proteins
ogbn-products
ogbn-arxiv
ogbn-mag
ogbn-papers100M

# Instanciate the GNN

In [115]:
train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

In [125]:
hidden_channels = 64
num_layers = 2
dropout = 0.5 
epochs = 50
print_steps = 1


split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

model = GCN(data.num_features, hidden_channels,
            dataset.num_classes, num_layers,
            dropout).to(device)

# Evaluator

In [126]:
evaluator = Evaluator(name='ogbn-arxiv')

evaluator

<ogb.nodeproppred.evaluate.Evaluator at 0x7f33073e8a50>

In [127]:
print(evaluator.expected_input_format) 

==== Expected input format of Evaluator for ogbn-arxiv
{'y_true': y_true, 'y_pred': y_pred}
- y_true: numpy ndarray or torch tensor of shape (num_node, num_task)
- y_pred: numpy ndarray or torch tensor of shape (num_node, num_task)
where y_pred stores predicted class label (integer),
num_task is 1, and each row corresponds to one node.



In [128]:
print(evaluator.expected_output_format)

==== Expected output format of Evaluator for ogbn-arxiv
{'acc': acc}
- acc (float): Accuracy score averaged across 1 task(s)



### NOTE:
we have different evaluators for node property predictions, graphs predictions and link predictions

In [130]:
# link property prediction
from ogb.linkproppred import Evaluator as ev
ev(name='ogbl-ppa').expected_input_format

"==== Expected input format of Evaluator for ogbl-ppa\n{'y_pred_pos': y_pred_pos, 'y_pred_neg': y_pred_neg}\n- y_pred_pos: numpy ndarray or torch tensor of shape (num_edge, ). Torch tensor on GPU is recommended for efficiency.\n- y_pred_neg: numpy ndarray or torch tensor of shape (num_edge, ). Torch tensor on GPU is recommended for efficiency.\ny_pred_pos is the predicted scores for positive edges.\ny_pred_neg is the predicted scores for negative edges.\nNote: As the evaluation metric is ranking-based, the predicted scores need to be different for different edges."

In [131]:
# link property prediction
from ogb.graphproppred import Evaluator as ev
ev(name='ogbg-molesol').expected_input_format

"==== Expected input format of Evaluator for ogbg-molesol\n{'y_true': y_true, 'y_pred': y_pred}\n- y_true: numpy ndarray or torch tensor of shape (num_graph, num_task)\n- y_pred: numpy ndarray or torch tensor of shape (num_graph, num_task)\nwhere num_task is 1, and each row corresponds to one graph.\nnan values in y_true are ignored during evaluation.\n"

### END NOTE.

## Train as usual 

In [133]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(1, 1 + epochs):
    loss = train(model, data, train_idx, optimizer)
    result = test(model, data, split_idx, evaluator)

    if epoch % print_steps == 0:
        train_acc, valid_acc, test_acc = result
        print(f'Epoch: {epoch:02d}, '
              f'Loss: {loss:.4f}, '
              f'Train: {100 * train_acc:.2f}%, '
              f'Valid: {100 * valid_acc:.2f}% '
              f'Test: {100 * test_acc:.2f}%')


Epoch: 01, Loss: 3.8387, Train: 3.13%, Valid: 3.50% Test: 3.05%
Epoch: 02, Loss: 3.7599, Train: 6.91%, Valid: 11.72% Test: 10.12%
Epoch: 03, Loss: 3.6827, Train: 10.27%, Valid: 20.56% Test: 18.83%
Epoch: 04, Loss: 3.6160, Train: 11.34%, Valid: 23.01% Test: 21.72%
Epoch: 05, Loss: 3.5406, Train: 11.57%, Valid: 23.51% Test: 22.15%
Epoch: 06, Loss: 3.4743, Train: 12.00%, Valid: 23.75% Test: 22.22%
Epoch: 07, Loss: 3.4187, Train: 14.14%, Valid: 24.40% Test: 22.72%
Epoch: 08, Loss: 3.3502, Train: 17.28%, Valid: 25.72% Test: 23.49%
Epoch: 09, Loss: 3.2883, Train: 20.09%, Valid: 26.78% Test: 24.07%
Epoch: 10, Loss: 3.2392, Train: 22.03%, Valid: 27.60% Test: 24.58%
Epoch: 11, Loss: 3.1865, Train: 23.41%, Valid: 28.13% Test: 25.04%
Epoch: 12, Loss: 3.1393, Train: 24.44%, Valid: 28.61% Test: 25.48%
Epoch: 13, Loss: 3.0874, Train: 25.27%, Valid: 28.95% Test: 25.82%
Epoch: 14, Loss: 3.0547, Train: 25.98%, Valid: 29.19% Test: 26.07%
Epoch: 15, Loss: 2.9981, Train: 26.52%, Valid: 29.46% Test: 26.33%

In [16]:
pwd

'/home/kangjunekoo/MachineLearning/MachineLearning/MyStudy/AdvancePyTorchGeometricTutorials-main/Tutorial1/.ipynb_checkpoints'

In [17]:
ls

[0m[01;34mdataset[0m/  Tutorial_1-checkpoint.ipynb


In [18]:
cd dataset/ogbn_arxiv

/home/kangjunekoo/MachineLearning/MachineLearning/MyStudy/AdvancePyTorchGeometricTutorials-main/Tutorial1/.ipynb_checkpoints/dataset/ogbn_arxiv


In [19]:
pwd

'/home/kangjunekoo/MachineLearning/MachineLearning/MyStudy/AdvancePyTorchGeometricTutorials-main/Tutorial1/.ipynb_checkpoints/dataset/ogbn_arxiv'

In [20]:
ls

[0m[01;34mmapping[0m/  [01;34mprocessed[0m/  [01;34mraw[0m/  RELEASE_v1.txt  [01;34msplit[0m/


In [21]:
cd mapping

/home/kangjunekoo/MachineLearning/MachineLearning/MyStudy/AdvancePyTorchGeometricTutorials-main/Tutorial1/.ipynb_checkpoints/dataset/ogbn_arxiv/mapping


In [30]:
gzip -d nodeidx2paperid.csv.gz

SyntaxError: invalid syntax (2234638568.py, line 1)