In [2]:
import torch
import os
print(torch.__version__)

1.10.0+cu111


In [None]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
!pip install torch-geometric
!pip install ogb

In [4]:
import pandas as pd
import torch.nn.functional as F
from torch.nn import ModuleList, BatchNorm1d
from torch_geometric.nn import GCNConv
import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

Load and preprocess dataset


In [5]:
dataset_name = "ogbn-arxiv"
dataset = PygNodePropPredDataset(name = dataset_name, transform=T.ToSparseTensor())

Downloading http://snap.stanford.edu/ogb/data/nodeproppred/arxiv.zip


Downloaded 0.08 GB: 100%|██████████| 81/81 [00:10<00:00,  7.70it/s]


Extracting dataset/arxiv.zip


Processing...


Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 1/1 [00:00<00:00, 8112.77it/s]


Converting graphs into PyG objects...


100%|██████████| 1/1 [00:00<00:00, 1697.41it/s]

Saving...



Done!


In [6]:
print(dataset[0])
data = dataset[0]
data.adj_t = data.adj_t.to_symmetric()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data = data.to(device)
split_idx = dataset.get_idx_split()
train_idx = split_idx['train'].to(device)

Data(x=[169343, 128], node_year=[169343, 1], y=[169343, 1], adj_t=[169343, 169343, nnz=1166243])


In [7]:
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers,
                 dropout, return_embeds=False):

        super(GCN, self).__init__()

        self.convs = ModuleList()
        for i in range(num_layers):
          if i == 0:
            self.convs.append(GCNConv(input_dim, hidden_dim))
          elif i == num_layers :
            self.convs.append(GCNConv(hidden_dim, output_dim))
          else:
            self.convs.append(GCNConv(hidden_dim, hidden_dim))

        self.bns = ModuleList()
        for i in range(num_layers -1):
         self.bns.append(BatchNorm1d(hidden_dim))

        self.softmax = torch.nn.LogSoftmax()
          
        self.dropout = dropout

        self.return_embeds = return_embeds

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        for bn in self.bns:
            bn.reset_parameters()

    def forward(self, x, adj_t):
        for i, conv in enumerate(self.convs):
          if i < len(self.convs) -1:
            x = conv(x, adj_t)
            x = self.bns[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout)
          else:
            if self.return_embeds:
              x = conv(x, adj_t)
            else:
              x = conv(x, adj_t)
              x = self.softmax(x)
        out = x

        return out

In [8]:
def train(model, data, train_idx, optimizer, loss_fn):
  model.train()
  loss = 0

  optimizer.zero_grad()
  output = model(data.x, data.adj_t)
  output_train = output[train_idx]
  output_train_label = data.y[train_idx]
  loss = loss_fn(output_train, output_train_label.reshape(-1))

  loss.backward()
  optimizer.step()

  return loss.item()


In [9]:
# Test function here
@torch.no_grad()
def test(model, data, split_idx, evaluator, save_model_results=False):
    model.eval()

    out = model(data.x, data.adj_t)

    y_pred = out.argmax(dim=-1, keepdim=True)

    train_acc = evaluator.eval({
        'y_true': data.y[split_idx['train']],
        'y_pred': y_pred[split_idx['train']],
    })['acc']
    valid_acc = evaluator.eval({
        'y_true': data.y[split_idx['valid']],
        'y_pred': y_pred[split_idx['valid']],
    })['acc']
    test_acc = evaluator.eval({
        'y_true': data.y[split_idx['test']],
        'y_pred': y_pred[split_idx['test']],
    })['acc']

    if save_model_results:
      print ("Saving Model Predictions")

      data = {}
      data['y_pred'] = y_pred.view(-1).cpu().detach().numpy()

      df = pd.DataFrame(data=data)
      # Save locally as csv
      df.to_csv('ogbn-arxiv_node.csv', sep=',', index=False)


    return train_acc, valid_acc, test_acc

In [10]:
args = {
      'device': device,
      'num_layers': 3,
      'hidden_dim': 256,
      'dropout': 0.5,
      'lr': 0.01,
      'epochs': 100,
}

In [11]:
model = GCN(data.num_features, args['hidden_dim'],
              dataset.num_classes, args['num_layers'],
              args['dropout']).to(device)
evaluator = Evaluator(name='ogbn-arxiv')

In [12]:
model.reset_parameters()
import copy

optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = F.nll_loss

best_model = None
best_valid_acc = 0

for epoch in range(1, 1 + args["epochs"]):
  loss = train(model, data, train_idx, optimizer, loss_fn)
  result = test(model, data, split_idx, evaluator)
  train_acc, valid_acc, test_acc = result
  if valid_acc > best_valid_acc:
      best_valid_acc = valid_acc
      best_model = copy.deepcopy(model)
  print(f'Epoch: {epoch:02d}, '
        f'Loss: {loss:.4f}, '
        f'Train: {100 * train_acc:.2f}%, '
        f'Valid: {100 * valid_acc:.2f}% '
        f'Test: {100 * test_acc:.2f}%')



Epoch: 01, Loss: 5.9345, Train: 24.34%, Valid: 28.74% Test: 25.88%
Epoch: 02, Loss: 3.1830, Train: 36.28%, Valid: 45.48% Test: 48.39%
Epoch: 03, Loss: 2.2471, Train: 36.88%, Valid: 36.69% Test: 40.46%
Epoch: 04, Loss: 1.8740, Train: 37.35%, Valid: 34.18% Test: 35.12%
Epoch: 05, Loss: 1.6785, Train: 34.59%, Valid: 28.41% Test: 27.04%
Epoch: 06, Loss: 1.5625, Train: 33.78%, Valid: 25.35% Test: 23.04%
Epoch: 07, Loss: 1.4677, Train: 34.15%, Valid: 26.19% Test: 23.55%
Epoch: 08, Loss: 1.3995, Train: 35.39%, Valid: 29.31% Test: 27.26%
Epoch: 09, Loss: 1.3547, Train: 37.86%, Valid: 33.35% Test: 31.50%
Epoch: 10, Loss: 1.3107, Train: 40.49%, Valid: 38.07% Test: 38.20%
Epoch: 11, Loss: 1.2791, Train: 43.48%, Valid: 42.82% Test: 44.99%
Epoch: 12, Loss: 1.2534, Train: 45.78%, Valid: 45.75% Test: 48.72%
Epoch: 13, Loss: 1.2324, Train: 47.81%, Valid: 47.25% Test: 50.41%
Epoch: 14, Loss: 1.2149, Train: 50.95%, Valid: 51.08% Test: 53.37%
Epoch: 15, Loss: 1.1970, Train: 53.73%, Valid: 55.07% Test: 56