In [1]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
!pip install torch-geometric
!pip install ogb 

Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 2.8 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 2.9 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.12
Collecting torch-geometric
  Downloading torch_geometric-2.0.3.tar.gz (370 kB)
[K     |████████████████████████████████| 370 kB 5.1 MB/s 
Collecting rdflib
  Downloading rdflib-6.1.1-py3-none-any.whl (482 kB)
[K     |████████████████████████████████| 482 kB 60.2

In [2]:
from ogb import graphproppred
from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
from torch_geometric.data import DataLoader
import torch
import torch.nn
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch.nn import ModuleList, BatchNorm1d
from torch_geometric.nn import GCNConv
import torch.nn.functional as F
from tqdm.notebook import tqdm

In [3]:
dataset = PygGraphPropPredDataset("ogbg-molhiv")
#create dataset object

Downloading http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/hiv.zip


Downloaded 0.00 GB: 100%|██████████| 3/3 [00:01<00:00,  2.82it/s]
Processing...


Extracting dataset/hiv.zip
Loading necessary files...
This might take a while.
Processing graphs...


100%|██████████| 41127/41127 [00:00<00:00, 90971.73it/s]


Converting graphs into PyG objects...


100%|██████████| 41127/41127 [00:01<00:00, 31381.15it/s]


Saving...


Done!


In [4]:
split_idx = dataset.get_idx_split()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
#merge data object from Dataset to mini-batch, elements in batches are processed paralel
train_loader = DataLoader(dataset[split_idx["train"]], batch_size=32, shuffle=True, num_workers=0)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=32, shuffle=False, num_workers=0)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=32, shuffle=False, num_workers=0)



In [6]:
class GCN(torch.nn.Module):
  def __init__(self, dropout, num_layers , input_dim, hidden_dim, output_dim, return_embeds):
    super(GCN, self).__init__()
    self.convs = ModuleList()
    for i in range(num_layers):
          if i == 0:
            self.convs.append(GCNConv(input_dim, hidden_dim))
          elif i == num_layers :
            self.convs.append(GCNConv(hidden_dim, output_dim))
          else:
            self.convs.append(GCNConv(hidden_dim, hidden_dim))

    self.bns = ModuleList()
    for i in range(num_layers -1):
      self.bns.append(BatchNorm1d(hidden_dim))

    self.softmax = torch.nn.LogSoftmax()
    self.dropout = dropout
    self.return_embeds = return_embeds


  def reset_parameters(self):
    for conv in self.convs:
      conv.reset_parameters()
    for bns in self.bns:
      bns.reset_parameters()

  def forward(self, x, adj_t):
    for i, conv in enumerate(self.convs):
      if i == len(self.convs) -1:
        if self.return_embeds:
          x= conv(x, adj_t)
        else:
          x = conv(x, adj_t)
          x = self.softmax(x)
      else:
        x = conv(x, adj_t)
        x = self.bns[i](x)
        x = F.relu(x)
        x = F.dropout(x, p=self.dropout)
    out = x
    return out



In [7]:
args = {
    'device': device,
    'num_layers' : 5,
    'hidden_dim' : 256,
    'dropout': 0.5,
    'lr': 0.001,
    'epochs': 5
}

In [8]:
from torch_geometric.nn import global_add_pool, global_mean_pool
class GCN_Graph(torch.nn.Module):
  def __init__(self,hidden_dim, output_dim, num_layers, dropout):
    super(GCN_Graph, self).__init__()
    #encode atoms 
    self.node_encoder = AtomEncoder(hidden_dim)
    #our GCN layer te get us embeddings of nodes after
    self.node_em_model = GCN(dropout, num_layers, hidden_dim, hidden_dim, hidden_dim, return_embeds = True)
    #create graph level embeddings that can be used to predict properties for the each graph
    self.pool = global_mean_pool
    
    self.linear = torch.nn.Linear(hidden_dim, output_dim)

  def reset_parameters(self):
    self.node_em_model.reset_parameters()
    self.linear.reset_parameters()

  def forward(self, batched_data):
    x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch

    embed = self.node_encoder(x)
    #calculate representations of nodes
    x = self.node_em_model(embed, edge_index)
    #pool
    x = self.pool(x, batch)
    x = self.linear(x)

    return x


In [9]:
def train(model, device, data_loader, optimizer, loss_fn):
  model.train()
  loss = 0

  for step, batch in enumerate(tqdm(data_loader)):
    batch = batch.to(device)
    if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
      pass
    else:
      is_labeled = batch.y == batch.y
      optimizer.zero_grad()
      output = model(batch)
      output = output[is_labeled]
      labels = batch.y[is_labeled].float()
      loss = loss_fn(output, labels)
      loss.backward()
      optimizer.step()
  
  return loss.item()


In [10]:
def eval(model, device, loader, evaluator):
  model.eval()
  y_true = []
  y_pred = []

  for step, batch in enumerate(tqdm(loader)):
    batch = batch.to(device)
    if batch.x.shape[0] == 1:
      pass
    else: 
      with torch.no_grad():
        pred = model(batch)
      y_true.append(batch.y.view(pred.shape).detach().cpu())
      y_pred.append(pred.detach().cpu())


  y_true = torch.cat(y_true, dim = 0).numpy()
  y_pred = torch.cat(y_pred, dim = 0).numpy()

  input_dict = {"y_true": y_true, "y_pred": y_pred}

  return evaluator.eval(input_dict)

In [11]:
model = GCN_Graph(args['hidden_dim'],
              dataset.num_tasks, args['num_layers'],
              args['dropout']).to(device)
evaluator = Evaluator(name='ogbg-molhiv')

In [12]:
import copy
model.reset_parameters()
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])
loss_fn = torch.nn.BCEWithLogitsLoss()

best_model = None
best_valid_acc = 0

for epoch in range(1, 1 + args["epochs"]):
  print('Training...')
  loss = train(model,device, train_loader, optimizer, loss_fn)

  print('Evaluating...')
  train_result = eval(model, device, train_loader, evaluator)
  val_result = eval(model, device, valid_loader, evaluator)
  test_result = eval(model, device, test_loader, evaluator)

  train_acc, valid_acc, test_acc = train_result[dataset.eval_metric], val_result[dataset.eval_metric], test_result[dataset.eval_metric]
  if valid_acc > best_valid_acc:
      best_valid_acc = valid_acc
      best_model = copy.deepcopy(model)
  print(f'Epoch: {epoch:02d}, '
        f'Loss: {loss:.4f}, '
        f'Train: {100 * train_acc:.2f}%, '
        f'Valid: {100 * valid_acc:.2f}% '
        f'Test: {100 * test_acc:.2f}%')

Training...


  0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating...


  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

Epoch: 01, Loss: 0.9952, Train: 71.41%, Valid: 71.57% Test: 68.69%
Training...


  0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating...


  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

Epoch: 02, Loss: 0.0915, Train: 68.76%, Valid: 68.84% Test: 60.86%
Training...


  0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating...


  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

Epoch: 03, Loss: 0.0349, Train: 74.65%, Valid: 68.91% Test: 62.89%
Training...


  0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating...


  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

Epoch: 04, Loss: 0.0346, Train: 75.90%, Valid: 75.14% Test: 74.46%
Training...


  0%|          | 0/1029 [00:00<?, ?it/s]

Evaluating...


  0%|          | 0/1029 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

  0%|          | 0/129 [00:00<?, ?it/s]

Epoch: 05, Loss: 0.0418, Train: 77.15%, Valid: 76.00% Test: 72.76%
