In [1]:
import os
import sys

import numpy as np
import pandas as pd
import torch
import tqdm
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool
from torch_geometric.utils import degree

from chem import *

In [2]:
# Notebook-level configuration.
dataset_name: str = 'bbbp'  # label for the dataset being modelled
batch_size: int = 32        # graphs per mini-batch in the training loader

In [3]:
# Curated train / validation / test splits, read in that order.
# NOTE(review): `load_dataset` is not defined in this notebook; it presumably
# comes from the `chem` star import — confirm.
split_files = (
    'data_curation/train_set.csv',
    'data_curation/val_set.csv',
    'data_curation/test_set.csv',
)
train_dataset, val_dataset, test_dataset = map(load_dataset, split_files)

In [4]:
# Peek at the first training graph to check its node-feature, edge-index and label layout.
train_dataset[0]

Data(x=[19, 25], edge_index=[2, 39], y=[1])

In [5]:
# Training batches hold `batch_size` graphs and are reshuffled on every pass.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Validation and test sets are each served as one batch spanning the whole split.
val_loader, test_loader = (
    DataLoader(split, batch_size=len(split))
    for split in (val_dataset, test_dataset)
)

In [6]:
# Build the in-degree histogram over every node of the training graphs; it is
# later passed to PNAConv as its `deg` argument.
# Each graph's degree vector is computed once and reused for both the maximum
# and the histogram (previously `degree` was evaluated twice per graph).
node_in_degrees = [
    degree(data.edge_index[1], num_nodes=data.num_nodes, dtype=torch.long)
    for data in train_dataset
]

# Maximum in-degree in the training data (-1 when there are no nodes at all).
# A graph without nodes yields an empty degree vector, whose .max() would
# raise, so such graphs are skipped here.
max_degree = max(
    (int(d.max()) for d in node_in_degrees if d.numel() > 0),
    default=-1,
)

# In-degree histogram tensor: deg[k] = number of training nodes with in-degree k.
deg = torch.zeros(max_degree + 1, dtype=torch.long)
for d in node_in_degrees:
    deg += torch.bincount(d, minlength=deg.numel())

In [7]:
import torch
from torch.nn import Embedding
import torch.nn.functional as F
from torch_geometric.nn import PNAConv

In [8]:
class GNN(torch.nn.Module):
    """Graph-level classifier: two PNAConv layers, sum pooling, two linear layers.

    All constructor arguments are optional; the defaults reproduce the
    original hard-coded architecture (25 -> 50 -> 50 -> 16 -> 2).

    Args:
        in_channels: Size of each input node-feature vector.
        hidden_channels: Output size of both PNAConv layers.
        fc_channels: Width of the hidden fully connected layer.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the first convolution
            (0.5 is the default of ``F.dropout``).
        degree_hist: In-degree histogram passed to PNAConv as ``deg``.
            When ``None``, the notebook-level ``deg`` tensor is used.
    """

    def __init__(self, in_channels=25, hidden_channels=50, fc_channels=16,
                 num_classes=2, dropout=0.5, degree_hist=None):
        super().__init__()

        # Keep the original behaviour of reading the notebook-level histogram
        # when the caller does not supply one.
        if degree_hist is None:
            degree_hist = deg

        aggregators = ['mean', 'min', 'max', 'std']
        scalers = ['identity', 'amplification', 'attenuation']

        self.dropout = dropout
        # Layers are created in the same order as before so that parameter
        # initialisation consumes the RNG identically.
        self.conv1 = PNAConv(in_channels, hidden_channels, aggregators=aggregators,
                             scalers=scalers, deg=degree_hist)
        self.conv2 = PNAConv(hidden_channels, hidden_channels, aggregators=aggregators,
                             scalers=scalers, deg=degree_hist)
        self.fc1 = torch.nn.Linear(hidden_channels, fc_channels)
        self.fc2 = torch.nn.Linear(fc_channels, num_classes)

    def forward(self, data):
        """Return per-graph log-probabilities (``log_softmax`` over dim 1).

        Args:
            data: Batch object providing ``x``, ``edge_index`` and ``batch``.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)

        # Sum pooling turns node embeddings into one vector per graph.
        # global_mean_pool / global_max_pool are imported in this notebook
        # and can be swapped in here to try other readouts.
        x = global_add_pool(x, data.batch)

        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return F.log_softmax(x, dim=1)

In [9]:
# Seed PyTorch's RNG before the model is built so that weight initialisation,
# dropout masks and the training loader's shuffling order are repeatable
# under Restart & Run All.
# NOTE(review): some CUDA kernels may still be nondeterministic; see the
# PyTorch reproducibility notes if bit-exact GPU reruns are required.
SEED = 42
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN().to(device)
# Adam with L2 regularisation via weight_decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

In [10]:
def _run_training_epoch():
    """Run one optimisation pass over `train_loader`.

    Returns the sum of the per-batch NLL losses, each weighted by the number
    of graphs in its batch.
    """
    weighted_loss = 0
    for batch in train_loader:
        batch = batch.to(device)

        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(batch), batch.y)
        batch_loss.backward()
        optimizer.step()

        weighted_loss += batch.num_graphs * batch_loss.item()
    return weighted_loss


model.train()
for epoch in range(2000):
    loss_all = _run_training_epoch()
    # Report the mean per-graph training loss on every 100th epoch.
    if (epoch+1)%100 == 0:
        print (f"epoch {epoch+1}: ",loss_all / len(train_dataset))

epoch 100:  0.22332453071742173
epoch 200:  0.18454181597066174
epoch 300:  0.20065308170841925
epoch 400:  0.16557663328292396
epoch 500:  0.16263432703012343
epoch 600:  0.1555648597515777
epoch 700:  0.14564069212944797
epoch 800:  0.12686451187141465
epoch 900:  0.13398934659564662
epoch 1000:  0.11953985004941062
epoch 1100:  0.115348051518549
epoch 1200:  0.10903394995183027
epoch 1300:  0.1198790487265163
epoch 1400:  0.11506041448625007
epoch 1500:  0.11076088095281403
epoch 1600:  0.1063582906413853
epoch 1700:  0.11177230491585559
epoch 1800:  0.11394343757231055
epoch 1900:  0.09664029649965314
epoch 2000:  0.10464523940739422


In [11]:
# Accuracy on the training set.
model.eval()  # switch off dropout for evaluation
correct = 0
# Inference needs no gradients; no_grad() skips building the autograd graph.
with torch.no_grad():
    for data in train_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability.
        pred = model(data).argmax(dim=1)
        correct += (pred == data.y).sum().item()
print (correct / len(train_dataset))

0.9619865113427345


In [12]:
# Accuracy on the test set.
model.eval()  # switch off dropout for evaluation
correct = 0
# The test loader serves the whole split as one batch, so skipping autograd
# bookkeeping with no_grad() avoids holding a graph for every test molecule.
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability.
        pred = model(data).argmax(dim=1)
        correct += (pred == data.y).sum().item()
print (correct / len(test_dataset))

0.5784313725490197


In [13]:
# Accuracy on the validation set.
model.eval()  # switch off dropout for evaluation
correct = 0
# The validation loader serves the whole split as one batch, so skipping
# autograd bookkeeping with no_grad() avoids holding a graph for all of it.
with torch.no_grad():
    for data in val_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability.
        pred = model(data).argmax(dim=1)
        correct += (pred == data.y).sum().item()
print (correct / len(val_dataset))

0.8970588235294118
