In [1]:
import sys, os
import tqdm
import numpy as np
import pandas as pd
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool

from chem import *

In [2]:
# Mini-batch size; passed to the training DataLoader further down.
batch_size = 32
# Dataset identifier; not referenced by any of the visible cells.
dataset_name = 'bbbp'

In [3]:
# Load the three pre-split CSV files (train / validation / test).
# NOTE(review): load_dataset is presumably provided by `from chem import *` — confirm.
train_dataset = load_dataset('data_curation/train_set.csv')
val_dataset = load_dataset('data_curation/val_set.csv')
test_dataset = load_dataset('data_curation/test_set.csv')

In [4]:
# Display the first training example as a quick sanity check of the loaded data.
train_dataset[0]

Data(x=[19, 25], edge_index=[2, 39], y=[1])

In [5]:
# Training loader: mini-batches of `batch_size` graphs, reshuffled (shuffle=True).
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation and test loaders use batch_size == dataset length, i.e. each split
# is served as one single batch.
val_loader = DataLoader(val_dataset, batch_size=len(val_dataset))
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv

In [7]:
class GNN(torch.nn.Module):
    """Graph-level classifier: two GATConv layers, a graph readout and a linear head.

    All constructor arguments are optional; calling ``GNN()`` builds the same
    25 -> 16 -> 8 -> 2 network with sum pooling and dropout p=0.5 that this
    notebook used when the sizes were hard-coded.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Output width of the first GATConv layer.
        out_channels: Output width of the second GATConv layer (and the
            input width of the linear head).
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the first layer
            (only active while the module is in training mode).
        pool: Callable ``pool(x, batch)`` that reduces node embeddings to one
            vector per graph, e.g. ``global_mean_pool`` or ``global_max_pool``.
            ``None`` selects ``global_add_pool``.
    """

    def __init__(self, in_channels=25, hidden_channels=16, out_channels=8,
                 num_classes=2, dropout=0.5, pool=None):
        super().__init__()
        self.dropout_p = dropout
        # Resolved here rather than in the signature so the default is looked
        # up when the model is built, not when the class is defined.
        self.pool = global_add_pool if pool is None else pool
        self.conv1 = GATConv(in_channels, hidden_channels)
        self.conv2 = GATConv(hidden_channels, out_channels)
        self.fc1 = torch.nn.Linear(out_channels, num_classes)

    def forward(self, data):
        """Return per-graph log-probabilities of shape ``[num_graphs, num_classes]``.

        Args:
            data: A batch object exposing ``x``, ``edge_index`` and ``batch``.
        """
        x, edge_index = data.x, data.edge_index

        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, p=self.dropout_p, training=self.training)
        x = F.relu(self.conv2(x, edge_index))

        # Collapse node embeddings into one embedding per graph.
        x = self.pool(x, data.batch)
        x = self.fc1(x)

        # log-softmax output pairs with F.nll_loss in the training loop.
        return F.log_softmax(x, dim=1)

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GNN().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

In [9]:
# Optimise the model for 2000 epochs, reporting the mean per-graph training
# loss every 100th epoch.
model.train()
for epoch in range(2000):

    # Sum of per-graph losses seen during this epoch.
    epoch_loss_sum = 0
    for data in train_loader:
        data = data.to(device)

        optimizer.zero_grad()
        log_probs = model(data)
        loss = F.nll_loss(log_probs, data.y)
        loss.backward()
        optimizer.step()

        # nll_loss averages over the batch, so weight it by the batch's graph
        # count to make the epoch average exact even for a short last batch.
        epoch_loss_sum += data.num_graphs * loss.item()

    if (epoch + 1) % 100 != 0:
        continue
    print(f"epoch {epoch+1}: ", epoch_loss_sum / len(train_dataset))

epoch 100:  0.29366533482900653
epoch 200:  0.2751535235519865
epoch 300:  0.27289926106764595
epoch 400:  0.2528318396417149
epoch 500:  0.26844480195197495
epoch 600:  0.2578838173594671
epoch 700:  0.25117631225517234
epoch 800:  0.26232494345331103
epoch 900:  0.25511411661059485
epoch 1000:  0.24852789171921702
epoch 1100:  0.2548897322575641
epoch 1200:  0.24244078767219254
epoch 1300:  0.24094933434032648
epoch 1400:  0.2536930391424496
epoch 1500:  0.25929546414672927
epoch 1600:  0.2420114583394631
epoch 1700:  0.24420059491965318
epoch 1800:  0.25139978230182264
epoch 1900:  0.24738661995115344
epoch 2000:  0.24462707766464778


In [10]:
# Accuracy on the training split.
model.eval()  # disables dropout
correct = 0
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for data in train_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability per graph.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(data.y).sum().item()
print(correct / len(train_dataset))

0.8516247700797057


In [11]:
# Accuracy on the test split.
model.eval()  # disables dropout
correct = 0
# Inference only: the whole split arrives as one batch, so avoiding the
# autograd graph matters for memory here.
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability per graph.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(data.y).sum().item()
print(correct / len(test_dataset))

0.6274509803921569


In [12]:
# Accuracy on the validation split.
model.eval()  # disables dropout
correct = 0
# Inference only: the whole split arrives as one batch, so avoiding the
# autograd graph matters for memory here.
with torch.no_grad():
    for data in val_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability per graph.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(data.y).sum().item()
print(correct / len(val_dataset))

0.8725490196078431
