In [1]:
import sys, os
import tqdm
import numpy as np
import pandas as pd
from torch_geometric.loader import DataLoader
from torch_geometric.nn import global_mean_pool, global_add_pool, global_max_pool

from chem import *

In [2]:
# Notebook-wide settings.
#   batch_size   - mini-batch size handed to the training DataLoader.
#   dataset_name - label for the dataset; not referenced by the cells shown here.
batch_size, dataset_name = 32, 'bbbp'

In [3]:
# Load the three pre-made splits in train / val / test order.
# `load_dataset` is not defined in this notebook; presumably it is provided by
# the `from chem import *` star-import -- TODO confirm.
split_files = (
    'data_curation/train_set.csv',
    'data_curation/val_set.csv',
    'data_curation/test_set.csv',
)
train_dataset, val_dataset, test_dataset = map(load_dataset, split_files)

In [4]:
# Sanity check: display the first training example. In the recorded output its
# `x` has 25 columns, which matches the input width of the first GCNConv layer.
train_dataset[0]

Data(x=[19, 25], edge_index=[2, 39], y=[1])

In [5]:
# Training data is served in shuffled mini-batches of `batch_size` graphs.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# The validation and test loaders each deliver their whole split as a single
# batch (batch_size equals the split length) and are not shuffled.
val_loader, test_loader = (
    DataLoader(split, batch_size=len(split))
    for split in (val_dataset, test_dataset)
)

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [7]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for graph-level classification.

    Node features pass through two ``GCNConv`` layers (ReLU after each,
    dropout between them), are pooled into one row per graph, and are mapped
    by a linear layer to per-class log-probabilities.

    Every constructor argument is optional; the defaults reproduce the
    original hard-coded architecture (25 -> 16 -> 8 -> 2, dropout 0.5,
    global max pooling), so ``GCN()`` behaves exactly as before.

    Args:
        in_channels: Width of each input node feature vector.
        hidden_channels: Output width of the first graph convolution.
        out_channels: Output width of the second graph convolution.
        num_classes: Number of output classes.
        dropout: Dropout probability applied after the first convolution.
        pool: Callable ``pool(x, batch)`` that reduces node features to one
            row per graph, e.g. ``global_mean_pool`` or ``global_add_pool``.
            ``None`` selects ``global_max_pool``.
    """

    def __init__(self, in_channels=25, hidden_channels=16, out_channels=8,
                 num_classes=2, dropout=0.5, pool=None):
        super().__init__()
        # Attribute names are kept as-is so existing state dicts still load.
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.fc1 = torch.nn.Linear(out_channels, num_classes)
        self.dropout = dropout
        # Resolved here rather than in the signature so that defining the
        # class does not itself require the pooling function to be imported.
        self.pool = global_max_pool if pool is None else pool

    def forward(self, data):
        """Return per-graph class log-probabilities for a batch of graphs.

        Args:
            data: Batch object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            Tensor of log-probabilities (log-softmax over dimension 1) with
            one row per graph.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Only active in training mode; a no-op after model.eval().
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        # Collapse node-level features into one vector per graph.
        x = self.pool(x, data.batch)
        x = self.fc1(x)

        return F.log_softmax(x, dim=1)

In [8]:
# Seed PyTorch's global RNG before the model is built so that weight
# initialisation, dropout masks and the shuffled training order are repeatable
# across Restart & Run All.
# NOTE(review): some GPU kernels may still be non-deterministic; see the
# PyTorch reproducibility notes if bit-exact reruns on CUDA are required.
SEED = 0
torch.manual_seed(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
# Adam with weight decay over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

In [9]:
# Training schedule: total number of passes over the training set and how
# often (in epochs) the mean training loss is reported.
num_epochs = 2000
report_every = 100
num_train_graphs = len(train_dataset)

model.train()  # enable dropout for the whole training run
for epoch in range(num_epochs):
    running_loss = 0
    for batch in train_loader:
        batch = batch.to(device)

        optimizer.zero_grad()
        log_probs = model(batch)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        optimizer.step()

        # Weight each batch's loss by its graph count so the reported figure
        # is a per-graph mean even when the final batch is smaller.
        running_loss += batch.num_graphs * batch_loss.item()

    if (epoch + 1) % report_every == 0:
        print(f"epoch {epoch+1}: ", running_loss / num_train_graphs)

epoch 100:  0.35530440236146577
epoch 200:  0.33789590708249045
epoch 300:  0.3298635480595395
epoch 400:  0.33250348470022456
epoch 500:  0.3174979530051031
epoch 600:  0.3188555309335415
epoch 700:  0.3151752635754375
epoch 800:  0.308424481058837
epoch 900:  0.3057986029922268
epoch 1000:  0.30258301941073795
epoch 1100:  0.30622929281650474
epoch 1200:  0.30184741056016584
epoch 1300:  0.29578286481097316
epoch 1400:  0.3091522909833346
epoch 1500:  0.2929949573144673
epoch 1600:  0.3079016545592167
epoch 1700:  0.2980544527456996
epoch 1800:  0.30769997389312176
epoch 1900:  0.299716082070957
epoch 2000:  0.298374870304386


In [10]:
# Training-set accuracy: fraction of graphs whose predicted class equals the label.
model.eval()  # disable dropout for scoring
correct = 0
# Scoring needs no gradients; no_grad() avoids building the autograd graph.
with torch.no_grad():
    for data in train_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability per graph.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(data.y).sum().item()
print(correct / len(train_dataset))

0.891477621091355


In [11]:
# Test-set accuracy: fraction of graphs whose predicted class equals the label.
# NOTE(review): the recorded run shows test accuracy (~0.57) far below the
# train and validation figures (~0.89 / ~0.90); worth checking how the splits
# were built and how the labels are distributed in each.
model.eval()  # disable dropout for scoring
correct = 0
# The whole test split arrives as one batch, so skipping the autograd graph
# with no_grad() saves the most memory here.
with torch.no_grad():
    for data in test_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability per graph.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(data.y).sum().item()
print(correct / len(test_dataset))

0.5686274509803921


In [12]:
# Validation-set accuracy: fraction of graphs whose predicted class equals the label.
model.eval()  # disable dropout for scoring
correct = 0
# The whole validation split arrives as one batch; no_grad() keeps autograd
# from retaining activations for it.
with torch.no_grad():
    for data in val_loader:
        data = data.to(device)
        # Predicted class = index of the largest log-probability per graph.
        pred = model(data).argmax(dim=1)
        correct += pred.eq(data.y).sum().item()
print(correct / len(val_dataset))

0.8970588235294118
