In [1]:
import os
os.environ["KERAS_BACKEND"] = "torch" # Comment out for tensorflow backend

from molexpress import layers
from molexpress.datasets import features
from molexpress.datasets import encoders

from rdkit import Chem

import torch

## 1. Features

In [2]:
mol = Chem.MolFromSmiles('CCO')

print(features.AtomType(vocab={'O'}, oov=False)(mol.GetAtoms()[0]))
print(features.AtomType(vocab={'O'}, oov=True)(mol.GetAtoms()[0]))
print(features.AtomType(vocab={'C', 'O'}, oov=False)(mol.GetAtoms()[0]))
print(features.AtomType(vocab={'C', 'O', 'N'}, oov=False)(mol.GetAtoms()[0]))
print(features.AtomType(vocab={'C', 'O', 'N'}, oov=True)(mol.GetAtoms()[0]))

[0.]
[0. 1.]
[1. 0.]
[1. 0. 0.]
[1. 0. 0. 0.]


## 2. Featurizer

In [3]:
atom_featurizer = features.Compose([
    features.AtomType({'C', 'O', 'N'}),
    features.Hybridization(),
])

bond_featurizer = features.Compose([
    features.BondType()
])

print(mol.GetAtoms()[0].GetSymbol(), atom_featurizer(mol.GetAtoms()[0]))
print(mol.GetBonds()[0].GetBondType(), bond_featurizer(mol.GetBonds()[0]))

C [1. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
SINGLE [0. 0. 1. 0.]


## 3. Encoder

In [4]:
encoder = encoders.MolecularGraphEncoder(
    atom_featurizer=atom_featurizer, 
    bond_featurizer=bond_featurizer,
    self_loops=True # adds one dim to edge state
)

encoder(mol)

{'node_state': array([[1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.]], dtype=float32),
 'edge_src': array([0, 0, 1, 1, 1, 2, 2], dtype=int32),
 'edge_dst': array([0, 1, 0, 1, 2, 1, 2], dtype=int32),
 'edge_state': array([[0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1.]], dtype=float32)}

## 4. Dataset

In [5]:
x_dummy = ['CC', 'CC', 'CCO', 'CCCN']
y_dummy = [1., 2., 3., 4.]


class TinyDataset(torch.utils.data.Dataset):
    
    def __init__(self, x, y):
        self.x = x
        self.y = y
        
    def __len__(self):
        return len(self.x)
        
    def __getitem__(self, index):
        x = self.x[index]
        y = self.y[index]
        x = encoder(x)
        return x, y

torch_dataset = TinyDataset(x_dummy, y_dummy)

dataset = torch.utils.data.DataLoader(
    torch_dataset, batch_size=2, collate_fn=encoder._collate_fn)

for x, y in dataset:
    print(f'x = {x}\ny = {y}', end='\n' + '---' * 30 + '\n')

x = {'node_state': array([[1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.]], dtype=float32), 'edge_state': array([[0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1.]], dtype=float32), 'edge_src': array([0, 0, 1, 1, 2, 2, 3, 3]), 'edge_dst': array([0, 1, 0, 1, 2, 3, 2, 3]), 'graph_indicator': array([0, 0, 1, 1])}
y = [1. 2.]
------------------------------------------------------------------------------------------
x = {'node_state': array([[1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [

## 5. Model

In [6]:
class TinyGCNModel(torch.nn.Module):

    def __init__(self):
        super().__init__()

        self.gcn1 = layers.GINConv(32)
        self.gcn2 = layers.GINConv(32)
        self.readout = layers.Readout()
        self.linear = torch.nn.Linear(32, 1)

    def forward(self, x):
        x = self.gcn1(x)
        x = self.gcn2(x)
        x = self.readout(x)
        x = self.linear(x)
        return x

model = TinyGCNModel().to('cuda')

## 6. Fit

In [7]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)
loss_fn = torch.nn.MSELoss()

for _ in range(30):
    loss_sum = 0.
    for x, y in dataset:
        optimizer.zero_grad()
    
        outputs = model(x)
    
        y = torch.tensor(y, dtype=torch.float32).to('cuda')
        loss = loss_fn(outputs, y[:, None])
        loss.backward()
        optimizer.step()

        loss_sum += loss
        
    print(loss_sum)

tensor(8.3608, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.9535, device='cuda:0', grad_fn=<AddBackward0>)
tensor(7.2538, device='cuda:0', grad_fn=<AddBackward0>)
tensor(6.3826, device='cuda:0', grad_fn=<AddBackward0>)
tensor(5.4477, device='cuda:0', grad_fn=<AddBackward0>)
tensor(4.5354, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.7063, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.9972, device='cuda:0', grad_fn=<AddBackward0>)
tensor(2.4239, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9859, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.6715, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.4622, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.3367, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.2734, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.2528, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.2586, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.2779, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.3014, device='cuda:0', grad_fn=<AddBack