In [1]:
from graphchem.datasets import load_cn
from sklearn.model_selection import train_test_split

# load cetane number data
smiles, cn = load_cn()

# create training/testing subsets
smiles_train, smiles_test, cn_train, cn_test = train_test_split(smiles, cn, test_size=0.2, random_state=42)

print(len(smiles_train), len(smiles_test), type(smiles))
print(len(cn_train), len(cn_test), type(cn), cn.shape, '\n')

for i in range(5):

    print(f'{smiles[i]}\t{cn[i]}')

368 92 <class 'list'>
368 92 <class 'torch.Tensor'> torch.Size([460, 1]) 

CC1=CC=C(O1)C(C2=CC=CO2)C3=CC=C(O3)C	tensor([25.5000])
CCCCC1=CC=CO1	tensor([13.1000])
C1CCOCC1	tensor([38.2000])
CC1=CC=C(C)O1	tensor([10.9000])
C1C=CCO1	tensor([15.6000])


In [2]:
from graphchem.data import MoleculeGraph, MoleculeDataset
from graphchem.preprocessing import MoleculeEncoder
from sklearn.model_selection import train_test_split

# construct encoder using training data
encoder = MoleculeEncoder(smiles_train)

# encode training and test data; each element in form (atom attr, bond attr, connectivity)
encoding_train = encoder.encode_many(smiles_train)
encoding_test = encoder.encode_many(smiles_test)

# construct graphs using encoded data
graphs_train = [MoleculeGraph(e[0], e[1], e[2], cn_train[i]) for i, e in enumerate(encoding_train)]
graphs_test = [MoleculeGraph(e[0], e[1], e[2], cn_test[i]) for i, e in enumerate(encoding_test)]

# construct datasets for training and testing graphs
ds_train = MoleculeDataset(graphs_train)
ds_test = MoleculeDataset(graphs_test)

print(len(ds_train), ds_train.num_node_features, ds_train.num_edge_features)
print(len(ds_test), ds_test.num_node_features, ds_test.num_edge_features)

368 16 6
92 16 6


In [3]:
from graphchem.nn import MoleculeGCN
import torch
import torch.nn.functional as F
from torch_geometric.loader import DataLoader

# prepare training set for training
train_loader = DataLoader(ds_train, batch_size=9, shuffle=True)

# construct untrained model
model = MoleculeGCN(
    ds_train.num_node_features,     # number of node/atom features per sample
    ds_train.num_edge_features,     # number of edge/bond features per sample
    1,                              # number of target variables/labels per sample
    embedding_dim=32,               # dimensionality of embedded atoms/bonds
    n_messages=2,                   # number of message-passing ops to perform (>= 1)
    n_readout=2,                    # number of post-graph-operation feed-forward layers (>= 1)
    readout_dim=32,                 # number of neurons in each readout layer (>= 1)
    dropout=0.0                     # neuron dropout rate [0.0, 1.0]
)

# construct torch optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# training loop
model.train()
for epoch in range(50):

    train_loss = 0.0
    for batch in train_loader:

        pred, _, _ = model(batch)
        target = batch.y
        loss = F.mse_loss(pred.reshape((-1,)), target)
        loss.backward()
        optimizer.step()
        train_loss += loss.detach().item()

    train_loss /= len(train_loader.dataset)

    if epoch % 5 == 0:
        print(f'{epoch}: {train_loss}')

# done training
model.eval()

0: 130.72978691432786
5: 131.01069183971572
10: 246.24754366667375
15: 230.95758587381115
20: 201.69607212232506
25: 158.48398772529933
30: 113.54577504033628
35: 82.00476961550505
40: 73.10060517684273
45: 87.86463422360627


MoleculeGCN(
  (emb_atom): Linear(in_features=16, out_features=32, bias=True)
  (emb_bond): Linear(in_features=6, out_features=32, bias=True)
  (atom_conv): MFConv(32, 32)
  (bond_conv): EdgeConv(nn=Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
  ))
  (atom_gru): GRU(32, 32)
  (bond_gru): GRU(32, 32)
  (readout): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
    )
    (1): Sequential(
      (0): Linear(in_features=32, out_features=32, bias=True)
    )
    (2): Sequential(
      (0): Linear(in_features=32, out_features=1, bias=True)
    )
  )
)