In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import functools
import tempfile

import numpy as np
import pytorch_lightning as pl
import torch
import datamol as dm

import goli

Using backend: pytorch
  warn("Tensorflow not installed; ParametricUMAP will be unavailable")


In [2]:
# Temporary location for a cache file. Demo only: in production, use a
# stable, known path instead of a fresh temp directory.
cache_data_path = pathlib.Path(tempfile.mkdtemp()) / "cache.pkl"

# Load the demo dataframe.
# (To preview it, make `df.head()` the last expression of its own cell; a bare
# expression in the middle of a cell is evaluated but never displayed.)
df = goli.data.load_tiny_zinc()

# Featurization options, forwarded to the datamodule under "featurization".
featurization_args = {
    "atom_property_list_float": [],  # e.g. ["weight", "valence"]
    "atom_property_list_onehot": ["atomic-number", "degree"],
    "edge_property_list": ["ring", "bond-type-onehot"],
    "add_self_loop": False,
    "use_bonds_weights": False,
    "explicit_H": False,
}

# Keyword arguments for the datamodule constructor.
dm_args = {
    "df": df,
    # Caching is unused at the moment; `cache_data_path` above is the value
    # that was previously passed here.
    "cache_data_path": None,
    "featurization": featurization_args,
    "smiles_col": "SMILES",
    "label_cols": ["SA"],
    "split_val": 0.2,
    "split_test": 0.2,
    "split_seed": 19,
    "train_val_batch_size": 16,
    "test_batch_size": 16,
    # NOTE(review): presumably -1 means "use all available cores" in goli;
    # a plain torch DataLoader rejects negative values -- confirm.
    "num_workers": -1,
    "pin_memory": True,
    "featurization_n_jobs": 16,
    "featurization_progress": True,
}

# NOTE(review): `dm` rebinds the `datamol as dm` alias from the import cell,
# so datamol is no longer reachable as `dm` after this line. The name is kept
# because the following cells call `dm.prepare_data()`, `dm.setup()`, etc.
dm = goli.data.DGLFromSmilesDataModule(**dm_args)
dm

name: DGLFromSmilesDataModule
len: 100
train_val_batch_size: 16
test_batch_size: 16
num_node_feats: 50
num_edge_feats: 6
collate_fn: goli_collate_fn
featurization:
  atom_property_list_float: []
  atom_property_list_onehot:
  - atomic-number
  - degree
  edge_property_list:
  - ring
  - bond-type-onehot
  add_self_loop: false
  use_bonds_weights: false
  explicit_H: false

In [3]:
# Load and prepare the data. With the configuration above, this logs the
# dataset size and shows a progress bar (see the captured output below).
dm.prepare_data()

# Create the split torch datasets. Must run after `prepare_data()`.
# NOTE(review): presumably uses the split_val / split_test / split_seed values
# passed to the datamodule -- confirm against goli.
dm.setup()

2021-03-17 08:54:46.513 | INFO     | goli.data.datamodule:prepare_data:173 - Prepare dataset with 100 data points.


  0%|          | 0/100 [00:00<?, ?it/s]

In [4]:
# Load a dataloader and get the first batch from it.
# `dl`, `it` and `batch` are kept as separate names so each step can be
# inspected (or `next(it)` called again) interactively.
dl = dm.train_dataloader()
it = iter(dl)
batch = next(it)
# Display the batch: in the captured output it is a dict with the keys
# 'smiles', 'features' and 'labels'.
batch

{'smiles': ['c1cc2c(cc1N[C@@H]1CCOC3(CCC3)C1)CCC2',
  'Cc1cc(C)cc(NC(=O)[C@@H](Sc2nnnn2C2CC2)c2ccccc2)c1',
  'Cc1cc(C(=O)N2CCN(C(=O)N[C@H]3CC(=O)N(C4CC4)C3)CC2)c(C)o1',
  'O=C(CNc1nc(C2CC2)no1)N1CCc2sccc2C1',
  'COc1ccc(Cl)cc1NC(=O)c1nn(C)cc1[N+](=O)[O-]',
  'C[NH2+]Cc1ccc(-c2cc(C)ccc2F)s1',
  'Cc1ocnc1CNC(=O)N(C)Cc1ccc(OC(F)F)cc1',
  'Cc1ccc(NC(=O)c2ccc(F)cc2F)cc1S(=O)(=O)Nc1ccc(Cl)cc1',
  'CCc1onc(C)c1NC(=O)CCCC(C)(C)C',
  'CCC[C@H](NC(N)=O)C(=O)NC[C@@H]1CCCO1',
  'C=CCn1c([S-])nnc1-c1sc(NC(=O)c2cccc(NC(C)=O)c2)nc1C',
  'CCOC[C@H](O)[C@](C)(CC)[NH+]1CCCC1',
  'COc1ccccc1[C@H](C)NC(=O)C[C@H]1C[NH2+]CCO1',
  'COC(C[C@@](C)(O)C[C@@H]1CCCN(S(C)(=O)=O)C1)OC',
  'O=c1c2ccccc2sn1-c1ncc(Br)s1',
  'COc1ccc2c(c1)[C@H]([NH2+][C@H](C)CCN1CCOCC1)CCCO2'],
 'features': Graph(num_nodes=341, num_edges=730,
       ndata_schemes={'feat': Scheme(shape=(50,), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(6,), dtype=torch.float32)}),
 'labels': tensor([[3.3631],
         [2.7085],
    