In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import functools
import tempfile

import numpy as np
import pytorch_lightning as pl
import torch
import datamol as dm

import goli

Using backend: pytorch


In [4]:
# Build the demo datamodule: load the default config, attach the tiny ZINC
# dataframe, then featurize and split it.
config = goli.load_config(name="micro_zinc_default")

# Caching is disabled for this demo, so featurization runs on every execution.
# To enable it, point `cache_data_path` at a file in a known location, e.g.
# `pathlib.Path("<some_dir>") / "cache.pkl"`. Previously a temporary directory
# was created with `tempfile.mkdtemp()` and then immediately discarded by
# resetting the path to None, leaving an unused directory on disk per run.
cache_data_path = None

# Load a dataframe
df = goli.data.load_tiny_zinc()

# Start from the datamodule arguments in the config (copied into a plain dict
# so the config object itself is not modified) and override the demo-specific
# entries.
dm_args = dict(config.data.args)
dm_args.update(
    df=df,
    cache_data_path=cache_data_path,
    featurization_n_jobs=0,
    featurization_progress=True,
)

# NOTE(review): `dm` rebinds the `datamol` alias from the import cell
# (`import datamol as dm`). The name is kept because the following cell reads
# `dm`; consider renaming one of the two so datamol stays reachable.
dm = goli.data.DGLFromSmilesDataModule(**dm_args)
dm.prepare_data()
dm.setup()

# Bare last expression: display the datamodule summary as the cell output.
dm

2021-03-17 10:09:54.960 | INFO     | goli.data.datamodule:prepare_data:174 - Prepare dataset with 100 data points.


  0%|          | 0/100 [00:00<?, ?it/s]

name: DGLFromSmilesDataModule
len: 100
train_val_batch_size: 16
test_batch_size: 16
num_node_feats: 50
num_edge_feats: 6
collate_fn: goli_collate_fn
featurization:
  atom_property_list_float: []
  atom_property_list_onehot:
  - atomic-number
  - degree
  edge_property_list:
  - ring
  - bond-type-onehot
  add_self_loop: false
  use_bonds_weights: false
  explicit_H: false

In [5]:
# Pull a single batch from the training dataloader to inspect what the
# datamodule produces. `dm` here is the datamodule built in the previous cell
# (it shadows the `datamol` import alias of the same name).
dl = dm.train_dataloader()
# Keep the iterator in its own variable so further batches can be drawn
# with `next(it)` without recreating it.
it = iter(dl)
batch = next(it)
# Bare last expression: display the batch (shown below as a dict with a
# 'smiles' list and a batched 'features' graph).
batch

{'smiles': ['Cc1ncc(-c2ccc(S(=O)(=O)NC3CCCCCC3)s2)o1',
  'Cc1ccc(-c2nc3ccc(C)c(C)c3[nH]2)nc1',
  'CC[NH+](C/C=C/c1ccc(C#N)cc1)C[C@H](C)C#N',
  'CNC(=O)[C@@H]1CCC[NH+]1Cc1ccc(C)c(F)c1',
  'C=CCSc1nnc(C2CCOCC2)n1N',
  'COc1ccc(-c2noc3ncnc(N4CCC[C@H](C(=O)[O-])C4)c23)cc1',
  'CC#CCCC(=O)Nc1cccc2c1C(=O)c1ccccc1C2=O',
  'CC(=O)c1c(C)[nH]c(C(=O)OCC(=O)N2C[C@H](C)C[C@@H](C)C2)c1C',
  'CN1C(=O)N[C@@H](c2cccc([N+](=O)[O-])c2)C2=C1CN(c1ccc(F)cc1)C2=O',
  'CC1(C)CCC[C@H]1n1c(N)[nH+]c2ccccc21',
  'Cc1ccc(NC(=O)c2ccc(F)cc2F)cc1S(=O)(=O)Nc1ccc(Cl)cc1',
  'CC(=O)N1c2ccc(S(=O)(=O)N3CCCC3)cc2C[C@H]1C(=O)NCC[NH+](C)C1CCCCC1',
  'CCOc1ccc(C[NH+]2CCS[C@H]3COCC[C@@H]32)cc1OC',
  'O=c1c2c3nc4ccccc4nc3n(CCC3=CCCCC3)c2ncn1C[C@H]1CCCO1',
  'CCc1onc(C)c1NC(=O)CCCC(C)(C)C',
  'CC(C)(C)OC(=O)N[C@H]1CCN(c2cc(-c3cccs3)n[nH]2)C1'],
 'features': Graph(num_nodes=369, num_edges=800,
       ndata_schemes={'feat': Scheme(shape=(50,), dtype=torch.float32)}
       edata_schemes={'feat': Scheme(shape=(6,), dtype=torch.float