In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import functools
import tempfile

import numpy as np
import lightning
import torch
import datamol as dm

import graphium

Using backend: pytorch


In [6]:
# Cache file for the featurized datamodule. It is placed at a fixed name inside
# the system temp directory: portable (no user-specific home directory baked
# in) yet stable across runs, so a re-run can exercise the cache-reload path.
# Only for demo purposes, use a known path in prod.
cache_data_path = str(pathlib.Path(tempfile.gettempdir()) / "test-cache.pkl")

# Load a dataframe
df = graphium.data.load_tiny_zinc()

# Setup the featurization
featurization_args = {
    "atom_property_list_onehot": ["atomic-number", "valence"],
    "atom_property_list_float": ["mass", "electronegativity", "in-ring"],
    "edge_property_list": ["bond-type-onehot", "stereo", "in-ring"],
    "add_self_loop": False,
    "use_bonds_weights": False,
    "explicit_H": False,
}

# Config for datamodule
dm_args = {
    # Input data and where to cache its featurized form
    "df": df,
    "cache_data_path": cache_data_path,
    "featurization": featurization_args,
    # Which dataframe columns hold the molecules and the targets
    "smiles_col": "SMILES",
    "label_cols": ["SA"],
    # Train/val/test split
    "split_val": 0.2,
    "split_test": 0.2,
    "split_seed": 19,
    # Dataloader settings
    "batch_size_training": 16,
    "batch_size_inference": 16,
    "num_workers": 0,
    "pin_memory": True,
    # Featurization execution settings
    "featurization_n_jobs": 16,
    "featurization_progress": True,
}

datam = graphium.data.DGLFromSmilesDataModule(**dm_args)

In [7]:
# Load and prepare the data. Per the log output recorded for this cell, this
# first tries to reload the datamodule from `cache_data_path`; when the cached
# featurizer arguments differ from the provided ones, the data is processed
# again (progress bar) and the prepared datamodule is written back to the cache.
datam.prepare_data()

# Create the split torch datasets
datam.setup()

2021-04-30 14:19:26.972 | INFO     | graphium.data.datamodule:_load_from_cache:460 - Try reloading the data module from /home/hadim/test-cache.pkl.
2021-04-30 14:19:27.001 | INFO     | graphium.data.datamodule:_load_from_cache:485 - Cache featurizer arguments are different than the provided ones.
2021-04-30 14:19:27.001 | INFO     | graphium.data.datamodule:_load_from_cache:486 - Cache featurizer arguments: {'atom_property_list_onehot': ['atomic-number', 'valence'], 'atom_property_list_float': ['mass', 'electronegativity', 'in-ring'], 'edge_property_list': ['bond-type-onehot', 'stereo'], 'add_self_loop': False, 'explicit_H': False, 'use_bonds_weights': False, 'pos_encoding_as_features': None, 'pos_encoding_as_directions': None, 'dtype': torch.float32}
2021-04-30 14:19:27.002 | INFO     | graphium.data.datamodule:_load_from_cache:487 - Provided featurizer arguments: {'atom_property_list_onehot': ['atomic-number', 'valence'], 'atom_property_list_float': ['mass', 'electronegativity', 'in-

  0%|          | 0/100 [00:00<?, ?it/s]

2021-04-30 14:19:27.099 | INFO     | graphium.data.datamodule:_save_to_cache:433 - Write prepared datamodule to /home/hadim/test-cache.pkl


In [27]:
# Cache file for the featurized datamodule. It is placed at a fixed name inside
# the system temp directory: portable (no user-specific home directory baked
# in) yet stable across runs, so a re-run can exercise the cache-reload path.
# Only for demo purposes, use a known path in prod.
cache_data_path = str(pathlib.Path(tempfile.gettempdir()) / "test-cache.pkl")

# Load a dataframe
df = graphium.data.load_tiny_zinc()

# Setup the featurization: a reduced set of properties; every option that is
# not listed here is left to the featurizer's defaults.
featurization_args = {
    "atom_property_list_float": ["mass", "electronegativity"],
    "edge_property_list": ["stereo", "in-ring"],
}

# Config for datamodule (all other datamodule options keep their defaults)
dm_args = {
    "df": df,
    "cache_data_path": cache_data_path,
    "featurization": featurization_args,
}

# Build the datamodule, then prepare the data and create the split datasets.
datam = graphium.data.DGLFromSmilesDataModule(**dm_args)
datam.prepare_data()
datam.setup()

2021-04-30 14:35:26.199 | INFO     | graphium.data.datamodule:_load_from_cache:460 - Try reloading the data module from /home/hadim/test-cache.pkl.
2021-04-30 14:35:26.226 | INFO     | graphium.data.datamodule:_load_from_cache:485 - Cache featurizer arguments are different than the provided ones.
2021-04-30 14:35:26.227 | INFO     | graphium.data.datamodule:_load_from_cache:486 - Cache featurizer arguments: {'atom_property_list_onehot': [], 'atom_property_list_float': ['mass', 'electronegativity', 'in-ring'], 'edge_property_list': ['bond-type-onehot', 'stereo', 'in-ring'], 'add_self_loop': False, 'explicit_H': False, 'use_bonds_weights': False, 'pos_encoding_as_features': None, 'pos_encoding_as_directions': None, 'dtype': torch.float32}
2021-04-30 14:35:26.228 | INFO     | graphium.data.datamodule:_load_from_cache:487 - Provided featurizer arguments: {'atom_property_list_onehot': [], 'atom_property_list_float': ['mass', 'electronegativity'], 'edge_property_list': ['stereo', 'in-ring'],

In [26]:
# Sanity-check the feature dimensions of the current `datam`, i.e. the one built
# with the reduced featurization (atom floats: mass, electronegativity; edge
# properties: stereo, in-ring). The expected values are the ones recorded in
# this notebook's outputs for that datamodule; the previous expectations
# (3 and 13) belonged to an earlier featurization and fail on a top-to-bottom run.
assert datam.num_node_feats == 2, f"unexpected num_node_feats: {datam.num_node_feats}"
assert datam.num_edge_feats == 8, f"unexpected num_edge_feats: {datam.num_edge_feats}"

In [16]:
# Grab one training batch and pull out its featurized graphs for inspection.
# `batch` was not defined by any other cell, so this cell raised a NameError
# on a fresh kernel; it is now fetched explicitly from the datamodule.
# NOTE(review): assumes `datam` implements the standard LightningDataModule
# `train_dataloader()` hook and that each batch is a mapping with a
# "features" entry — confirm against the graphium datamodule.
batch = next(iter(datam.train_dataloader()))
g = batch["features"]

In [28]:
# Display the node-feature count reported by the current datamodule
# (the recorded output for the reduced featurization is 2).
datam.num_node_feats

2

In [29]:
# Display the edge-feature count reported by the current datamodule
# (the recorded output for the reduced featurization is 8).
datam.num_edge_feats

8