```yaml
# Example YAML configuration for the datamodule.
# NOTE(review): the keys under `args` match the keyword arguments that the
# Python cell of this notebook passes to `goli.data.DGLOGBDataModule`;
# presumably this YAML is read by a config loader -- confirm.
# NOTE(review): the values differ from the Python cell, which selects
# "ogbg-molpcba" and only "bond-type-onehot" edge features -- confirm intended.
data:
  module_type: "DGLOGBDataModule"
  args:
    # null: no cache file path is configured in this example.
    cache_data_path: null
  
    # OGB dataset identifier.
    dataset_name: "ogbg-moltox21"
  
    # Batch sizes; train/val share one setting, test has its own.
    batch_size_train_val: 16
    batch_size_test: 16
  
    # Featurization options (same keys as `featurization_args` in the Python cell).
    featurization:
      atom_property_list_float: []
      atom_property_list_onehot: ["atomic-number", "degree"]
      edge_property_list: ["ring", "bond-type-onehot"]
      add_self_loop: false
      use_bonds_weights: false
      explicit_H: false
```

In [1]:
%load_ext autoreload
%autoreload 2

import pathlib
import functools
import tempfile
import importlib

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import datamol as dm

import goli

Using backend: pytorch


In [13]:
# Candidate OGB dataset names; index 1 selects "ogbg-molpcba".
dataset_names = ["ogbg-molhiv", "ogbg-molpcba", "ogbg-moltox21"]
dataset_name = dataset_names[1]

# Throwaway cache file inside a freshly created temporary directory.
# Demo only: point this at a stable, known path in production.
cache_data_path = pathlib.Path(tempfile.mkdtemp(), "cache.pkl")

# Featurization options, declared in one literal (key order preserved).
featurization_args = {
    "atom_property_list_float": [],  # e.g. ["weight", "valence"]
    "atom_property_list_onehot": ["atomic-number", "degree"],
    "edge_property_list": ["bond-type-onehot"],
    "add_self_loop": False,
    "use_bonds_weights": False,
    "explicit_H": False,
}

# Keyword arguments forwarded to the datamodule constructor.
dm_args = {
    "dataset_name": dataset_name,
    "cache_data_path": cache_data_path,
    "featurization": featurization_args,
    "batch_size_train_val": 16,
    "batch_size_test": 16,
    "num_workers": 0,
    "pin_memory": True,
    "featurization_n_jobs": 16,
    "featurization_progress": True,
}

# Build the datamodule; the trailing bare name displays its repr as the cell output.
ds = goli.data.DGLOGBDataModule(**dm_args)
ds

2021-04-15 13:38:54.312 | INFO     | goli.data.datamodule:_load_dataset:573 - Downloading http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip to /home/hadim/.cache/goli/ogb/pcba.zip


  0%|          | 0.00/37.7M [00:00<?, ?B/s]

2021-04-15 13:39:42.467 | INFO     | goli.data.datamodule:_load_dataset:582 - Loading /home/hadim/.cache/goli/ogb/pcba/mapping/mol.csv.gz in memory.
2021-04-15 13:39:44.727 | INFO     | goli.data.datamodule:_load_dataset:595 - Saving splits to /home/hadim/.cache/goli/ogb/pcba/split/scaffold.csv.gz


dataset_name: ogbg-molpcba
name: DGLOGBDataModule
len: 437929
batch_size_train_val: 16
batch_size_test: 16
num_node_feats: 50
num_edge_feats: 5
collate_fn: goli_collate_fn
featurization:
  atom_property_list_float: []
  atom_property_list_onehot:
  - atomic-number
  - degree
  edge_property_list:
  - bond-type-onehot
  add_self_loop: false
  use_bonds_weights: false
  explicit_H: false

In [14]:
# Inspect the OGB metadata exposed by the datamodule through its `metadata` attribute
ds.metadata

{'num tasks': '128',
 'eval metric': 'ap',
 'download_name': 'pcba',
 'version': '1',
 'url': 'http://snap.stanford.edu/ogb/data/graphproppred/csv_mol_download/pcba.zip',
 'add_inverse_edge': 'True',
 'data type': 'mol',
 'has_node_attr': 'True',
 'has_edge_attr': 'True',
 'task type': 'binary classification',
 'num classes': '2',
 'split': 'scaffold',
 'additional node files': 'None',
 'additional edge files': 'None',
 'binary': 'False'}

In [None]:
# Load and prepare the data.
# NOTE(review): the log for ogbg-molpcba reports 437929 data points, so this
# step is presumably long-running on the full dataset -- confirm before re-running.
ds.prepare_data()

# Create the split torch datasets (run after `prepare_data()`)
ds.setup()

2021-04-15 13:46:36.444 | INFO     | goli.data.datamodule:prepare_data:291 - Prepare dataset with 437929 data points.


  0%|          | 0/437929 [00:00<?, ?it/s]

In [None]:
# Peek at the first item of `train_ds` (presumably the training split created by `setup()`)
ds.train_ds[0]

In [None]:
# Load a dataloader and get the first batch from it
dl = ds.train_dataloader()
# Keep the iterator in its own variable so further batches can be pulled with next(it)
it = iter(dl)
batch = next(it)
# Bare last expression: display the batch as the cell output
batch