In [1]:
import numpy as np
import tensorflow as tf
import deepchem as dc
import pandas as pd
from rdkit import Chem



In [2]:
import os
import deepchem as dc
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union

# Label columns requested from the CSV. The two-task alternative
# ['amg', 'cisplatin'] is currently disabled; only 'cisplatin' is loaded.
TASKS = ['cisplatin']


class _DataLoader(_MolnetLoader):
    """Loader for the combined cisplatin CSV, built on ``_MolnetLoader``.

    Only ``create_dataset`` is overridden; every other behavior is inherited
    from the base class.
    """

    def create_dataset(self) -> Dataset:
        """Featurize ``combined_cisplatin_data.csv.gz`` from ``self.data_dir``.

        A ``dc.data.CSVLoader`` is configured with ``self.tasks`` as the task
        list, ``"smiles"`` as the feature field and ``self.featurizer`` as the
        featurizer.

        Returns
        -------
        Dataset
            The result of ``CSVLoader.create_dataset`` on the CSV file,
            requested with ``shard_size=8192``.
        """
        csv_loader = dc.data.CSVLoader(
            featurizer=self.featurizer,
            feature_field="smiles",
            tasks=self.tasks,
        )
        csv_path = os.path.join(self.data_dir, "combined_cisplatin_data.csv.gz")
        return csv_loader.create_dataset(csv_path, shard_size=8192)


def load_data(
        featurizer: Union[dc.feat.Featurizer, str] = 'ECFP',
        splitter: Union[dc.splits.Splitter, str, None] = 'scaffold',
        transformers: List[Union[TransformerGenerator, str]] = ['balancing'],
        reload: bool = True,
        data_dir: Optional[str] = None,
        save_dir: Optional[str] = None,
        **kwargs
    ) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
    """Load the 'combined' dataset through ``_DataLoader``.

    Parameters
    ----------
    featurizer
        Featurizer instance or name, forwarded to ``_DataLoader``.
    splitter
        Splitter instance, name, or ``None``, forwarded to ``_DataLoader``.
    transformers
        Transformer generators or names, forwarded to ``_DataLoader``.
        The default list object is shared between calls; this function
        does not modify it.
    reload
        Passed as the second argument to ``load_dataset``.
    data_dir
        Directory handed to the loader; ``_DataLoader.create_dataset`` reads
        the CSV from the loader's ``data_dir``.
    save_dir
        Directory handed to the loader as its save location.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``_DataLoader``.

    Returns
    -------
    tuple
        Whatever ``load_dataset('combined', reload)`` returns; annotated as
        ``(tasks, datasets, transformers)``.
    """
    molnet_loader = _DataLoader(
        featurizer, splitter, transformers, TASKS, data_dir, save_dir, **kwargs
    )
    loaded = molnet_loader.load_dataset('combined', reload)
    return loaded

In [3]:
from featurizer import MolGraphConvFeaturizer

# Seed for the random stratified split, so that a fresh kernel reproduces the
# same train/valid/test membership.
SPLIT_SEED = 42

# Single location used both for reading the raw CSV and for the loader's
# saved output (the same path was previously spelled out twice).
CISPLATIN_DIR = 'data/combined/cisplatin/'

# Graph featurizer with edge, chirality and partial-charge features enabled.
featurizer = MolGraphConvFeaturizer(
    use_edges=True, use_chirality=True, use_partial_charge=True
)

splitter = dc.splits.RandomStratifiedSplitter()

# load_data() gives no way to pass a seed to the splitter, so seed NumPy's
# global RNG right before the split happens.
# NOTE(review): assumes RandomStratifiedSplitter draws from np.random when no
# seed is supplied -- confirm against the installed DeepChem version.
# NOTE(review): load_data defaults to reload=True; if a previously saved split
# is picked up from save_dir, the seed presumably has no effect on that run.
np.random.seed(SPLIT_SEED)

combined_tasks, datasets, transformers = load_data(
    featurizer=featurizer, splitter=splitter,
    data_dir=CISPLATIN_DIR, save_dir=CISPLATIN_DIR
)
train_dataset, valid_dataset, test_dataset = datasets

Failed to featurize datapoint 590, [Hg+2]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
Failed to featurize datapoint 597, [Se]. Appending empty array
Exception message: zero-size array to reduction operation maximum which has no identity
  return array(a, dtype, copy=False, order=order)


In [4]:
# Number of samples in each split.
split_sizes = (len(train_dataset), len(valid_dataset), len(test_dataset))
print('train/val/test split: {}/{}/{}'.format(*split_sizes))

# Feature widths are read from the first training graph only.
first_graph = train_dataset.X[0]
print('num_node_features: {}'.format(first_graph.num_node_features))
print('num_edge_features: {}'.format(first_graph.num_edge_features))

train/val/test split: 490/61/61
num_node_features: 79
num_edge_features: 12


In [5]:
# Persist the three splits and the fitted transformers under one directory.
# NOTE(review): this is the same directory passed to load_data as save_dir;
# confirm whether save_dataset_to_disk copies or relocates the datasets'
# on-disk data, since that may affect what load_data(reload=True) finds later.
output_dir = 'data/combined/cisplatin/'
dc.utils.save_dataset_to_disk(
    output_dir,
    train_dataset,
    valid_dataset,
    test_dataset,
    transformers,
)