From 354d66a29005676805943c95f26503014fa6cfe8 Mon Sep 17 00:00:00 2001
From: Bharath Ramsundar
Date: Tue, 7 Jul 2020 18:50:33 -0700
Subject: [PATCH] Changes

---
 deepchem/splits/splitters.py | 423 +++++++++++++++++++++++++++--------
 docs/index.rst               |   1 +
 docs/tutorial.rst            |  84 +++++++
 3 files changed, 413 insertions(+), 95 deletions(-)
 create mode 100644 docs/tutorial.rst

diff --git a/deepchem/splits/splitters.py b/deepchem/splits/splitters.py
index 869448a3e4..9885bd1957 100644
--- a/deepchem/splits/splitters.py
+++ b/deepchem/splits/splitters.py
@@ -24,7 +24,12 @@ def generate_scaffold(smiles, include_chirality=False):
-  """Compute the Bemis-Murcko scaffold for a SMILES string."""
+  """Compute the Bemis-Murcko scaffold for a SMILES string.
+
+  Note
+  ----
+  This function requires `rdkit` to be installed.
+  """
   from rdkit import Chem
   mol = Chem.MolFromSmiles(smiles)
   engine = ScaffoldGenerator(include_chirality=include_chirality)
@@ -43,36 +48,34 @@ def randomize_arrays(array_list):
 
 
 class Splitter(object):
+  """Splitters split up Datasets into pieces for training/validation/testing.
+
+  In machine learning applications, it's often necessary to split up a
+  dataset into training/validation/test sets, or to k-fold split a dataset
+  (that is, divide it into k equal subsets) for cross-validation. The
+  `Splitter` class is an abstract superclass for all splitters that captures
+  the common API across splitter classes.
+
+  Note that `Splitter` is an abstract superclass. You won't want to
+  instantiate this class directly. Rather you will want to use a concrete
+  subclass for your application.
   """
-  Abstract base class for chemically aware splits..
-  """
 
   def k_fold_split(self, dataset, k, directories=None, **kwargs):
     """
     Parameters
     ----------
-    dataset: Dataset
-      Dataset to do a k-fold split
-
+    dataset: `dc.data.Dataset`
+      Dataset to do a k-fold split
     k: int
-      number of folds
-
-    directories: list of str
-      list of length 2*k filepaths to save the result disk-datasets
-
-    kwargs
+      Number of folds to split `dataset` into.
+    directories: list[str]
+      List of length 2*k filepaths to save the result disk-datasets.
 
     Returns
     -------
-    list of length k tuples of (train, cv)
-
-    """
-    """
-    :param dataset:
-    :param k:
-    :param directories:
-    :param kwargs:
-    :return: list of length k tuples of (train, cv)
+    List of length k of tuples `(train, cv)` where `train` and `cv` are
+    both `Dataset` objects.
     """
     logger.info("Computing K-fold split")
     if directories is None:
@@ -127,7 +130,43 @@ def train_valid_test_split(self,
                              **kwargs):
     """
     Splits self into train/validation/test sets.
 
-    Returns Dataset objects.
+    Returns Dataset objects for train, valid, test.
+
+    Parameters
+    ----------
+    dataset: data-like object
+      Dataset to be split. This should either be of type
+      `dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
+      convert into a `Dataset`.
+    train_dir: str, optional
+      If specified, the directory in which the generated
+      training dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    valid_dir: str, optional
+      If specified, the directory in which the generated
+      valid dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    test_dir: str, optional
+      If specified, the directory in which the generated
+      test dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    seed: int, optional (default None)
+      Random seed to use.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    Train, validation, and test datasets as `dc.data.Dataset` objects.
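+
+    Examples
+    --------
+    A minimal sketch of the intended usage, shown with a concrete subclass
+    since `Splitter` itself is abstract (outputs not shown; the split
+    contents are random):
+
+    >>> import numpy as np
+    >>> import deepchem as dc
+    >>> X = np.random.rand(100, 10)
+    >>> dataset = dc.data.NumpyDataset(X)
+    >>> splitter = dc.splits.RandomSplitter()
+    >>> train, valid, test = splitter.train_valid_test_split(dataset)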
     """
     logger.info("Computing train/valid/test indices")
     train_inds, valid_inds, test_inds = self.split(
@@ -163,7 +202,33 @@ def train_test_split(self,
                        frac_train=.8,
                        **kwargs):
     """Splits self into train/test sets.
-    Returns Dataset objects.
+
+    Returns Dataset objects for train/test.
+
+    Parameters
+    ----------
+    dataset: data-like object
+      Dataset to be split. This should either be of type
+      `dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
+      convert into a `Dataset`.
+    train_dir: str, optional
+      If specified, the directory in which the generated
+      training dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    test_dir: str, optional
+      If specified, the directory in which the generated
+      test dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    seed: int, optional (default None)
+      Random seed to use.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+
+    Returns
+    -------
+    Train and test datasets as dc.data.Dataset objects.
     """
     valid_dir = tempfile.mkdtemp()
     train_dataset, _, test_dataset = self.train_valid_test_split(
@@ -186,37 +251,61 @@ def split(self,
             frac_test=None,
             log_every_n=None,
             **kwargs):
-    """
-    Stub to be filled in by child classes.
+    """Return indices for the specified split.
+
+    Parameters
+    ----------
+    dataset: dc.data.Dataset
+      Dataset to be split.
+    seed: int, optional (default None)
+      Random seed to use.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers)
+    for the various splits.
     """
     raise NotImplementedError
 
 
 class RandomGroupSplitter(Splitter):
+  """Random split based on groupings.
 
-  def __init__(self, groups, *args, **kwargs):
-    """
-    A splitter class that splits on groupings. An example use case is when there
-    are multiple conformations of the same molecule that share the same topology.
-    This splitter subsequently guarantees that resulting splits preserve groupings.
+  A splitter class that splits on groupings. An example use case is when
+  there are multiple conformations of the same molecule that share the same
+  topology. This splitter thus guarantees that resulting splits preserve
+  groupings.
 
-  Note that it doesn't do any dynamic programming or something fancy to try to
-  maximize the choice such that frac_train, frac_valid, or frac_test is maximized.
-  It simply permutes the groups themselves. As such, use with caution if the number
-  of elements per group varies significantly.
+  Note that it doesn't do any dynamic programming or anything fancy to try
+  to maximize the choice such that frac_train, frac_valid, or frac_test is
+  maximized. It simply permutes the groups themselves. As such, use with
+  caution if the number of elements per group varies significantly.
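+
+  Examples
+  --------
+  A minimal sketch, assuming nine datapoints spread across five groups
+  (outputs not shown since the group permutation is random):
+
+  >>> import numpy as np
+  >>> import deepchem as dc
+  >>> groups = [3, 2, 2, 0, 1, 1, 2, 4, 3]
+  >>> dataset = dc.data.NumpyDataset(np.random.rand(9, 3))
+  >>> splitter = dc.splits.RandomGroupSplitter(groups)
+  >>> train_inds, valid_inds, test_inds = splitter.split(dataset)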
+  """
+
+  def __init__(self, groups, *args, **kwargs):
+    """Initialize this object.
 
     Parameters
     ----------
     groups: array like list of hashables
       An auxiliary array indicating the group of each item.
-      Eg:
-      g: 3 2 2 0 1 1 2 4 3
-      X: 0 1 2 3 4 5 6 7 8
+      Eg:
+      g: 3 2 2 0 1 1 2 4 3
+      X: 0 1 2 3 4 5 6 7 8
 
-      Eg:
-      g: a b b e q x a a r
-      X: 0 1 2 3 4 5 6 7 8
+      Eg:
+      g: a b b e q x a a r
+      X: 0 1 2 3 4 5 6 7 8
     """
     self.groups = groups
@@ -229,6 +318,29 @@ def split(self,
                frac_valid=.1,
                frac_test=.1,
                log_every_n=None):
+    """Return indices for the specified split.
+
+    Parameters
+    ----------
+    dataset: dc.data.Dataset
+      Dataset to be split.
+    seed: int, optional (default None)
+      Random seed to use.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers)
+    for the various splits.
+    """
     assert len(self.groups) == dataset.X.shape[0]
     np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
@@ -267,8 +379,7 @@ def split(self,
 
 class RandomStratifiedSplitter(Splitter):
-  """
-  RandomStratified Splitter class.
+  """RandomStratified Splitter class.
 
   For sparse multitask datasets, a standard split offers no guarantees
   that the splits will have any active compounds. This class guarantees
@@ -368,8 +479,48 @@ def train_valid_test_split(self,
                              frac_test=.1,
                              seed=None,
                              log_every_n=1000):
-    """Custom split due to raggedness in original split.
-    """
+    """Splits self into train/validation/test sets.
+
+    Most splitters use the superclass implementation
+    `Splitter.train_valid_test_split` but this class has to override the
+    implementation to deal with potentially ragged splits.
+
+    Parameters
+    ----------
+    dataset: data-like object
+      Dataset to be split. This should either be of type
+      `dc.data.Dataset` or a type that `dc.utils.data.datasetify` can
+      convert into a `Dataset`.
+    train_dir: str, optional
+      If specified, the directory in which the generated
+      training dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    valid_dir: str, optional
+      If specified, the directory in which the generated
+      valid dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    test_dir: str, optional
+      If specified, the directory in which the generated
+      test dataset should be stored. This is only
+      considered if `isinstance(dataset, dc.data.DiskDataset)`
+      is True.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    seed: int, optional (default None)
+      Random seed to use.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    Train, validation, and test datasets as dc.data.Dataset objects.
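+
+    Examples
+    --------
+    An illustrative sketch on a small, randomly generated sparse multitask
+    dataset (outputs not shown):
+
+    >>> import numpy as np
+    >>> import deepchem as dc
+    >>> X = np.random.rand(100, 10)
+    >>> y = np.random.binomial(1, 0.1, size=(100, 4)).astype(float)
+    >>> w = np.random.binomial(1, 0.3, size=(100, 4)).astype(float)
+    >>> dataset = dc.data.DiskDataset.from_numpy(X, y, w)
+    >>> splitter = dc.splits.RandomStratifiedSplitter()
+    >>> train, valid, test = splitter.train_valid_test_split(dataset)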
+    """
     if train_dir is None:
       train_dir = tempfile.mkdtemp()
     if valid_dir is None:
@@ -414,22 +565,22 @@ def k_fold_split(self, dataset, k, directories=None, **kwargs):
 
 
 class SingletaskStratifiedSplitter(Splitter):
-  """
-  Class for doing data splits by stratification on a single task.
+  """Class for doing data splits by stratification on a single task.
 
-  Example:
+  Example
+  -------
 
-  >>> n_samples = 100
-  >>> n_features = 10
-  >>> n_tasks = 10
-  >>> X = np.random.rand(n_samples, n_features)
-  >>> y = np.random.rand(n_samples, n_tasks)
-  >>> w = np.ones_like(y)
-  >>> dataset = DiskDataset.from_numpy(np.ones((100,n_tasks)), np.ones((100,n_tasks)))
-  >>> splitter = SingletaskStratifiedSplitter(task_number=5)
-  >>> train_dataset, test_dataset = splitter.train_test_split(dataset)
+  >>> n_samples = 100
+  >>> n_features = 10
+  >>> n_tasks = 10
+  >>> X = np.random.rand(n_samples, n_features)
+  >>> y = np.random.rand(n_samples, n_tasks)
+  >>> w = np.ones_like(y)
+  >>> dataset = DiskDataset.from_numpy(X, y, w)
+  >>> splitter = SingletaskStratifiedSplitter(task_number=5)
+  >>> train_dataset, test_dataset = splitter.train_test_split(dataset)
 
-  """
+  """
 
   def __init__(self, task_number=0):
     """
@@ -495,28 +646,28 @@ def split(self,
               frac_test=.1,
               log_every_n=None):
     """
-    Splits compounds into train/validation/test using stratified sampling.
-
-    Parameters
-    ----------
-    dataset: dc.data.Dataset object
-      Dataset.
-    seed: int (Optional, Default None)
-      Random seed.
-    frac_train: float (Optional, Default .8)
-      Fraction of dataset put into training data.
-    frac_valid: float (Optional, Default .1)
-      Fraction of dataset put into validation data.
-    frac_test: float (Optional, Default .1)
-      Fraction of dataset put into test data.
-    log_every_n: int (Optional, Default None)
-      Log every n examples (not currently used).
-
-    Returns
-    -------
-    retval: Tuple
-      Tuple containing train indices, valid indices, and test indices
-    """
+    Splits compounds into train/validation/test using stratified sampling.
+
+    Parameters
+    ----------
+    dataset: dc.data.Dataset object
+      Dataset.
+    seed: int (Optional, Default None)
+      Random seed.
+    frac_train: float (Optional, Default .8)
+      Fraction of dataset put into training data.
+    frac_valid: float (Optional, Default .1)
+      Fraction of dataset put into validation data.
+    frac_test: float (Optional, Default .1)
+      Fraction of dataset put into test data.
+    log_every_n: int (Optional, Default None)
+      Log every n examples (not currently used).
+
+    Returns
+    -------
+    retval: Tuple
+      Tuple containing train indices, valid indices, and test indices.
+    """
     # JSG Assert that split fractions can be written as proper fractions over 10.
     # This can be generalized in the future with some common denominator determination.
     # This will work for 80/20 train/test or 80/10/10 train/valid/test (most use cases).
@@ -555,8 +706,12 @@ def split(self,
 
 
 class MolecularWeightSplitter(Splitter):
   """
-  Class for doing data splits by molecular weight.
-  """
+  Class for doing data splits by molecular weight.
+
+  Note
+  ----
+  This class requires `rdkit` to be installed.
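+
+  Examples
+  --------
+  A minimal sketch, assuming the dataset's `ids` field holds SMILES strings
+  (outputs not shown):
+
+  >>> import numpy as np
+  >>> import deepchem as dc
+  >>> smiles = ["C", "CC", "CCC", "CCCC", "CCCCC"]
+  >>> dataset = dc.data.NumpyDataset(np.random.rand(5, 3), ids=smiles)
+  >>> splitter = dc.splits.MolecularWeightSplitter()
+  >>> train_inds, valid_inds, test_inds = splitter.split(dataset)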
+  """
 
   def split(self,
             dataset,
@@ -565,10 +720,32 @@ def split(self,
             frac_valid=.1,
             frac_test=.1,
             log_every_n=None):
+    """Splits on molecular weight.
+
+    Splits internal compounds into train/validation/test using the MW
+    calculated by SMILES string.
+
+    Parameters
+    ----------
+    dataset: dc.data.Dataset
+      Dataset to be split.
+    seed: int, optional (default None)
+      Random seed to use.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers)
+    for the various splits.
     """
-    Splits internal compounds into train/validation/test using the MW calculated
-    by SMILES string.
-    """
     np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
 
     if not seed is None:
@@ -593,11 +770,16 @@ def split(self,
 
 
 class MaxMinSplitter(Splitter):
-  """
+  """Chemical diversity splitter.
+
   Class for doing splits based on the MaxMin diversity algorithm. Intuitively,
   the test set is comprised of the most diverse compounds of the entire dataset.
   Furthermore, the validation set is comprised of diverse compounds under
   the test set.
+
+  Note
+  ----
+  This class requires `rdkit` to be installed.
   """
 
   def split(self,
@@ -667,9 +849,8 @@ def distance(i, j):
 
 
 class RandomSplitter(Splitter):
+  """Class for doing random data splits.
   """
-  Class for doing random data splits.
-  """
 
   def split(self,
             dataset,
             seed=None,
             frac_train=.8,
@@ -679,8 +860,29 @@ def split(self,
             frac_test=.1,
             log_every_n=None):
     """
-    Splits internal compounds randomly into train/validation/test.
-    """
+    Splits internal compounds randomly into train/validation/test.
+
+    Parameters
+    ----------
+    dataset: dc.data.Dataset
+      Dataset to be split.
+    seed: int, optional (default None)
+      Random seed to use.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers)
+    for the various splits.
+    """
     np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
     if not seed is None:
       np.random.seed(seed)
@@ -693,9 +895,14 @@ def split(self,
 
 
 class IndexSplitter(Splitter):
+  """Class for simple order based splits.
+
+  Use this class when the `Dataset` you have is already ordered as you would
+  like it to be processed. Then the first `frac_train` proportion is used
+  for training, the next `frac_valid` for validation, and the final
+  `frac_test` for testing. This class may make sense to use if your
+  `Dataset` is already time ordered (for example).
   """
-  Class for simple order based splits.
-  """
 
   def split(self,
            dataset,
           seed=None,
           frac_train=.8,
           frac_valid=.1,
           frac_test=.1,
           log_every_n=None):
+    """Splits internal compounds into train/validation/test in provided order.
+
+    Parameters
+    ----------
+    dataset: dc.data.Dataset
+      Dataset to be split.
+    seed: int, optional (default None)
+      Random seed to use.
+    frac_train: float, optional (default 0.8)
+      The fraction of data to be used for the training split.
+    frac_valid: float, optional (default 0.1)
+      The fraction of data to be used for the validation split.
+    frac_test: float, optional (default 0.1)
+      The fraction of data to be used for the test split.
+    log_every_n: int, optional
+      Controls the logger by dictating how often logger outputs
+      will be produced.
+
+    Returns
+    -------
+    A tuple `(train_inds, valid_inds, test_inds)` of the indices (integers)
+    for the various splits.
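+
+    Examples
+    --------
+    A minimal sketch; with the default fractions the first 8 of 10 indices
+    go to train, the next 1 to validation, and the last 1 to test:
+
+    >>> import numpy as np
+    >>> import deepchem as dc
+    >>> dataset = dc.data.NumpyDataset(np.random.rand(10, 3))
+    >>> splitter = dc.splits.IndexSplitter()
+    >>> train_inds, valid_inds, test_inds = splitter.split(dataset)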
+    """
     np.testing.assert_almost_equal(frac_train + frac_valid + frac_test, 1.)
     num_datapoints = len(dataset)
     train_cutoff = int(frac_train * num_datapoints)
@@ -717,9 +944,15 @@ def split(self,
 
 
 class IndiceSplitter(Splitter):
+  """Split data in the fashion specified by the user.
+
+  For some applications, you will already know how you'd like to split the
+  dataset. In this splitter, you simply specify `valid_indices` and
+  `test_indices` and the datapoints at those indices are pulled out of the
+  dataset. Note that this is different from `IndexSplitter` which only splits
+  based on the existing dataset ordering, while this `IndiceSplitter` can
+  split on any specified ordering.
   """
-  Class for splits based on input order.
-  """
 
   def __init__(self, valid_indices=None, test_indices=None):
     """
diff --git a/docs/index.rst b/docs/index.rst
index 0d1572cb8d..9ae6b0c53d 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -124,6 +124,7 @@ discussions about research, development or any general questions. If you'd like
    :name: mastertoc
 
    Introduction
+   Tutorial
    Installation
    Datasets
    Data Loaders
diff --git a/docs/tutorial.rst b/docs/tutorial.rst
new file mode 100644
index 0000000000..68e578775e
--- /dev/null
+++ b/docs/tutorial.rst
@@ -0,0 +1,84 @@
DeepChem Tutorial
=================

If you're new to DeepChem, you probably want to know the basics. What is DeepChem? Why should you care about using it? The short answer is that DeepChem is a scientific machine learning library. (The "Chem" indicates the historical fact that DeepChem initially focused on chemical applications, but we aim to support all types of scientific applications more broadly.)

Why would you want to use DeepChem instead of another machine learning
library? Simply put, DeepChem maintains an extensive collection of utilities
to enable scientific deep learning, including classes for loading scientific
datasets, processing them, transforming them, splitting them up, and learning
from them. Behind the scenes DeepChem uses a variety of other machine
learning frameworks such as `sklearn`_, `tensorflow`_, and `xgboost`_. We are
also experimenting with adding additional models implemented in `pytorch`_
and `jax`_. Our focus is to facilitate scientific experimentation using
whatever tools are available at hand.

In the rest of this tutorial, we'll provide a rapid-fire overview of DeepChem's API. DeepChem is a big library, so we won't cover everything, but we should give you enough to get started.

.. _`sklearn`: https://scikit-learn.org/stable/

.. _`tensorflow`: https://www.tensorflow.org/

.. _`xgboost`: https://xgboost.readthedocs.io/en/latest/

.. _`pytorch`: https://pytorch.org/

.. _`jax`: https://github.com/google/jax


Quickstart
----------
If you're new, you can install DeepChem on a new machine with the following commands:

.. code-block:: bash

    pip install tensorflow
    pip install deepchem-nightly

DeepChem is under very active development at present, so we recommend using our nightly build until our next major release. Note that to use DeepChem for chemistry applications, you will also have to install RDKit using conda:

.. code-block:: bash

    conda install -y -c rdkit -c conda-forge rdkit


Datasets
--------
The :code:`dc.data` module contains utilities to handle :code:`Dataset`
objects. These :code:`Dataset` objects are the heart of DeepChem. A
:code:`Dataset` is an abstraction of a dataset in machine learning. That is,
a collection of features, labels, and weights, alongside associated
identifiers. Rather than explaining further, we'll just show you.

.. doctest::

   >>> import deepchem as dc
   >>> import numpy as np
   >>> N_samples = 50
   >>> n_features = 10
   >>> X = np.random.rand(N_samples, n_features)
   >>> y = np.random.rand(N_samples)
   >>> dataset = dc.data.NumpyDataset(X, y)
   >>> dataset.X.shape
   (50, 10)
   >>> dataset.y.shape
   (50,)

Here we've used the :code:`NumpyDataset` class, which stores datasets in memory. This works fine for smaller datasets and is very convenient for experimentation, but is less convenient for larger datasets. For that, we have the :code:`DiskDataset` class.

.. doctest::

   >>> dataset = dc.data.DiskDataset.from_numpy(X, y)
   >>> dataset.X.shape
   (50, 10)
   >>> dataset.y.shape
   (50,)

In this example we haven't specified a data directory, so this :code:`DiskDataset` is written to a temporary folder. Note that :code:`dataset.X` and :code:`dataset.y` load data from disk under the hood! So this can get very expensive for larger datasets.
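
Splitters
---------
The :code:`dc.splits` module contains :code:`Splitter` classes that break a :code:`Dataset` up into train/validation/test pieces. As a minimal, illustrative sketch, we reuse the :code:`DiskDataset` created above and assume the default 80/10/10 split fractions:

.. doctest::

   >>> splitter = dc.splits.RandomSplitter()
   >>> train, valid, test = splitter.train_valid_test_split(dataset)
   >>> len(train), len(valid), len(test)
   (40, 5, 5)

Other splitters such as :code:`IndexSplitter` or :code:`RandomGroupSplitter` follow the same API but choose the indices differently.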

More Tutorials
--------------
DeepChem maintains an extensive collection of additional `tutorials`_ that are meant to be run on Google `colab`_, an online platform that allows you to execute Jupyter notebooks. Once you've finished this introductory tutorial, we recommend working through these more involved tutorials.

.. _`tutorials`: https://github.com/deepchem/deepchem/tree/master/examples/tutorials

.. _`colab`: https://colab.research.google.com/