From f3a9896fc83a184d2e6be4b7d1c0b3276424b7bb Mon Sep 17 00:00:00 2001 From: Bharath Ramsundar Date: Wed, 8 Jul 2020 16:20:13 -0700 Subject: [PATCH] Changes --- deepchem/data/datasets.py | 12 +- deepchem/trans/tests/test_balancing.py | 291 ++++++++++++++----------- deepchem/trans/transformers.py | 10 +- 3 files changed, 176 insertions(+), 137 deletions(-) diff --git a/deepchem/data/datasets.py b/deepchem/data/datasets.py index a69a6f5a2b..619df1159f 100644 --- a/deepchem/data/datasets.py +++ b/deepchem/data/datasets.py @@ -1321,23 +1321,23 @@ def transform(self, fn, **args): >> newx, newy, neww = fn(x, y, w) - It might be called only once with the whole dataset, or multiple times with different - subsets of the data. Each time it is called, it should transform the samples and return - the transformed data. + It might be called only once with the whole dataset, or multiple times + with different subsets of the data. Each time it is called, it should + transform the samples and return the transformed data. Parameters ---------- fn: function A function to apply to each sample in the dataset out_dir: string - The directory to save the new dataset in. If this is omitted, a temporary directory - is created automatically + The directory to save the new dataset in. If this is omitted, a + temporary directory is created automatically Returns ------- a newly constructed Dataset object """ - if 'out_dir' in args: + if 'out_dir' in args and args['out_dir'] is not None: out_dir = args['out_dir'] else: out_dir = tempfile.mkdtemp() diff --git a/deepchem/trans/tests/test_balancing.py b/deepchem/trans/tests/test_balancing.py index a82feab141..6cb81620fc 100644 --- a/deepchem/trans/tests/test_balancing.py +++ b/deepchem/trans/tests/test_balancing.py @@ -1,148 +1,181 @@ +import os import numpy as np import unittest import deepchem as dc import itertools -import os +import tempfile + + +def test_binary_1d(): + """Test balancing transformer on single-task dataset without explicit task dimension.""" + n_samples = 20 + n_features = 3 + n_classes = 2 + np.random.seed(123) + ids = np.arange(n_samples) + X = np.random.rand(n_samples, n_features) + y = np.random.randint(n_classes, size=(n_samples,)) + w = np.ones((n_samples,)) + dataset = dc.data.NumpyDataset(X, y, w) + balancing_transformer = dc.trans.BalancingTransformer( + transform_w=True, dataset=dataset) + dataset = balancing_transformer.transform(dataset) + X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) + # Check ids are unchanged. + for id_elt, id_t_elt in zip(ids, ids_t): + assert id_elt == id_t_elt + # Check X is unchanged since this is a w transformer + np.testing.assert_allclose(X, X_t) + # Check y is unchanged since this is a w transformer + np.testing.assert_allclose(y, y_t) + y_task = y_t + w_task = w_t + w_orig_task = w + # Assert that entries with zero weight retain zero weight + np.testing.assert_allclose(w_task[w_orig_task == 0], + np.zeros_like(w_task[w_orig_task == 0])) + # Check that sum of 0s equals sum of 1s in transformed for each task + assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) -class TestBalancingTransformer(unittest.TestCase): - """ - Test BalancingTransformer functionality. - """ - def test_binary_1d(self): - """Test balancing transformer on single-task dataset without explicit task dimension.""" - n_samples = 20 - n_features = 3 - n_classes = 2 - np.random.seed(123) - ids = np.arange(n_samples) - X = np.random.rand(n_samples, n_features) - y = np.random.randint(n_classes, size=(n_samples,)) - w = np.ones((n_samples,)) - dataset = dc.data.NumpyDataset(X, y, w) +def test_binary_singletask(): + """Test balancing transformer on single-task dataset.""" + n_samples = 20 + n_features = 3 + n_tasks = 1 + n_classes = 2 + np.random.seed(123) + ids = np.arange(n_samples) + X = np.random.rand(n_samples, n_features) + y = np.random.randint(n_classes, size=(n_samples, n_tasks)) + w = np.ones((n_samples, n_tasks)) + dataset = dc.data.NumpyDataset(X, y, w) - balancing_transformer = dc.trans.BalancingTransformer( - transform_w=True, dataset=dataset) - dataset = balancing_transformer.transform(dataset) - X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) - # Check ids are unchanged. - for id_elt, id_t_elt in zip(ids, ids_t): - assert id_elt == id_t_elt - # Check X is unchanged since this is a w transformer - np.testing.assert_allclose(X, X_t) - # Check y is unchanged since this is a w transformer - np.testing.assert_allclose(y, y_t) - y_task = y_t - w_task = w_t - w_orig_task = w + balancing_transformer = dc.trans.BalancingTransformer( + transform_w=True, dataset=dataset) + dataset = balancing_transformer.transform(dataset) + X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) + # Check ids are unchanged. + for id_elt, id_t_elt in zip(ids, ids_t): + assert id_elt == id_t_elt + # Check X is unchanged since this is a w transformer + np.testing.assert_allclose(X, X_t) + # Check y is unchanged since this is a w transformer + np.testing.assert_allclose(y, y_t) + for ind, task in enumerate(dataset.get_task_names()): + y_task = y_t[:, ind] + w_task = w_t[:, ind] + w_orig_task = w[:, ind] # Assert that entries with zero weight retain zero weight np.testing.assert_allclose(w_task[w_orig_task == 0], np.zeros_like(w_task[w_orig_task == 0])) # Check that sum of 0s equals sum of 1s in transformed for each task assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) - def test_binary_singletask(self): - """Test balancing transformer on single-task dataset.""" - n_samples = 20 - n_features = 3 - n_tasks = 1 - n_classes = 2 - np.random.seed(123) - ids = np.arange(n_samples) - X = np.random.rand(n_samples, n_features) - y = np.random.randint(n_classes, size=(n_samples, n_tasks)) - w = np.ones((n_samples, n_tasks)) - dataset = dc.data.NumpyDataset(X, y, w) - balancing_transformer = dc.trans.BalancingTransformer( - transform_w=True, dataset=dataset) - dataset = balancing_transformer.transform(dataset) - X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) - # Check ids are unchanged. - for id_elt, id_t_elt in zip(ids, ids_t): - assert id_elt == id_t_elt - # Check X is unchanged since this is a w transformer - np.testing.assert_allclose(X, X_t) - # Check y is unchanged since this is a w transformer - np.testing.assert_allclose(y, y_t) - for ind, task in enumerate(dataset.get_task_names()): - y_task = y_t[:, ind] - w_task = w_t[:, ind] - w_orig_task = w[:, ind] - # Assert that entries with zero weight retain zero weight - np.testing.assert_allclose(w_task[w_orig_task == 0], - np.zeros_like(w_task[w_orig_task == 0])) - # Check that sum of 0s equals sum of 1s in transformed for each task - assert np.isclose( - np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) +def test_binary_multitask(): + """Test balancing transformer on multitask dataset.""" + n_samples = 10 + n_features = 3 + n_tasks = 5 + n_classes = 2 + ids = np.arange(n_samples) + X = np.random.rand(n_samples, n_features) + y = np.random.randint(n_classes, size=(n_samples, n_tasks)) + w = np.ones((n_samples, n_tasks)) + multitask_dataset = dc.data.NumpyDataset(X, y, w) + balancing_transformer = dc.trans.BalancingTransformer( + transform_w=True, dataset=multitask_dataset) + #X, y, w, ids = (multitask_dataset.X, multitask_dataset.y, + # multitask_dataset.w, multitask_dataset.ids) + multitask_dataset = balancing_transformer.transform(multitask_dataset) + X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y, + multitask_dataset.w, multitask_dataset.ids) + # Check ids are unchanged. + for id_elt, id_t_elt in zip(ids, ids_t): + assert id_elt == id_t_elt + # Check X is unchanged since this is a w transformer + np.testing.assert_allclose(X, X_t) + # Check y is unchanged since this is a w transformer + np.testing.assert_allclose(y, y_t) + for ind, task in enumerate(multitask_dataset.get_task_names()): + y_task = y_t[:, ind] + w_task = w_t[:, ind] + w_orig_task = w[:, ind] + # Assert that entries with zero weight retain zero weight + np.testing.assert_allclose(w_task[w_orig_task == 0], + np.zeros_like(w_task[w_orig_task == 0])) + # Check that sum of 0s equals sum of 1s in transformed for each task + assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) + - def test_binary_multitask(self): - """Test balancing transformer on multitask dataset.""" - n_samples = 10 - n_features = 3 - n_tasks = 5 - n_classes = 2 - ids = np.arange(n_samples) - X = np.random.rand(n_samples, n_features) - y = np.random.randint(n_classes, size=(n_samples, n_tasks)) - w = np.ones((n_samples, n_tasks)) - multitask_dataset = dc.data.NumpyDataset(X, y, w) - balancing_transformer = dc.trans.BalancingTransformer( - transform_w=True, dataset=multitask_dataset) - #X, y, w, ids = (multitask_dataset.X, multitask_dataset.y, - # multitask_dataset.w, multitask_dataset.ids) - multitask_dataset = balancing_transformer.transform(multitask_dataset) - X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y, - multitask_dataset.w, multitask_dataset.ids) - # Check ids are unchanged. - for id_elt, id_t_elt in zip(ids, ids_t): - assert id_elt == id_t_elt - # Check X is unchanged since this is a w transformer - np.testing.assert_allclose(X, X_t) - # Check y is unchanged since this is a w transformer - np.testing.assert_allclose(y, y_t) - for ind, task in enumerate(multitask_dataset.get_task_names()): - y_task = y_t[:, ind] - w_task = w_t[:, ind] - w_orig_task = w[:, ind] - # Assert that entries with zero weight retain zero weight - np.testing.assert_allclose(w_task[w_orig_task == 0], - np.zeros_like(w_task[w_orig_task == 0])) - # Check that sum of 0s equals sum of 1s in transformed for each task +def test_multiclass_singletask(): + """Test balancing transformer on single-task dataset.""" + n_samples = 50 + n_features = 3 + n_tasks = 1 + n_classes = 5 + ids = np.arange(n_samples) + X = np.random.rand(n_samples, n_features) + y = np.random.randint(n_classes, size=(n_samples, n_tasks)) + w = np.ones((n_samples, n_tasks)) + dataset = dc.data.NumpyDataset(X, y, w) + + balancing_transformer = dc.trans.BalancingTransformer( + transform_w=True, dataset=dataset) + dataset = balancing_transformer.transform(dataset) + X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) + # Check ids are unchanged. + for id_elt, id_t_elt in zip(ids, ids_t): + assert id_elt == id_t_elt + # Check X is unchanged since this is a w transformer + np.testing.assert_allclose(X, X_t) + # Check y is unchanged since this is a w transformer + np.testing.assert_allclose(y, y_t) + for ind, task in enumerate(dataset.get_task_names()): + y_task = y_t[:, ind] + w_task = w_t[:, ind] + w_orig_task = w[:, ind] + # Check that sum of 0s equals sum of 1s in transformed for each task + for i, j in itertools.product(range(n_classes), range(n_classes)): + if i == j: + continue assert np.isclose( - np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) + np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j])) + - def test_multiclass_singletask(self): - """Test balancing transformer on single-task dataset.""" - n_samples = 50 - n_features = 3 - n_tasks = 1 - n_classes = 5 - ids = np.arange(n_samples) - X = np.random.rand(n_samples, n_features) - y = np.random.randint(n_classes, size=(n_samples, n_tasks)) - w = np.ones((n_samples, n_tasks)) - dataset = dc.data.NumpyDataset(X, y, w) +def test_transform_to_directory(): + """Test that output can be written to a directory.""" + n_samples = 20 + n_features = 3 + n_classes = 2 + np.random.seed(123) + ids = np.arange(n_samples) + X = np.random.rand(n_samples, n_features) + y = np.random.randint(n_classes, size=(n_samples,)) + w = np.ones((n_samples,)) + dataset = dc.data.NumpyDataset(X, y, w) - balancing_transformer = dc.trans.BalancingTransformer( - transform_w=True, dataset=dataset) - dataset = balancing_transformer.transform(dataset) - X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) - # Check ids are unchanged. - for id_elt, id_t_elt in zip(ids, ids_t): - assert id_elt == id_t_elt - # Check X is unchanged since this is a w transformer - np.testing.assert_allclose(X, X_t) - # Check y is unchanged since this is a w transformer - np.testing.assert_allclose(y, y_t) - for ind, task in enumerate(dataset.get_task_names()): - y_task = y_t[:, ind] - w_task = w_t[:, ind] - w_orig_task = w[:, ind] - # Check that sum of 0s equals sum of 1s in transformed for each task - for i, j in itertools.product(range(n_classes), range(n_classes)): - if i == j: - continue - assert np.isclose( - np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j])) + balancing_transformer = dc.trans.BalancingTransformer( + transform_w=True, dataset=dataset) + with tempfile.TemporaryDirectory() as tmpdirname: + dataset = balancing_transformer.transform(dataset, out_dir=tmpdirname) + balanced_dataset = dc.data.DiskDataset(tmpdirname) + X_t, y_t, w_t, ids_t = (balanced_dataset.X, balanced_dataset.y, + balanced_dataset.w, balanced_dataset.ids) + # Check ids are unchanged. + for id_elt, id_t_elt in zip(ids, ids_t): + assert id_elt == id_t_elt + # Check X is unchanged since this is a w transformer + np.testing.assert_allclose(X, X_t) + # Check y is unchanged since this is a w transformer + np.testing.assert_allclose(y, y_t) + y_task = y_t + w_task = w_t + w_orig_task = w + # Assert that entries with zero weight retain zero weight + np.testing.assert_allclose(w_task[w_orig_task == 0], + np.zeros_like(w_task[w_orig_task == 0])) + # Check that sum of 0s equals sum of 1s in transformed for each task + assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) diff --git a/deepchem/trans/transformers.py b/deepchem/trans/transformers.py index ccaada015d..135eb41b76 100644 --- a/deepchem/trans/transformers.py +++ b/deepchem/trans/transformers.py @@ -153,7 +153,7 @@ def untransform(self, z): raise NotImplementedError( "Each Transformer is responsible for its own untransform method.") - def transform(self, dataset, parallel=False, **kwargs): + def transform(self, dataset, parallel=False, out_dir=None, **kwargs): """Transforms all internally stored data in dataset. This method transforms all internal data in the provided dataset by using @@ -175,12 +175,18 @@ def transform(self, dataset, parallel=False, **kwargs): ------- a newly constructed Dataset object """ + # Add this case in to handle non-DiskDataset that should be written to disk + if out_dir is not None: + if not isinstance(dataset, dc.data.DiskDataset): + dataset = dc.data.DiskDataset.from_numpy(dataset.X, dataset.y, + dataset.w, dataset.ids) _, y_shape, w_shape, _ = dataset.get_shape() if y_shape == tuple() and self.transform_y: raise ValueError("Cannot transform y when y_values are not present") if w_shape == tuple() and self.transform_w: raise ValueError("Cannot transform w when w_values are not present") - return dataset.transform(lambda X, y, w: self.transform_array(X, y, w)) + return dataset.transform( + lambda X, y, w: self.transform_array(X, y, w), out_dir=out_dir) def transform_on_array(self, X, y, w): """Transforms numpy arrays X, y, and w