Merge f3a9896 into 5202bd7

deepchem · Jul 8, 2020 · 4441241 · 4441241
2 parents 5202bd7 + f3a9896
commit 4441241
Show file tree

Hide file tree

Showing 3 changed files with 176 additions and 137 deletions.
diff --git a/deepchem/data/datasets.py b/deepchem/data/datasets.py
@@ -1321,23 +1321,23 @@ def transform(self, fn, **args):
 
     >> newx, newy, neww = fn(x, y, w)
 
-    It might be called only once with the whole dataset, or multiple times with different
-    subsets of the data.  Each time it is called, it should transform the samples and return
-    the transformed data.
+    It might be called only once with the whole dataset, or multiple times
+    with different subsets of the data.  Each time it is called, it should
+    transform the samples and return the transformed data.
 
     Parameters
     ----------
     fn: function
       A function to apply to each sample in the dataset
     out_dir: string
-      The directory to save the new dataset in.  If this is omitted, a temporary directory
-      is created automatically
+      The directory to save the new dataset in.  If this is omitted or
+      None, a temporary directory is created automatically.
 
     Returns
     -------
     a newly constructed Dataset object
     """
-    if 'out_dir' in args:
+    if 'out_dir' in args and args['out_dir'] is not None:
       out_dir = args['out_dir']
     else:
       out_dir = tempfile.mkdtemp()

diff --git a/deepchem/trans/tests/test_balancing.py b/deepchem/trans/tests/test_balancing.py
@@ -1,148 +1,181 @@
+import os
 import numpy as np
 import unittest
 import deepchem as dc
 import itertools
-import os
+import tempfile
+
+
+def test_binary_1d():
+  """Test balancing transformer on single-task dataset without explicit task dimension."""
+  n_samples = 20
+  n_features = 3
+  n_classes = 2
+  np.random.seed(123)
+  ids = np.arange(n_samples)
+  X = np.random.rand(n_samples, n_features)
+  y = np.random.randint(n_classes, size=(n_samples,))
+  w = np.ones((n_samples,))
+  dataset = dc.data.NumpyDataset(X, y, w)
 
+  balancing_transformer = dc.trans.BalancingTransformer(
+      transform_w=True, dataset=dataset)
+  dataset = balancing_transformer.transform(dataset)
+  X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
+  # Check ids are unchanged.
+  for id_elt, id_t_elt in zip(ids, ids_t):
+    assert id_elt == id_t_elt
+  # Check X is unchanged since this is a w transformer
+  np.testing.assert_allclose(X, X_t)
+  # Check y is unchanged since this is a w transformer
+  np.testing.assert_allclose(y, y_t)
+  y_task = y_t
+  w_task = w_t
+  w_orig_task = w
+  # Assert that entries with zero weight retain zero weight
+  np.testing.assert_allclose(w_task[w_orig_task == 0],
+                             np.zeros_like(w_task[w_orig_task == 0]))
+  # Check that the total weight of class 0 equals the total weight of class 1
+  assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
 
-class TestBalancingTransformer(unittest.TestCase):
-  """
-  Test BalancingTransformer functionality. 
-  """
 
-  def test_binary_1d(self):
-    """Test balancing transformer on single-task dataset without explicit task dimension."""
-    n_samples = 20
-    n_features = 3
-    n_classes = 2
-    np.random.seed(123)
-    ids = np.arange(n_samples)
-    X = np.random.rand(n_samples, n_features)
-    y = np.random.randint(n_classes, size=(n_samples,))
-    w = np.ones((n_samples,))
-    dataset = dc.data.NumpyDataset(X, y, w)
+def test_binary_singletask():
+  """Test balancing transformer on single-task dataset."""
+  n_samples = 20
+  n_features = 3
+  n_tasks = 1
+  n_classes = 2
+  np.random.seed(123)
+  ids = np.arange(n_samples)
+  X = np.random.rand(n_samples, n_features)
+  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
+  w = np.ones((n_samples, n_tasks))
+  dataset = dc.data.NumpyDataset(X, y, w)
 
-    balancing_transformer = dc.trans.BalancingTransformer(
-        transform_w=True, dataset=dataset)
-    dataset = balancing_transformer.transform(dataset)
-    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
-    # Check ids are unchanged.
-    for id_elt, id_t_elt in zip(ids, ids_t):
-      assert id_elt == id_t_elt
-    # Check X is unchanged since this is a w transformer
-    np.testing.assert_allclose(X, X_t)
-    # Check y is unchanged since this is a w transformer
-    np.testing.assert_allclose(y, y_t)
-    y_task = y_t
-    w_task = w_t
-    w_orig_task = w
+  balancing_transformer = dc.trans.BalancingTransformer(
+      transform_w=True, dataset=dataset)
+  dataset = balancing_transformer.transform(dataset)
+  X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
+  # Check ids are unchanged.
+  for id_elt, id_t_elt in zip(ids, ids_t):
+    assert id_elt == id_t_elt
+  # Check X is unchanged since this is a w transformer
+  np.testing.assert_allclose(X, X_t)
+  # Check y is unchanged since this is a w transformer
+  np.testing.assert_allclose(y, y_t)
+  for ind, task in enumerate(dataset.get_task_names()):
+    y_task = y_t[:, ind]
+    w_task = w_t[:, ind]
+    w_orig_task = w[:, ind]
     # Assert that entries with zero weight retain zero weight
     np.testing.assert_allclose(w_task[w_orig_task == 0],
                                np.zeros_like(w_task[w_orig_task == 0]))
     # Check that sum of 0s equals sum of 1s in transformed for each task
     assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
 
-  def test_binary_singletask(self):
-    """Test balancing transformer on single-task dataset."""
-    n_samples = 20
-    n_features = 3
-    n_tasks = 1
-    n_classes = 2
-    np.random.seed(123)
-    ids = np.arange(n_samples)
-    X = np.random.rand(n_samples, n_features)
-    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
-    w = np.ones((n_samples, n_tasks))
-    dataset = dc.data.NumpyDataset(X, y, w)
 
-    balancing_transformer = dc.trans.BalancingTransformer(
-        transform_w=True, dataset=dataset)
-    dataset = balancing_transformer.transform(dataset)
-    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
-    # Check ids are unchanged.
-    for id_elt, id_t_elt in zip(ids, ids_t):
-      assert id_elt == id_t_elt
-    # Check X is unchanged since this is a w transformer
-    np.testing.assert_allclose(X, X_t)
-    # Check y is unchanged since this is a w transformer
-    np.testing.assert_allclose(y, y_t)
-    for ind, task in enumerate(dataset.get_task_names()):
-      y_task = y_t[:, ind]
-      w_task = w_t[:, ind]
-      w_orig_task = w[:, ind]
-      # Assert that entries with zero weight retain zero weight
-      np.testing.assert_allclose(w_task[w_orig_task == 0],
-                                 np.zeros_like(w_task[w_orig_task == 0]))
-      # Check that sum of 0s equals sum of 1s in transformed for each task
-      assert np.isclose(
-          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
+def test_binary_multitask():
+  """Test balancing transformer on multitask dataset."""
+  n_samples = 10
+  n_features = 3
+  n_tasks = 5
+  n_classes = 2
+  ids = np.arange(n_samples)
+  X = np.random.rand(n_samples, n_features)
+  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
+  w = np.ones((n_samples, n_tasks))
+  multitask_dataset = dc.data.NumpyDataset(X, y, w)
+  balancing_transformer = dc.trans.BalancingTransformer(
+      transform_w=True, dataset=multitask_dataset)
+  #X, y, w, ids = (multitask_dataset.X, multitask_dataset.y,
+  #                multitask_dataset.w, multitask_dataset.ids)
+  multitask_dataset = balancing_transformer.transform(multitask_dataset)
+  X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
+                          multitask_dataset.w, multitask_dataset.ids)
+  # Check ids are unchanged.
+  for id_elt, id_t_elt in zip(ids, ids_t):
+    assert id_elt == id_t_elt
+  # Check X is unchanged since this is a w transformer
+  np.testing.assert_allclose(X, X_t)
+  # Check y is unchanged since this is a w transformer
+  np.testing.assert_allclose(y, y_t)
+  for ind, task in enumerate(multitask_dataset.get_task_names()):
+    y_task = y_t[:, ind]
+    w_task = w_t[:, ind]
+    w_orig_task = w[:, ind]
+    # Assert that entries with zero weight retain zero weight
+    np.testing.assert_allclose(w_task[w_orig_task == 0],
+                               np.zeros_like(w_task[w_orig_task == 0]))
+    # Check that sum of 0s equals sum of 1s in transformed for each task
+    assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
+
 
-  def test_binary_multitask(self):
-    """Test balancing transformer on multitask dataset."""
-    n_samples = 10
-    n_features = 3
-    n_tasks = 5
-    n_classes = 2
-    ids = np.arange(n_samples)
-    X = np.random.rand(n_samples, n_features)
-    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
-    w = np.ones((n_samples, n_tasks))
-    multitask_dataset = dc.data.NumpyDataset(X, y, w)
-    balancing_transformer = dc.trans.BalancingTransformer(
-        transform_w=True, dataset=multitask_dataset)
-    #X, y, w, ids = (multitask_dataset.X, multitask_dataset.y,
-    #                multitask_dataset.w, multitask_dataset.ids)
-    multitask_dataset = balancing_transformer.transform(multitask_dataset)
-    X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
-                            multitask_dataset.w, multitask_dataset.ids)
-    # Check ids are unchanged.
-    for id_elt, id_t_elt in zip(ids, ids_t):
-      assert id_elt == id_t_elt
-    # Check X is unchanged since this is a w transformer
-    np.testing.assert_allclose(X, X_t)
-    # Check y is unchanged since this is a w transformer
-    np.testing.assert_allclose(y, y_t)
-    for ind, task in enumerate(multitask_dataset.get_task_names()):
-      y_task = y_t[:, ind]
-      w_task = w_t[:, ind]
-      w_orig_task = w[:, ind]
-      # Assert that entries with zero weight retain zero weight
-      np.testing.assert_allclose(w_task[w_orig_task == 0],
-                                 np.zeros_like(w_task[w_orig_task == 0]))
-      # Check that sum of 0s equals sum of 1s in transformed for each task
+def test_multiclass_singletask():
+  """Test balancing transformer on a multiclass single-task dataset."""
+  n_samples = 50
+  n_features = 3
+  n_tasks = 1
+  n_classes = 5
+  ids = np.arange(n_samples)
+  X = np.random.rand(n_samples, n_features)
+  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
+  w = np.ones((n_samples, n_tasks))
+  dataset = dc.data.NumpyDataset(X, y, w)
+
+  balancing_transformer = dc.trans.BalancingTransformer(
+      transform_w=True, dataset=dataset)
+  dataset = balancing_transformer.transform(dataset)
+  X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
+  # Check ids are unchanged.
+  for id_elt, id_t_elt in zip(ids, ids_t):
+    assert id_elt == id_t_elt
+  # Check X is unchanged since this is a w transformer
+  np.testing.assert_allclose(X, X_t)
+  # Check y is unchanged since this is a w transformer
+  np.testing.assert_allclose(y, y_t)
+  for ind, task in enumerate(dataset.get_task_names()):
+    y_task = y_t[:, ind]
+    w_task = w_t[:, ind]
+    w_orig_task = w[:, ind]
+    # Check that every pair of distinct classes has the same total weight
+    for i, j in itertools.product(range(n_classes), range(n_classes)):
+      if i == j:
+        continue
       assert np.isclose(
-          np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
+          np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))
+
 
-  def test_multiclass_singletask(self):
-    """Test balancing transformer on single-task dataset."""
-    n_samples = 50
-    n_features = 3
-    n_tasks = 1
-    n_classes = 5
-    ids = np.arange(n_samples)
-    X = np.random.rand(n_samples, n_features)
-    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
-    w = np.ones((n_samples, n_tasks))
-    dataset = dc.data.NumpyDataset(X, y, w)
+def test_transform_to_directory():
+  """Test that output can be written to a directory."""
+  n_samples = 20
+  n_features = 3
+  n_classes = 2
+  np.random.seed(123)
+  ids = np.arange(n_samples)
+  X = np.random.rand(n_samples, n_features)
+  y = np.random.randint(n_classes, size=(n_samples,))
+  w = np.ones((n_samples,))
+  dataset = dc.data.NumpyDataset(X, y, w)
 
-    balancing_transformer = dc.trans.BalancingTransformer(
-        transform_w=True, dataset=dataset)
-    dataset = balancing_transformer.transform(dataset)
-    X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
-    # Check ids are unchanged.
-    for id_elt, id_t_elt in zip(ids, ids_t):
-      assert id_elt == id_t_elt
-    # Check X is unchanged since this is a w transformer
-    np.testing.assert_allclose(X, X_t)
-    # Check y is unchanged since this is a w transformer
-    np.testing.assert_allclose(y, y_t)
-    for ind, task in enumerate(dataset.get_task_names()):
-      y_task = y_t[:, ind]
-      w_task = w_t[:, ind]
-      w_orig_task = w[:, ind]
-      # Check that sum of 0s equals sum of 1s in transformed for each task
-      for i, j in itertools.product(range(n_classes), range(n_classes)):
-        if i == j:
-          continue
-        assert np.isclose(
-            np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))
+  balancing_transformer = dc.trans.BalancingTransformer(
+      transform_w=True, dataset=dataset)
+  with tempfile.TemporaryDirectory() as tmpdirname:
+    dataset = balancing_transformer.transform(dataset, out_dir=tmpdirname)
+    balanced_dataset = dc.data.DiskDataset(tmpdirname)
+    X_t, y_t, w_t, ids_t = (balanced_dataset.X, balanced_dataset.y,
+                            balanced_dataset.w, balanced_dataset.ids)
+  # Check ids are unchanged.
+  for id_elt, id_t_elt in zip(ids, ids_t):
+    assert id_elt == id_t_elt
+  # Check X is unchanged since this is a w transformer
+  np.testing.assert_allclose(X, X_t)
+  # Check y is unchanged since this is a w transformer
+  np.testing.assert_allclose(y, y_t)
+  y_task = y_t
+  w_task = w_t
+  w_orig_task = w
+  # Assert that entries with zero weight retain zero weight
+  np.testing.assert_allclose(w_task[w_orig_task == 0],
+                             np.zeros_like(w_task[w_orig_task == 0]))
+  # Check that the total weight of class 0 equals the total weight of class 1
+  assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
diff --git a/deepchem/trans/transformers.py b/deepchem/trans/transformers.py
@@ -153,7 +153,7 @@ def untransform(self, z):
     raise NotImplementedError(
         "Each Transformer is responsible for its own untransform method.")
 
-  def transform(self, dataset, parallel=False, **kwargs):
+  def transform(self, dataset, parallel=False, out_dir=None, **kwargs):
     """Transforms all internally stored data in dataset.
 
     This method transforms all internal data in the provided dataset by using
@@ -175,12 +175,18 @@ def transform(self, dataset, parallel=False, **kwargs):
     -------
     a newly constructed Dataset object
     """
+    # Convert non-DiskDataset inputs to a DiskDataset so out_dir can be used
+    if out_dir is not None:
+      if not isinstance(dataset, dc.data.DiskDataset):
+        dataset = dc.data.DiskDataset.from_numpy(dataset.X, dataset.y,
+                                                 dataset.w, dataset.ids)
     _, y_shape, w_shape, _ = dataset.get_shape()
     if y_shape == tuple() and self.transform_y:
       raise ValueError("Cannot transform y when y_values are not present")
     if w_shape == tuple() and self.transform_w:
       raise ValueError("Cannot transform w when w_values are not present")
-    return dataset.transform(lambda X, y, w: self.transform_array(X, y, w))
+    return dataset.transform(
+        lambda X, y, w: self.transform_array(X, y, w), out_dir=out_dir)
 
   def transform_on_array(self, X, y, w):
     """Transforms numpy arrays X, y, and w