Skip to content

Commit

Permalink
Merge f3a9896 into 5202bd7
Browse files Browse the repository at this point in the history
  • Loading branch information
rbharath committed Jul 8, 2020
2 parents 5202bd7 + f3a9896 commit 4441241
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 137 deletions.
12 changes: 6 additions & 6 deletions deepchem/data/datasets.py
Expand Up @@ -1321,23 +1321,23 @@ def transform(self, fn, **args):
>> newx, newy, neww = fn(x, y, w)
It might be called only once with the whole dataset, or multiple times with different
subsets of the data. Each time it is called, it should transform the samples and return
the transformed data.
It might be called only once with the whole dataset, or multiple times
with different subsets of the data. Each time it is called, it should
transform the samples and return the transformed data.
Parameters
----------
fn: function
A function to apply to each sample in the dataset
out_dir: string
The directory to save the new dataset in. If this is omitted, a temporary directory
is created automatically
The directory to save the new dataset in. If this is omitted, a
temporary directory is created automatically
Returns
-------
a newly constructed Dataset object
"""
if 'out_dir' in args:
if 'out_dir' in args and args['out_dir'] is not None:
out_dir = args['out_dir']
else:
out_dir = tempfile.mkdtemp()
Expand Down
291 changes: 162 additions & 129 deletions deepchem/trans/tests/test_balancing.py
@@ -1,148 +1,181 @@
import os
import numpy as np
import unittest
import deepchem as dc
import itertools
import os
import tempfile


def test_binary_1d():
  """Check BalancingTransformer on labels and weights without a task axis.

  Builds a seeded random binary dataset whose `y` and `w` have shape
  `(n_samples,)`, applies a weight-only balancing transform, and verifies
  that ids, X and y come back unchanged while the transformed weights of
  the two classes sum to the same value.
  """
  n_samples, n_features, n_classes = 20, 3, 2
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples,))
  w = np.ones((n_samples,))
  original = dc.data.NumpyDataset(X, y, w)

  transformer = dc.trans.BalancingTransformer(
      transform_w=True, dataset=original)
  balanced = transformer.transform(original)
  X_t, y_t, w_t, ids_t = balanced.X, balanced.y, balanced.w, balanced.ids

  # The transform must not reorder or relabel samples.
  assert all(expected == actual for expected, actual in zip(ids, ids_t))
  # Only w is transformed, so features and labels must come back unchanged.
  np.testing.assert_allclose(X, X_t)
  np.testing.assert_allclose(y, y_t)

  # Samples that started with zero weight must still have zero weight.
  zero_weight = w == 0
  np.testing.assert_allclose(w_t[zero_weight], np.zeros_like(w_t[zero_weight]))
  # Balanced means both classes carry the same total weight.
  negative_total = np.sum(w_t[y_t == 0])
  positive_total = np.sum(w_t[y_t == 1])
  assert np.isclose(negative_total, positive_total)

class TestBalancingTransformer(unittest.TestCase):
"""
Test BalancingTransformer functionality.
"""

def test_binary_1d(self):
"""Test balancing transformer on single-task dataset without explicit task dimension."""
n_samples = 20
n_features = 3
n_classes = 2
np.random.seed(123)
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples,))
w = np.ones((n_samples,))
dataset = dc.data.NumpyDataset(X, y, w)
def test_binary_singletask():
"""Test balancing transformer on single-task dataset."""
n_samples = 20
n_features = 3
n_tasks = 1
n_classes = 2
np.random.seed(123)
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples, n_tasks))
w = np.ones((n_samples, n_tasks))
dataset = dc.data.NumpyDataset(X, y, w)

balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=dataset)
dataset = balancing_transformer.transform(dataset)
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
y_task = y_t
w_task = w_t
w_orig_task = w
balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=dataset)
dataset = balancing_transformer.transform(dataset)
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
for ind, task in enumerate(dataset.get_task_names()):
y_task = y_t[:, ind]
w_task = w_t[:, ind]
w_orig_task = w[:, ind]
# Assert that entries with zero weight retain zero weight
np.testing.assert_allclose(w_task[w_orig_task == 0],
np.zeros_like(w_task[w_orig_task == 0]))
# Check that sum of 0s equals sum of 1s in transformed for each task
assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))

def test_binary_singletask(self):
"""Test balancing transformer on single-task dataset."""
n_samples = 20
n_features = 3
n_tasks = 1
n_classes = 2
np.random.seed(123)
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples, n_tasks))
w = np.ones((n_samples, n_tasks))
dataset = dc.data.NumpyDataset(X, y, w)

balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=dataset)
dataset = balancing_transformer.transform(dataset)
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
for ind, task in enumerate(dataset.get_task_names()):
y_task = y_t[:, ind]
w_task = w_t[:, ind]
w_orig_task = w[:, ind]
# Assert that entries with zero weight retain zero weight
np.testing.assert_allclose(w_task[w_orig_task == 0],
np.zeros_like(w_task[w_orig_task == 0]))
# Check that sum of 0s equals sum of 1s in transformed for each task
assert np.isclose(
np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
def test_binary_multitask():
  """Test balancing transformer on multitask dataset.

  Builds a seeded random binary dataset with `n_tasks` label columns,
  applies a weight-only balancing transform, and verifies for every task
  that ids, X and y are unchanged, zero weights stay zero, and the two
  classes end up with the same total weight.
  """
  n_samples = 10
  n_features = 3
  n_tasks = 5
  n_classes = 2
  # Seed the generator, as the sibling tests do, so that the data (and any
  # failure) is reproducible from run to run.
  np.random.seed(123)
  ids = np.arange(n_samples)
  X = np.random.rand(n_samples, n_features)
  y = np.random.randint(n_classes, size=(n_samples, n_tasks))
  # Guarantee that both classes occur in every task. With only 10 samples a
  # random column can be all 0s or all 1s, in which case one side of the
  # class-balance comparison below would sum over an empty selection.
  y[0, :] = 0
  y[1, :] = 1
  w = np.ones((n_samples, n_tasks))
  multitask_dataset = dc.data.NumpyDataset(X, y, w)
  balancing_transformer = dc.trans.BalancingTransformer(
      transform_w=True, dataset=multitask_dataset)
  multitask_dataset = balancing_transformer.transform(multitask_dataset)
  X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
                          multitask_dataset.w, multitask_dataset.ids)
  # Check ids are unchanged.
  for id_elt, id_t_elt in zip(ids, ids_t):
    assert id_elt == id_t_elt
  # Check X is unchanged since this is a w transformer
  np.testing.assert_allclose(X, X_t)
  # Check y is unchanged since this is a w transformer
  np.testing.assert_allclose(y, y_t)
  for ind, _ in enumerate(multitask_dataset.get_task_names()):
    y_task = y_t[:, ind]
    w_task = w_t[:, ind]
    w_orig_task = w[:, ind]
    # Assert that entries with zero weight retain zero weight
    np.testing.assert_allclose(w_task[w_orig_task == 0],
                               np.zeros_like(w_task[w_orig_task == 0]))
    # Check that sum of 0s equals sum of 1s in transformed for each task
    assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))


def test_binary_multitask(self):
"""Test balancing transformer on multitask dataset."""
n_samples = 10
n_features = 3
n_tasks = 5
n_classes = 2
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples, n_tasks))
w = np.ones((n_samples, n_tasks))
multitask_dataset = dc.data.NumpyDataset(X, y, w)
balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=multitask_dataset)
#X, y, w, ids = (multitask_dataset.X, multitask_dataset.y,
# multitask_dataset.w, multitask_dataset.ids)
multitask_dataset = balancing_transformer.transform(multitask_dataset)
X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
multitask_dataset.w, multitask_dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
for ind, task in enumerate(multitask_dataset.get_task_names()):
y_task = y_t[:, ind]
w_task = w_t[:, ind]
w_orig_task = w[:, ind]
# Assert that entries with zero weight retain zero weight
np.testing.assert_allclose(w_task[w_orig_task == 0],
np.zeros_like(w_task[w_orig_task == 0]))
# Check that sum of 0s equals sum of 1s in transformed for each task
def test_multiclass_singletask():
"""Test balancing transformer on single-task dataset."""
n_samples = 50
n_features = 3
n_tasks = 1
n_classes = 5
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples, n_tasks))
w = np.ones((n_samples, n_tasks))
dataset = dc.data.NumpyDataset(X, y, w)

balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=dataset)
dataset = balancing_transformer.transform(dataset)
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
for ind, task in enumerate(dataset.get_task_names()):
y_task = y_t[:, ind]
w_task = w_t[:, ind]
w_orig_task = w[:, ind]
# Check that every pair of distinct classes has the same total weight in the transformed data for each task
for i, j in itertools.product(range(n_classes), range(n_classes)):
if i == j:
continue
assert np.isclose(
np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))


def test_multiclass_singletask(self):
"""Test balancing transformer on single-task dataset."""
n_samples = 50
n_features = 3
n_tasks = 1
n_classes = 5
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples, n_tasks))
w = np.ones((n_samples, n_tasks))
dataset = dc.data.NumpyDataset(X, y, w)
def test_transform_to_directory():
"""Test that output can be written to a directory."""
n_samples = 20
n_features = 3
n_classes = 2
np.random.seed(123)
ids = np.arange(n_samples)
X = np.random.rand(n_samples, n_features)
y = np.random.randint(n_classes, size=(n_samples,))
w = np.ones((n_samples,))
dataset = dc.data.NumpyDataset(X, y, w)

balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=dataset)
dataset = balancing_transformer.transform(dataset)
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
for ind, task in enumerate(dataset.get_task_names()):
y_task = y_t[:, ind]
w_task = w_t[:, ind]
w_orig_task = w[:, ind]
# Check that sum of 0s equals sum of 1s in transformed for each task
for i, j in itertools.product(range(n_classes), range(n_classes)):
if i == j:
continue
assert np.isclose(
np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j]))
balancing_transformer = dc.trans.BalancingTransformer(
transform_w=True, dataset=dataset)
with tempfile.TemporaryDirectory() as tmpdirname:
dataset = balancing_transformer.transform(dataset, out_dir=tmpdirname)
balanced_dataset = dc.data.DiskDataset(tmpdirname)
X_t, y_t, w_t, ids_t = (balanced_dataset.X, balanced_dataset.y,
balanced_dataset.w, balanced_dataset.ids)
# Check ids are unchanged.
for id_elt, id_t_elt in zip(ids, ids_t):
assert id_elt == id_t_elt
# Check X is unchanged since this is a w transformer
np.testing.assert_allclose(X, X_t)
# Check y is unchanged since this is a w transformer
np.testing.assert_allclose(y, y_t)
y_task = y_t
w_task = w_t
w_orig_task = w
# Assert that entries with zero weight retain zero weight
np.testing.assert_allclose(w_task[w_orig_task == 0],
np.zeros_like(w_task[w_orig_task == 0]))
# Check that sum of 0s equals sum of 1s in transformed for each task
assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
10 changes: 8 additions & 2 deletions deepchem/trans/transformers.py
Expand Up @@ -153,7 +153,7 @@ def untransform(self, z):
raise NotImplementedError(
"Each Transformer is responsible for its own untransform method.")

def transform(self, dataset, parallel=False, **kwargs):
def transform(self, dataset, parallel=False, out_dir=None, **kwargs):
"""Transforms all internally stored data in dataset.
This method transforms all internal data in the provided dataset by using
Expand All @@ -175,12 +175,18 @@ def transform(self, dataset, parallel=False, **kwargs):
-------
a newly constructed Dataset object
"""
# Add this case in to handle non-DiskDataset that should be written to disk
if out_dir is not None:
if not isinstance(dataset, dc.data.DiskDataset):
dataset = dc.data.DiskDataset.from_numpy(dataset.X, dataset.y,
dataset.w, dataset.ids)
_, y_shape, w_shape, _ = dataset.get_shape()
if y_shape == tuple() and self.transform_y:
raise ValueError("Cannot transform y when y_values are not present")
if w_shape == tuple() and self.transform_w:
raise ValueError("Cannot transform w when w_values are not present")
return dataset.transform(lambda X, y, w: self.transform_array(X, y, w))
return dataset.transform(
lambda X, y, w: self.transform_array(X, y, w), out_dir=out_dir)

def transform_on_array(self, X, y, w):
"""Transforms numpy arrays X, y, and w
Expand Down

0 comments on commit 4441241

Please sign in to comment.