Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
176 additions
and
137 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,148 +1,181 @@ | ||
import os | ||
import numpy as np | ||
import unittest | ||
import deepchem as dc | ||
import itertools | ||
import os | ||
import tempfile | ||
|
||
|
||
def test_binary_1d():
    """Check BalancingTransformer on a single-task dataset with 1-D y and w.

    The labels and weights are created with shape ``(n_samples,)``, i.e.
    without an explicit task axis. After the weight transform the test
    expects the ids to match ``np.arange(n_samples)``, X and y to be
    unchanged, zero-weight entries to stay at zero, and the summed weight
    of class 0 to equal the summed weight of class 1.
    """
    n_samples, n_features, n_classes = 20, 3, 2
    np.random.seed(123)
    expected_ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples,))
    w = np.ones((n_samples,))
    original = dc.data.NumpyDataset(X, y, w)

    transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=original)
    balanced = transformer.transform(original)
    X_new, y_new, w_new, ids_new = (balanced.X, balanced.y, balanced.w,
                                    balanced.ids)

    # The transformer only touches w, so ids, X and y must come back as-is.
    for expected, actual in zip(expected_ids, ids_new):
        assert expected == actual
    np.testing.assert_allclose(X, X_new)
    np.testing.assert_allclose(y, y_new)

    # Samples that started with zero weight must still have zero weight.
    zero_weighted = w_new[w == 0]
    np.testing.assert_allclose(zero_weighted, np.zeros_like(zero_weighted))

    # Balancing means both classes carry the same total weight.
    weight_class_0 = np.sum(w_new[y_new == 0])
    weight_class_1 = np.sum(w_new[y_new == 1])
    assert np.isclose(weight_class_0, weight_class_1)
|
||
class TestBalancingTransformer(unittest.TestCase): | ||
""" | ||
Test BalancingTransformer functionality. | ||
""" | ||
|
||
def test_binary_1d(self): | ||
"""Test balancing transformer on single-task dataset without explicit task dimension.""" | ||
n_samples = 20 | ||
n_features = 3 | ||
n_classes = 2 | ||
np.random.seed(123) | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples,)) | ||
w = np.ones((n_samples,)) | ||
dataset = dc.data.NumpyDataset(X, y, w) | ||
def test_binary_singletask(): | ||
"""Test balancing transformer on single-task dataset.""" | ||
n_samples = 20 | ||
n_features = 3 | ||
n_tasks = 1 | ||
n_classes = 2 | ||
np.random.seed(123) | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples, n_tasks)) | ||
w = np.ones((n_samples, n_tasks)) | ||
dataset = dc.data.NumpyDataset(X, y, w) | ||
|
||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=dataset) | ||
dataset = balancing_transformer.transform(dataset) | ||
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
y_task = y_t | ||
w_task = w_t | ||
w_orig_task = w | ||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=dataset) | ||
dataset = balancing_transformer.transform(dataset) | ||
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
for ind, task in enumerate(dataset.get_task_names()): | ||
y_task = y_t[:, ind] | ||
w_task = w_t[:, ind] | ||
w_orig_task = w[:, ind] | ||
# Assert that entries with zero weight retain zero weight | ||
np.testing.assert_allclose(w_task[w_orig_task == 0], | ||
np.zeros_like(w_task[w_orig_task == 0])) | ||
# Check that sum of 0s equals sum of 1s in transformed for each task | ||
assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) | ||
|
||
def test_binary_singletask(self): | ||
"""Test balancing transformer on single-task dataset.""" | ||
n_samples = 20 | ||
n_features = 3 | ||
n_tasks = 1 | ||
n_classes = 2 | ||
np.random.seed(123) | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples, n_tasks)) | ||
w = np.ones((n_samples, n_tasks)) | ||
dataset = dc.data.NumpyDataset(X, y, w) | ||
|
||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=dataset) | ||
dataset = balancing_transformer.transform(dataset) | ||
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
for ind, task in enumerate(dataset.get_task_names()): | ||
y_task = y_t[:, ind] | ||
w_task = w_t[:, ind] | ||
w_orig_task = w[:, ind] | ||
# Assert that entries with zero weight retain zero weight | ||
np.testing.assert_allclose(w_task[w_orig_task == 0], | ||
np.zeros_like(w_task[w_orig_task == 0])) | ||
# Check that sum of 0s equals sum of 1s in transformed for each task | ||
assert np.isclose( | ||
np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) | ||
def test_binary_multitask():
    """Test balancing transformer on a multitask binary dataset.

    Builds a random dataset with 10 samples, 3 features and 5 binary tasks,
    all with unit weights, applies ``BalancingTransformer`` with
    ``transform_w=True`` and then checks that the ids match
    ``np.arange(n_samples)``, that X and y are unchanged, and, for every
    task column, that zero-weight entries stay at zero and the summed
    weights of class 0 and class 1 are equal.
    """
    n_samples = 10
    n_features = 3
    n_tasks = 5
    n_classes = 2
    # Seed the global RNG, as the sibling binary tests do, so the random
    # labels (and therefore the per-task class balance) are reproducible.
    np.random.seed(123)
    ids = np.arange(n_samples)
    X = np.random.rand(n_samples, n_features)
    y = np.random.randint(n_classes, size=(n_samples, n_tasks))
    w = np.ones((n_samples, n_tasks))
    multitask_dataset = dc.data.NumpyDataset(X, y, w)
    balancing_transformer = dc.trans.BalancingTransformer(
        transform_w=True, dataset=multitask_dataset)
    multitask_dataset = balancing_transformer.transform(multitask_dataset)
    X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y,
                            multitask_dataset.w, multitask_dataset.ids)
    # Check ids are unchanged.
    for id_elt, id_t_elt in zip(ids, ids_t):
        assert id_elt == id_t_elt
    # Check X is unchanged since this is a w transformer
    np.testing.assert_allclose(X, X_t)
    # Check y is unchanged since this is a w transformer
    np.testing.assert_allclose(y, y_t)
    # Only the column index is needed; the task name itself is not used.
    for ind, _task in enumerate(multitask_dataset.get_task_names()):
        y_task = y_t[:, ind]
        w_task = w_t[:, ind]
        w_orig_task = w[:, ind]
        # Assert that entries with zero weight retain zero weight
        np.testing.assert_allclose(w_task[w_orig_task == 0],
                                   np.zeros_like(w_task[w_orig_task == 0]))
        # Check that sum of 0s equals sum of 1s in transformed for each task
        assert np.isclose(
            np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1]))
|
||
|
||
def test_binary_multitask(self): | ||
"""Test balancing transformer on multitask dataset.""" | ||
n_samples = 10 | ||
n_features = 3 | ||
n_tasks = 5 | ||
n_classes = 2 | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples, n_tasks)) | ||
w = np.ones((n_samples, n_tasks)) | ||
multitask_dataset = dc.data.NumpyDataset(X, y, w) | ||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=multitask_dataset) | ||
#X, y, w, ids = (multitask_dataset.X, multitask_dataset.y, | ||
# multitask_dataset.w, multitask_dataset.ids) | ||
multitask_dataset = balancing_transformer.transform(multitask_dataset) | ||
X_t, y_t, w_t, ids_t = (multitask_dataset.X, multitask_dataset.y, | ||
multitask_dataset.w, multitask_dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
for ind, task in enumerate(multitask_dataset.get_task_names()): | ||
y_task = y_t[:, ind] | ||
w_task = w_t[:, ind] | ||
w_orig_task = w[:, ind] | ||
# Assert that entries with zero weight retain zero weight | ||
np.testing.assert_allclose(w_task[w_orig_task == 0], | ||
np.zeros_like(w_task[w_orig_task == 0])) | ||
# Check that sum of 0s equals sum of 1s in transformed for each task | ||
def test_multiclass_singletask(): | ||
"""Test balancing transformer on single-task dataset.""" | ||
n_samples = 50 | ||
n_features = 3 | ||
n_tasks = 1 | ||
n_classes = 5 | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples, n_tasks)) | ||
w = np.ones((n_samples, n_tasks)) | ||
dataset = dc.data.NumpyDataset(X, y, w) | ||
|
||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=dataset) | ||
dataset = balancing_transformer.transform(dataset) | ||
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
for ind, task in enumerate(dataset.get_task_names()): | ||
y_task = y_t[:, ind] | ||
w_task = w_t[:, ind] | ||
w_orig_task = w[:, ind] | ||
# Check that the summed weights of every pair of distinct classes are equal in transformed for each task | ||
for i, j in itertools.product(range(n_classes), range(n_classes)): | ||
if i == j: | ||
continue | ||
assert np.isclose( | ||
np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) | ||
np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j])) | ||
|
||
|
||
def test_multiclass_singletask(self): | ||
"""Test balancing transformer on single-task dataset.""" | ||
n_samples = 50 | ||
n_features = 3 | ||
n_tasks = 1 | ||
n_classes = 5 | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples, n_tasks)) | ||
w = np.ones((n_samples, n_tasks)) | ||
dataset = dc.data.NumpyDataset(X, y, w) | ||
def test_transform_to_directory(): | ||
"""Test that output can be written to a directory.""" | ||
n_samples = 20 | ||
n_features = 3 | ||
n_classes = 2 | ||
np.random.seed(123) | ||
ids = np.arange(n_samples) | ||
X = np.random.rand(n_samples, n_features) | ||
y = np.random.randint(n_classes, size=(n_samples,)) | ||
w = np.ones((n_samples,)) | ||
dataset = dc.data.NumpyDataset(X, y, w) | ||
|
||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=dataset) | ||
dataset = balancing_transformer.transform(dataset) | ||
X_t, y_t, w_t, ids_t = (dataset.X, dataset.y, dataset.w, dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
for ind, task in enumerate(dataset.get_task_names()): | ||
y_task = y_t[:, ind] | ||
w_task = w_t[:, ind] | ||
w_orig_task = w[:, ind] | ||
# Check that the summed weights of every pair of distinct classes are equal in transformed for each task | ||
for i, j in itertools.product(range(n_classes), range(n_classes)): | ||
if i == j: | ||
continue | ||
assert np.isclose( | ||
np.sum(w_task[y_task == i]), np.sum(w_task[y_task == j])) | ||
balancing_transformer = dc.trans.BalancingTransformer( | ||
transform_w=True, dataset=dataset) | ||
with tempfile.TemporaryDirectory() as tmpdirname: | ||
dataset = balancing_transformer.transform(dataset, out_dir=tmpdirname) | ||
balanced_dataset = dc.data.DiskDataset(tmpdirname) | ||
X_t, y_t, w_t, ids_t = (balanced_dataset.X, balanced_dataset.y, | ||
balanced_dataset.w, balanced_dataset.ids) | ||
# Check ids are unchanged. | ||
for id_elt, id_t_elt in zip(ids, ids_t): | ||
assert id_elt == id_t_elt | ||
# Check X is unchanged since this is a w transformer | ||
np.testing.assert_allclose(X, X_t) | ||
# Check y is unchanged since this is a w transformer | ||
np.testing.assert_allclose(y, y_t) | ||
y_task = y_t | ||
w_task = w_t | ||
w_orig_task = w | ||
# Assert that entries with zero weight retain zero weight | ||
np.testing.assert_allclose(w_task[w_orig_task == 0], | ||
np.zeros_like(w_task[w_orig_task == 0])) | ||
# Check that sum of 0s equals sum of 1s in transformed for each task | ||
assert np.isclose(np.sum(w_task[y_task == 0]), np.sum(w_task[y_task == 1])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters