-
Notifications
You must be signed in to change notification settings - Fork 1.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2062 from deepchem/cached_shape
Add shape metadata to DiskDataset
- Loading branch information
Showing
58 changed files
with
799 additions
and
221 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] |
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import deepchem as dc | ||
import tempfile | ||
import numpy as np | ||
import os | ||
|
||
|
||
def test_copy():
  """Check that DiskDataset.copy produces a byte-identical dataset."""
  n_samples, n_feats, n_tasks = 100, 10, 10
  # Random dataset to duplicate.
  X = np.random.rand(n_samples, n_feats)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.random.randint(2, size=(n_samples, n_tasks))
  ids = np.array(["id"] * n_samples)

  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Disable the in-memory cache so errors are not hidden by cached shards.
  dataset.memory_cache_size = 0

  with tempfile.TemporaryDirectory() as tmpdirname:
    duplicate = dataset.copy(tmpdirname)
    assert np.all(duplicate.X == dataset.X)
    assert np.all(duplicate.y == dataset.y)
    assert np.all(duplicate.w == dataset.w)
    assert np.all(duplicate.ids == dataset.ids)
|
||
|
||
def test_move():
  """Check that DiskDataset.move relocates the data without altering it."""
  n_samples, n_feats, n_tasks = 100, 10, 10
  # Random dataset to relocate.
  X = np.random.rand(n_samples, n_feats)
  y = np.random.randint(2, size=(n_samples, n_tasks))
  w = np.random.randint(2, size=(n_samples, n_tasks))
  ids = np.array(["id"] * n_samples)

  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Disable the in-memory cache so errors are not hidden by cached shards.
  dataset.memory_cache_size = 0
  original_dir = dataset.data_dir

  with tempfile.TemporaryDirectory() as tmpdirname:
    dataset.move(tmpdirname, delete_if_exists=False)
    # Contents must be unchanged after the move...
    assert np.all(X == dataset.X)
    assert np.all(y == dataset.y)
    assert np.all(w == dataset.w)
    assert np.all(ids == dataset.ids)
    # ...and the data directory must now live under the new parent.
    assert dataset.data_dir == os.path.join(tmpdirname,
                                            os.path.basename(original_dir))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
import os | ||
import deepchem as dc | ||
import numpy as np | ||
import tempfile | ||
|
||
|
||
def test_make_legacy_dataset_from_numpy():
  """Check that legacy-format DiskDataset objects can be loaded."""
  current_dir = os.path.dirname(os.path.abspath(__file__))
  # legacy_dataset is a checked-in dataset stored in the legacy
  # (pre-shape-metadata) format, kept around for testing purposes.
  data_dir = os.path.join(current_dir, "legacy_dataset")

  dataset = dc.data.DiskDataset(data_dir)
  # Legacy metadata has only the four data columns, no *_shape columns.
  assert dataset.legacy_metadata
  assert len(dataset.metadata_df.columns) == 4
  assert list(dataset.metadata_df.columns) == ['ids', 'X', 'y', 'w']

  # Reloading from the same directory must preserve the legacy metadata.
  reloaded = dc.data.DiskDataset(dataset.data_dir)
  assert reloaded.legacy_metadata
  assert len(reloaded.metadata_df.columns) == 4
  assert list(reloaded.metadata_df.columns) == ['ids', 'X', 'y', 'w']
|
||
|
||
def test_reshard():
  """Check that resharding upgrades a legacy dataset's metadata."""
  # legacy_dataset_reshard is a checked-in sharded dataset in the legacy
  # format, kept around to exercise resharding.
  current_dir = os.path.dirname(os.path.abspath(__file__))
  data_dir = os.path.join(current_dir, "legacy_dataset_reshard")
  dataset = dc.data.DiskDataset(data_dir)
  # Legacy metadata has only the four data columns, no *_shape columns.
  assert dataset.legacy_metadata
  assert len(dataset.metadata_df.columns) == 4
  assert list(dataset.metadata_df.columns) == ['ids', 'X', 'y', 'w']

  with tempfile.TemporaryDirectory() as tmpdirname:
    duplicate = dataset.copy(tmpdirname)
    assert np.all(duplicate.X == dataset.X)
    assert np.all(duplicate.y == dataset.y)
    assert np.all(duplicate.w == dataset.w)
    assert np.all(duplicate.ids == dataset.ids)

    # Resharding must rewrite the metadata in the modern format, which
    # additionally records the on-disk shape of every column.
    duplicate.reshard(shard_size=10)
    assert duplicate.get_number_shards() == 10
    assert not duplicate.legacy_metadata
    assert len(duplicate.metadata_df.columns) == 8
    assert list(duplicate.metadata_df.columns) == [
        'ids', 'X', 'y', 'w', 'ids_shape', 'X_shape', 'y_shape', 'w_shape'
    ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,61 +1,52 @@ | ||
""" | ||
Testing singletask/multitask dataset merging | ||
""" | ||
__author__ = "Bharath Ramsundar" | ||
__copyright__ = "Copyright 2016, Stanford University" | ||
__license__ = "MIT" | ||
|
||
import os | ||
import shutil | ||
import tempfile | ||
import unittest | ||
import deepchem as dc | ||
import numpy as np | ||
|
||
|
||
class TestMerge(unittest.TestCase): | ||
""" | ||
Test singletask/multitask dataset merging. | ||
""" | ||
def test_merge(): | ||
"""Test that datasets can be merged.""" | ||
current_dir = os.path.dirname(os.path.realpath(__file__)) | ||
|
||
def test_merge(self): | ||
"""Test that datasets can be merged.""" | ||
current_dir = os.path.dirname(os.path.realpath(__file__)) | ||
dataset_file = os.path.join(current_dir, "../../models/tests/example.csv") | ||
|
||
dataset_file = os.path.join(current_dir, "../../models/tests/example.csv") | ||
featurizer = dc.feat.CircularFingerprint(size=1024) | ||
tasks = ["log-solubility"] | ||
loader = dc.data.CSVLoader( | ||
tasks=tasks, smiles_field="smiles", featurizer=featurizer) | ||
first_dataset = loader.create_dataset(dataset_file) | ||
second_dataset = loader.create_dataset(dataset_file) | ||
|
||
featurizer = dc.feat.CircularFingerprint(size=1024) | ||
tasks = ["log-solubility"] | ||
loader = dc.data.CSVLoader( | ||
tasks=tasks, smiles_field="smiles", featurizer=featurizer) | ||
first_dataset = loader.featurize(dataset_file) | ||
second_dataset = loader.featurize(dataset_file) | ||
merged_dataset = dc.data.DiskDataset.merge([first_dataset, second_dataset]) | ||
|
||
merged_dataset = dc.data.DiskDataset.merge([first_dataset, second_dataset]) | ||
assert len(merged_dataset) == len(first_dataset) + len(second_dataset) | ||
|
||
assert len(merged_dataset) == len(first_dataset) + len(second_dataset) | ||
|
||
def test_subset(self): | ||
"""Tests that subsetting of datasets works.""" | ||
current_dir = os.path.dirname(os.path.realpath(__file__)) | ||
def test_subset(): | ||
"""Tests that subsetting of datasets works.""" | ||
current_dir = os.path.dirname(os.path.realpath(__file__)) | ||
|
||
dataset_file = os.path.join(current_dir, "../../models/tests/example.csv") | ||
dataset_file = os.path.join(current_dir, "../../models/tests/example.csv") | ||
|
||
featurizer = dc.feat.CircularFingerprint(size=1024) | ||
tasks = ["log-solubility"] | ||
loader = dc.data.CSVLoader( | ||
tasks=tasks, smiles_field="smiles", featurizer=featurizer) | ||
dataset = loader.featurize(dataset_file, shard_size=2) | ||
featurizer = dc.feat.CircularFingerprint(size=1024) | ||
tasks = ["log-solubility"] | ||
loader = dc.data.CSVLoader( | ||
tasks=tasks, smiles_field="smiles", featurizer=featurizer) | ||
dataset = loader.create_dataset(dataset_file, shard_size=2) | ||
|
||
shard_nums = [1, 2] | ||
shard_nums = [1, 2] | ||
|
||
orig_ids = dataset.ids | ||
_, _, _, ids_1 = dataset.get_shard(1) | ||
_, _, _, ids_2 = dataset.get_shard(2) | ||
orig_ids = dataset.ids | ||
_, _, _, ids_1 = dataset.get_shard(1) | ||
_, _, _, ids_2 = dataset.get_shard(2) | ||
|
||
subset = dataset.subset(shard_nums) | ||
after_ids = dataset.ids | ||
subset = dataset.subset(shard_nums) | ||
after_ids = dataset.ids | ||
|
||
assert len(subset) == 4 | ||
assert sorted(subset.ids) == sorted(np.concatenate([ids_1, ids_2])) | ||
assert list(orig_ids) == list(after_ids) | ||
assert len(subset) == 4 | ||
assert sorted(subset.ids) == sorted(np.concatenate([ids_1, ids_2])) | ||
assert list(orig_ids) == list(after_ids) |
20 changes: 20 additions & 0 deletions
20
deepchem/data/tests/test_non_classification_regression_datasets.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
import deepchem as dc | ||
import numpy as np | ||
|
||
|
||
def test_disk_generative_dataset():
  """DiskDataset should round-trip rank-3 (generative-style) arrays."""
  features = np.random.rand(100, 10, 10)
  labels = np.random.rand(100, 10, 10)
  ds = dc.data.DiskDataset.from_numpy(features, labels)
  assert np.array_equal(ds.X, features)
  assert np.array_equal(ds.y, labels)
|
||
|
||
def test_numpy_generative_dataset():
  """NumpyDataset should round-trip rank-3 (generative-style) arrays."""
  features = np.random.rand(100, 10, 10)
  labels = np.random.rand(100, 10, 10)
  ds = dc.data.NumpyDataset(features, labels)
  assert np.array_equal(ds.X, features)
  assert np.array_equal(ds.y, labels)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import deepchem as dc | ||
import numpy as np | ||
|
||
|
||
def test_reshard_with_X():
  """Resharding a features-only dataset preserves X."""
  features = np.random.rand(100, 10)
  ds = dc.data.DiskDataset.from_numpy(features)
  assert ds.get_number_shards() == 1
  ds.reshard(shard_size=10)
  assert np.array_equal(ds.X, features)
  assert ds.get_number_shards() == 10
|
||
|
||
def test_reshard_with_X_y():
  """Resharding preserves both X and y."""
  features = np.random.rand(100, 10)
  labels = np.random.rand(100,)
  ds = dc.data.DiskDataset.from_numpy(features, labels)
  assert ds.get_number_shards() == 1
  ds.reshard(shard_size=10)
  assert np.array_equal(ds.X, features)
  # from_numpy adds shape information to 1-D y, so flatten before comparing.
  assert np.array_equal(ds.y.flatten(), labels)
  assert ds.get_number_shards() == 10
|
||
|
||
def test_reshard_with_X_y_generative():
  """Resharding preserves rank-3 (generative-style) X and y."""
  features = np.random.rand(100, 10, 10)
  labels = np.random.rand(100, 10, 10)
  ds = dc.data.DiskDataset.from_numpy(features, labels)
  assert np.array_equal(ds.X, features)
  assert np.array_equal(ds.y, labels)
  assert ds.get_number_shards() == 1
  ds.reshard(shard_size=10)
  assert np.array_equal(ds.X, features)
  assert np.array_equal(ds.y, labels)
  assert ds.get_number_shards() == 10
|
||
|
||
def test_reshard_with_X_y_w():
  """Resharding preserves X, y, and w."""
  features = np.random.rand(100, 10)
  labels = np.random.rand(100,)
  weights = np.ones_like(labels)
  ds = dc.data.DiskDataset.from_numpy(features, labels, weights)
  assert ds.get_number_shards() == 1
  ds.reshard(shard_size=10)
  assert np.array_equal(ds.X, features)
  # from_numpy adds shape information to 1-D arrays, so flatten first.
  assert np.array_equal(ds.y.flatten(), labels)
  assert np.array_equal(ds.w.flatten(), weights)
  assert ds.get_number_shards() == 10
|
||
|
||
def test_reshard_with_X_y_w_ids():
  """Resharding preserves X, y, w, and ids."""
  features = np.random.rand(100, 10)
  labels = np.random.rand(100,)
  weights = np.ones_like(labels)
  identifiers = np.arange(100)
  ds = dc.data.DiskDataset.from_numpy(features, labels, weights, identifiers)
  assert ds.get_number_shards() == 1
  ds.reshard(shard_size=10)
  assert np.array_equal(ds.X, features)
  # from_numpy adds shape information to 1-D arrays, so flatten first.
  assert np.array_equal(ds.y.flatten(), labels)
  assert np.array_equal(ds.w.flatten(), weights)
  assert np.array_equal(ds.ids, identifiers)
  assert ds.get_number_shards() == 10
Oops, something went wrong.