Merge pull request #2062 from deepchem/cached_shape
Add shape metadata to DiskDataset
Bharath Ramsundar committed Aug 13, 2020
2 parents 7691cc5 + 5873743 commit 4e9cfbe
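
For context, a minimal sketch of what the cached shape metadata enables, assuming the dc.data API as of this commit (the array values here are illustrative):

import numpy as np
import deepchem as dc

X = np.random.rand(100, 10)
y = np.random.rand(100, 1)

# from_numpy writes shards to disk together with dataset metadata.
dataset = dc.data.DiskDataset.from_numpy(X, y)

# With shape columns cached in the metadata, get_shape() can report
# array shapes without reading every shard back into memory.
X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
print(X_shape, y_shape)  # (100, 10) (100, 1)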
Showing 58 changed files with 799 additions and 221 deletions.
420 changes: 340 additions & 80 deletions deepchem/data/datasets.py

Large diffs are not rendered by default.

Binary file added deepchem/data/tests/legacy_dataset/shard-0-X.npy
Binary file added deepchem/data/tests/legacy_dataset/shard-0-w.npy
Binary file added deepchem/data/tests/legacy_dataset/shard-0-y.npy
1 change: 1 addition & 0 deletions deepchem/data/tests/legacy_dataset/tasks.json
@@ -0,0 +1 @@
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Binary shard files added under deepchem/data/tests/legacy_dataset_reshard (contents not shown).
1 change: 1 addition & 0 deletions deepchem/data/tests/legacy_dataset_reshard/tasks.json
@@ -0,0 +1 @@
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
57 changes: 57 additions & 0 deletions deepchem/data/tests/test_copy_and_move.py
@@ -0,0 +1,57 @@
import deepchem as dc
import tempfile
import numpy as np
import os


def test_copy():
  """Test that copy works correctly."""
  num_datapoints = 100
  num_features = 10
  num_tasks = 10
  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)

  # Construct a DiskDataset from the generated arrays.
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Set cache to 0 size to avoid the cache hiding errors
  dataset.memory_cache_size = 0

  with tempfile.TemporaryDirectory() as tmpdirname:
    copy = dataset.copy(tmpdirname)
    assert np.all(copy.X == dataset.X)
    assert np.all(copy.y == dataset.y)
    assert np.all(copy.w == dataset.w)
    assert np.all(copy.ids == dataset.ids)


def test_move():
  """Test that move works correctly."""
  num_datapoints = 100
  num_features = 10
  num_tasks = 10
  # Generate data
  X = np.random.rand(num_datapoints, num_features)
  y = np.random.randint(2, size=(num_datapoints, num_tasks))
  w = np.random.randint(2, size=(num_datapoints, num_tasks))
  ids = np.array(["id"] * num_datapoints)

  # Construct a DiskDataset from the generated arrays.
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  # Set cache to 0 size to avoid the cache hiding errors
  dataset.memory_cache_size = 0
  data_dir = dataset.data_dir

  with tempfile.TemporaryDirectory() as tmpdirname:
    dataset.move(tmpdirname, delete_if_exists=False)
    assert np.all(X == dataset.X)
    assert np.all(y == dataset.y)
    assert np.all(w == dataset.w)
    assert np.all(ids == dataset.ids)
    assert dataset.data_dir == os.path.join(tmpdirname,
                                            os.path.basename(data_dir))
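
The two tests above exercise different semantics: copy produces an independent dataset rooted in the target directory, while move relocates the dataset's own data_dir. A compact sketch of the distinction, under the same assumptions as the tests above:

import os
import tempfile

import numpy as np
import deepchem as dc

dataset = dc.data.DiskDataset.from_numpy(np.random.rand(10, 3))

with tempfile.TemporaryDirectory() as copy_dir, \
    tempfile.TemporaryDirectory() as move_dir:
  duplicate = dataset.copy(copy_dir)  # independent copy; original untouched
  assert (duplicate.X == dataset.X).all()

  original_dir = dataset.data_dir
  dataset.move(move_dir, delete_if_exists=False)  # same object, new location
  assert dataset.data_dir == os.path.join(move_dir,
                                          os.path.basename(original_dir))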
24 changes: 0 additions & 24 deletions deepchem/data/tests/test_datasets.py
@@ -1,10 +1,6 @@
"""
Tests for dataset creation
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import random
import math
import unittest
@@ -343,26 +339,6 @@ def shard_generator():
   np.testing.assert_array_equal(np.sort(dataset.ids), np.sort(res.ids))
 
 
-def test_get_shape():
-  """Test that get_shape works."""
-  num_datapoints = 100
-  num_features = 10
-  num_tasks = 10
-  # Generate data
-  X = np.random.rand(num_datapoints, num_features)
-  y = np.random.randint(2, size=(num_datapoints, num_tasks))
-  w = np.random.randint(2, size=(num_datapoints, num_tasks))
-  ids = np.array(["id"] * num_datapoints)
-
-  dataset = dc.data.NumpyDataset(X, y, w, ids)
-
-  X_shape, y_shape, w_shape, ids_shape = dataset.get_shape()
-  assert X_shape == X.shape
-  assert y_shape == y.shape
-  assert w_shape == w.shape
-  assert ids_shape == ids.shape
-
-
 def test_iterbatches():
   """Test that iterating over batches of data works."""
   solubility_dataset = load_solubility_data()
61 changes: 61 additions & 0 deletions deepchem/data/tests/test_legacy.py
@@ -0,0 +1,61 @@
import os
import deepchem as dc
import numpy as np
import tempfile


def test_make_legacy_dataset_from_numpy():
  """Test that legacy DiskDataset objects can be constructed."""
  # This is the shape of legacy_dataset
  num_datapoints = 100
  num_features = 10
  num_tasks = 10

  current_dir = os.path.dirname(os.path.abspath(__file__))
  # legacy_dataset is a dataset in the legacy format kept around for testing
  # purposes.
  data_dir = os.path.join(current_dir, "legacy_dataset")
  dataset = dc.data.DiskDataset(data_dir)
  assert dataset.legacy_metadata
  assert len(dataset.metadata_df.columns) == 4
  assert list(dataset.metadata_df.columns) == ['ids', 'X', 'y', 'w']

  # Test constructor reload works for legacy format
  dataset2 = dc.data.DiskDataset(dataset.data_dir)
  assert dataset2.legacy_metadata
  assert len(dataset2.metadata_df.columns) == 4
  assert list(dataset2.metadata_df.columns) == ['ids', 'X', 'y', 'w']


def test_reshard():
  """Test that resharding updates legacy datasets."""
  # This is the shape of legacy_dataset_reshard
  num_datapoints = 100
  num_features = 10
  num_tasks = 10

  # legacy_dataset_reshard is a sharded dataset in the legacy format kept
  # around for testing resharding.
  current_dir = os.path.dirname(os.path.abspath(__file__))
  data_dir = os.path.join(current_dir, "legacy_dataset_reshard")
  dataset = dc.data.DiskDataset(data_dir)
  assert dataset.legacy_metadata
  assert len(dataset.metadata_df.columns) == 4
  assert list(dataset.metadata_df.columns) == ['ids', 'X', 'y', 'w']

  with tempfile.TemporaryDirectory() as tmpdirname:
    copy = dataset.copy(tmpdirname)
    assert np.all(copy.X == dataset.X)
    assert np.all(copy.y == dataset.y)
    assert np.all(copy.w == dataset.w)
    assert np.all(copy.ids == dataset.ids)

    # Reshard the copy
    copy.reshard(shard_size=10)
    assert copy.get_number_shards() == 10
    # Check that the metadata has been upgraded from the legacy format
    assert not copy.legacy_metadata
    assert len(copy.metadata_df.columns) == 8
    assert list(copy.metadata_df.columns) == [
        'ids', 'X', 'y', 'w', 'ids_shape', 'X_shape', 'y_shape', 'w_shape'
    ]
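
A hedged sketch of how one might upgrade a legacy-format dataset in place, built only from the attributes exercised above (legacy_metadata, reshard); the helper name upgrade_if_legacy and the default shard size are illustrative, not part of the DeepChem API:

import deepchem as dc


def upgrade_if_legacy(data_dir, shard_size=8192):
  """Hypothetical helper: reshard a legacy DiskDataset so its metadata
  gains the *_shape columns introduced by this PR."""
  dataset = dc.data.DiskDataset(data_dir)
  if dataset.legacy_metadata:
    # reshard() rewrites the shards and regenerates the metadata,
    # upgrading it to the eight-column format checked in test_reshard above.
    dataset.reshard(shard_size=shard_size)
  return dataset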
69 changes: 30 additions & 39 deletions deepchem/data/tests/test_merge.py
@@ -1,61 +1,52 @@
"""
Testing singletask/multitask dataset merging
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import os
import shutil
import tempfile
import unittest
import deepchem as dc
import numpy as np


class TestMerge(unittest.TestCase):
"""
Test singletask/multitask dataset merging.
"""
def test_merge():
"""Test that datasets can be merged."""
current_dir = os.path.dirname(os.path.realpath(__file__))

def test_merge(self):
"""Test that datasets can be merged."""
current_dir = os.path.dirname(os.path.realpath(__file__))
dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")
featurizer = dc.feat.CircularFingerprint(size=1024)
tasks = ["log-solubility"]
loader = dc.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
first_dataset = loader.create_dataset(dataset_file)
second_dataset = loader.create_dataset(dataset_file)

featurizer = dc.feat.CircularFingerprint(size=1024)
tasks = ["log-solubility"]
loader = dc.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
first_dataset = loader.featurize(dataset_file)
second_dataset = loader.featurize(dataset_file)
merged_dataset = dc.data.DiskDataset.merge([first_dataset, second_dataset])

merged_dataset = dc.data.DiskDataset.merge([first_dataset, second_dataset])
assert len(merged_dataset) == len(first_dataset) + len(second_dataset)

assert len(merged_dataset) == len(first_dataset) + len(second_dataset)

def test_subset(self):
"""Tests that subsetting of datasets works."""
current_dir = os.path.dirname(os.path.realpath(__file__))
def test_subset():
"""Tests that subsetting of datasets works."""
current_dir = os.path.dirname(os.path.realpath(__file__))

dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")
dataset_file = os.path.join(current_dir, "../../models/tests/example.csv")

featurizer = dc.feat.CircularFingerprint(size=1024)
tasks = ["log-solubility"]
loader = dc.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize(dataset_file, shard_size=2)
featurizer = dc.feat.CircularFingerprint(size=1024)
tasks = ["log-solubility"]
loader = dc.data.CSVLoader(
tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.create_dataset(dataset_file, shard_size=2)

shard_nums = [1, 2]
shard_nums = [1, 2]

orig_ids = dataset.ids
_, _, _, ids_1 = dataset.get_shard(1)
_, _, _, ids_2 = dataset.get_shard(2)
orig_ids = dataset.ids
_, _, _, ids_1 = dataset.get_shard(1)
_, _, _, ids_2 = dataset.get_shard(2)

subset = dataset.subset(shard_nums)
after_ids = dataset.ids
subset = dataset.subset(shard_nums)
after_ids = dataset.ids

assert len(subset) == 4
assert sorted(subset.ids) == sorted(np.concatenate([ids_1, ids_2]))
assert list(orig_ids) == list(after_ids)
assert len(subset) == 4
assert sorted(subset.ids) == sorted(np.concatenate([ids_1, ids_2]))
assert list(orig_ids) == list(after_ids)
20 changes: 20 additions & 0 deletions deepchem/data/tests/test_non_classification_regression_datasets.py
@@ -0,0 +1,20 @@
import deepchem as dc
import numpy as np


def test_disk_generative_dataset():
  """Test for a hypothetical generative dataset."""
  X = np.random.rand(100, 10, 10)
  y = np.random.rand(100, 10, 10)
  dataset = dc.data.DiskDataset.from_numpy(X, y)
  assert (dataset.X == X).all()
  assert (dataset.y == y).all()


def test_numpy_generative_dataset():
  """Test for a hypothetical generative dataset."""
  X = np.random.rand(100, 10, 10)
  y = np.random.rand(100, 10, 10)
  dataset = dc.data.NumpyDataset(X, y)
  assert (dataset.X == X).all()
  assert (dataset.y == y).all()
71 changes: 71 additions & 0 deletions deepchem/data/tests/test_reshard.py
@@ -0,0 +1,71 @@
import deepchem as dc
import numpy as np


def test_reshard_with_X():
  """Test resharding on a simple example"""
  X = np.random.rand(100, 10)
  dataset = dc.data.DiskDataset.from_numpy(X)
  assert dataset.get_number_shards() == 1
  dataset.reshard(shard_size=10)
  assert (dataset.X == X).all()
  assert dataset.get_number_shards() == 10


def test_reshard_with_X_y():
  """Test resharding on a simple example"""
  X = np.random.rand(100, 10)
  y = np.random.rand(100,)
  dataset = dc.data.DiskDataset.from_numpy(X, y)
  assert dataset.get_number_shards() == 1
  dataset.reshard(shard_size=10)
  assert (dataset.X == X).all()
  # Flattening is necessary since from_numpy promotes 1-D y to shape (n, 1)
  assert (dataset.y.flatten() == y).all()
  assert dataset.get_number_shards() == 10


def test_reshard_with_X_y_generative():
  """Test resharding for a hypothetical generative dataset."""
  X = np.random.rand(100, 10, 10)
  y = np.random.rand(100, 10, 10)
  dataset = dc.data.DiskDataset.from_numpy(X, y)
  assert (dataset.X == X).all()
  assert (dataset.y == y).all()
  assert dataset.get_number_shards() == 1
  dataset.reshard(shard_size=10)
  assert (dataset.X == X).all()
  assert (dataset.y == y).all()
  assert dataset.get_number_shards() == 10


def test_reshard_with_X_y_w():
  """Test resharding on a simple example"""
  X = np.random.rand(100, 10)
  y = np.random.rand(100,)
  w = np.ones_like(y)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w)
  assert dataset.get_number_shards() == 1
  dataset.reshard(shard_size=10)
  assert (dataset.X == X).all()
  # Flattening is necessary since from_numpy promotes 1-D y and w to (n, 1)
  assert (dataset.y.flatten() == y).all()
  assert (dataset.w.flatten() == w).all()
  assert dataset.get_number_shards() == 10


def test_reshard_with_X_y_w_ids():
  """Test resharding on a simple example"""
  X = np.random.rand(100, 10)
  y = np.random.rand(100,)
  w = np.ones_like(y)
  ids = np.arange(100)
  dataset = dc.data.DiskDataset.from_numpy(X, y, w, ids)
  assert dataset.get_number_shards() == 1
  dataset.reshard(shard_size=10)
  assert (dataset.X == X).all()
  # Flattening is necessary since from_numpy promotes 1-D y and w to (n, 1)
  assert (dataset.y.flatten() == y).all()
  assert (dataset.w.flatten() == w).all()
  assert (dataset.ids == ids).all()
  assert dataset.get_number_shards() == 10
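
The repeated y.flatten() comparisons above reflect that from_numpy stores a 1-D y as a 2-D column. A minimal illustration, assuming that promotion behavior (which the comments in this file describe):

import numpy as np
import deepchem as dc

y = np.random.rand(100)
dataset = dc.data.DiskDataset.from_numpy(np.random.rand(100, 10), y)
# 1-D y comes back as a column vector, hence the flatten() in the asserts.
print(dataset.y.shape)  # expected: (100, 1)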