Skip to content

Commit

Permalink
Merge pull request #1965 from deepchem/transformer_docs
Browse files Browse the repository at this point in the history
Assorted Fixes
  • Loading branch information
Bharath Ramsundar committed Jul 5, 2020
2 parents 9e8f9fa + 74fb271 commit 036edfe
Show file tree
Hide file tree
Showing 7 changed files with 279 additions and 246 deletions.
122 changes: 0 additions & 122 deletions deepchem/data/tests/__init__.py
@@ -1,122 +0,0 @@
"""
General API for testing dataset objects
"""
__author__ = "Bharath Ramsundar"
__copyright__ = "Copyright 2016, Stanford University"
__license__ = "MIT"

import unittest
import tempfile
import os
import shutil
import numpy as np
import deepchem as dc


def load_solubility_data():
  """Loads solubility dataset.

  Reads `../../models/tests/example.csv` (resolved relative to the directory
  containing this file), featurizing the `smiles` column with
  `dc.feat.CircularFingerprint(size=1024)` and using `log-solubility` as the
  single task.

  Returns
  -------
  The object returned by `dc.data.CSVLoader.featurize` for the example file.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_butina_data():
  """Loads the Butina example dataset.

  Reads `../../models/tests/butina_example.csv` (resolved relative to the
  directory containing this file), featurizing the `smiles` column with
  `dc.feat.CircularFingerprint(size=1024)` and using `task` as the single
  task.

  Returns
  -------
  The object returned by `dc.data.CSVLoader.featurize` for the example file.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["task"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/butina_example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.featurize(input_file)


def load_multitask_data():
  """Load example multitask data.

  Featurizes the `smiles` column of `multitask_example.csv` with
  `dc.feat.CircularFingerprint(size=1024)` for the seventeen tasks
  `task0` .. `task16`.
  """
  # task0, task1, ..., task16
  task_names = ["task%d" % index for index in range(17)]
  here = os.path.dirname(os.path.abspath(__file__))
  fingerprint = dc.feat.CircularFingerprint(size=1024)
  csv_path = os.path.join(here, "../../models/tests/multitask_example.csv")
  csv_loader = dc.data.CSVLoader(
      tasks=task_names, smiles_field="smiles", featurizer=fingerprint)
  return csv_loader.featurize(csv_path)


def load_classification_data():
  """Loads classification data from example_classification.csv.

  The file is looked up at `../../models/tests/example_classification.csv`
  relative to the directory containing this file. The `smiles` column is
  featurized with `dc.feat.CircularFingerprint(size=1024)` and `outcome` is
  the single task.

  Returns
  -------
  The object returned by `dc.data.CSVLoader.featurize` for the example file.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["outcome"]
  input_file = os.path.join(current_dir,
                            "../../models/tests/example_classification.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)
  return loader.featurize(input_file)


def load_sparse_multitask_dataset():
  """Load sparse tox multitask data, sample dataset.

  Featurizes the `smiles` column of `sparse_multitask_example.csv` with
  `dc.feat.CircularFingerprint(size=1024)` for the nine tasks
  `task1` .. `task9`.
  """
  # Task numbering in this file starts at 1: task1, ..., task9
  task_names = ["task%d" % index for index in range(1, 10)]
  here = os.path.dirname(os.path.abspath(__file__))
  fingerprint = dc.feat.CircularFingerprint(size=1024)
  csv_path = os.path.join(here,
                          "../../models/tests/sparse_multitask_example.csv")
  csv_loader = dc.data.CSVLoader(
      tasks=task_names, smiles_field="smiles", featurizer=fingerprint)
  return csv_loader.featurize(csv_path)


def load_feat_multitask_data():
  """Load example with numerical features, tasks.

  Uses the columns `feat0` .. `feat5` as user-defined features and
  `task0` .. `task5` as tasks from `feat_multitask_example.csv`, with `id` as
  the id field.
  """
  column_count = 6
  feature_names = ["feat%d" % index for index in range(column_count)]
  task_names = ["task%d" % index for index in range(column_count)]
  here = os.path.dirname(os.path.abspath(__file__))
  user_featurizer = dc.feat.UserDefinedFeaturizer(feature_names)
  csv_path = os.path.join(here,
                          "../../models/tests/feat_multitask_example.csv")
  csv_loader = dc.data.UserCSVLoader(
      tasks=task_names, featurizer=user_featurizer, id_field="id")
  return csv_loader.featurize(csv_path)


def load_gaussian_cdf_data():
  """Load example with numbers sampled from Gaussian normal distribution.

  Each feature and task is a column of values that is sampled
  from a normal distribution of mean 0, stdev 1.

  Uses the columns `feat0`, `feat1` as user-defined features and `task0`,
  `task1` as tasks from `gaussian_cdf_example.csv`, with `id` as the id
  field.
  """
  column_count = 2
  feature_names = ["feat%d" % index for index in range(column_count)]
  task_names = ["task%d" % index for index in range(column_count)]
  here = os.path.dirname(os.path.abspath(__file__))
  user_featurizer = dc.feat.UserDefinedFeaturizer(feature_names)
  csv_path = os.path.join(here,
                          "../../models/tests/gaussian_cdf_example.csv")
  csv_loader = dc.data.UserCSVLoader(
      tasks=task_names, featurizer=user_featurizer, id_field="id")
  return csv_loader.featurize(csv_path)


def load_unlabelled_data():
  """Load example data that has no task columns.

  Featurizes the `smiles` column of `no_labels.csv` with
  `dc.feat.CircularFingerprint(size=1024)`, passing an empty task list to the
  loader.
  """
  here = os.path.dirname(os.path.abspath(__file__))
  fingerprint = dc.feat.CircularFingerprint(size=1024)
  csv_path = os.path.join(here, "../../data/tests/no_labels.csv")
  # No labels in this file, so the loader is given an empty task list.
  csv_loader = dc.data.CSVLoader(
      tasks=[], smiles_field="smiles", featurizer=fingerprint)
  return csv_loader.featurize(csv_path)
61 changes: 45 additions & 16 deletions deepchem/data/tests/test_datasets.py
Expand Up @@ -24,6 +24,35 @@
PYTORCH_IMPORT_FAILED = True


def load_solubility_data():
  """Loads solubility dataset.

  Reads `../../models/tests/example.csv` (resolved relative to the directory
  containing this file), featurizing the `smiles` column with
  `dc.feat.CircularFingerprint(size=1024)` and using `log-solubility` as the
  single task.

  Returns
  -------
  The object returned by `dc.data.CSVLoader.create_dataset` for the example
  file.
  """
  current_dir = os.path.dirname(os.path.abspath(__file__))
  featurizer = dc.feat.CircularFingerprint(size=1024)
  tasks = ["log-solubility"]
  input_file = os.path.join(current_dir, "../../models/tests/example.csv")
  loader = dc.data.CSVLoader(
      tasks=tasks, smiles_field="smiles", featurizer=featurizer)

  return loader.create_dataset(input_file)


def load_multitask_data():
  """Load example multitask data.

  Featurizes the `smiles` column of `multitask_example.csv` with
  `dc.feat.CircularFingerprint(size=1024)` for the seventeen tasks
  `task0` .. `task16`.
  """
  # task0, task1, ..., task16
  task_names = ["task%d" % index for index in range(17)]
  here = os.path.dirname(os.path.abspath(__file__))
  fingerprint = dc.feat.CircularFingerprint(size=1024)
  csv_path = os.path.join(here, "../../models/tests/multitask_example.csv")
  csv_loader = dc.data.CSVLoader(
      tasks=task_names, smiles_field="smiles", featurizer=fingerprint)
  # NOTE(review): load_solubility_data in this file calls create_dataset();
  # confirm whether featurize() is still intended here.
  return csv_loader.featurize(csv_path)


class TestDatasets(test_util.TensorFlowTestCase):
"""
Test basic top-level API for dataset objects.
Expand Down Expand Up @@ -172,10 +201,10 @@ def test_pad_batches(self):

def test_get_task_names(self):
"""Test that get_task_names returns correct task_names"""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
assert solubility_dataset.get_task_names() == ["log-solubility"]

multitask_dataset = dc.data.tests.load_multitask_data()
multitask_dataset = load_multitask_data()
assert sorted(multitask_dataset.get_task_names()) == sorted([
"task0", "task1", "task2", "task3", "task4", "task5", "task6", "task7",
"task8", "task9", "task10", "task11", "task12", "task13", "task14",
Expand All @@ -184,20 +213,20 @@ def test_get_task_names(self):

def test_get_data_shape(self):
"""Test that get_data_shape returns currect data shape"""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
assert solubility_dataset.get_data_shape() == (1024,)

multitask_dataset = dc.data.tests.load_multitask_data()
multitask_dataset = load_multitask_data()
assert multitask_dataset.get_data_shape() == (1024,)

def test_len(self):
"""Test that len(dataset) works."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
assert len(solubility_dataset) == 10

def test_reshard(self):
"""Test that resharding the dataset works."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
solubility_dataset.w, solubility_dataset.ids)
assert solubility_dataset.get_number_shards() == 1
Expand Down Expand Up @@ -302,7 +331,7 @@ def test_get_shape(self):

def test_iterbatches(self):
"""Test that iterating over batches of data works."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
batch_size = 2
data_shape = solubility_dataset.get_data_shape()
tasks = solubility_dataset.get_task_names()
Expand Down Expand Up @@ -331,7 +360,7 @@ def test_itersamples_numpy(self):

def test_itersamples_disk(self):
"""Test that iterating over samples in a DiskDataset works."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
X = solubility_dataset.X
y = solubility_dataset.y
w = solubility_dataset.w
Expand Down Expand Up @@ -372,7 +401,7 @@ def fn(x, y, w):

def test_transform_disk(self):
"""Test that the transform() method works for DiskDatasets."""
dataset = dc.data.tests.load_solubility_data()
dataset = load_solubility_data()
X = dataset.X
y = dataset.y
w = dataset.w
Expand All @@ -394,7 +423,7 @@ def fn(x, y, w):

def test_to_numpy(self):
"""Test that transformation to numpy arrays is sensible."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
data_shape = solubility_dataset.get_data_shape()
tasks = solubility_dataset.get_task_names()
X, y, w, ids = (solubility_dataset.X, solubility_dataset.y,
Expand All @@ -409,7 +438,7 @@ def test_to_numpy(self):

def test_consistent_ordering(self):
"""Test that ordering of labels is consistent over time."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()

ids1 = solubility_dataset.ids
ids2 = solubility_dataset.ids
Expand All @@ -418,7 +447,7 @@ def test_consistent_ordering(self):

def test_get_statistics(self):
"""Test statistics computation of this dataset."""
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
solubility_dataset.w, solubility_dataset.ids)
X_means, y_means = np.mean(X, axis=0), np.mean(y, axis=0)
Expand All @@ -431,7 +460,7 @@ def test_get_statistics(self):
np.testing.assert_allclose(comp_y_stds, y_stds)

def test_disk_iterate_batch_size(self):
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
solubility_dataset.w, solubility_dataset.ids)
batch_sizes = []
Expand Down Expand Up @@ -656,7 +685,7 @@ def shard_generator():
np.sort(all_ids, axis=0), np.sort(test_ids, axis=0))

def test_numpy_iterate_batch_size(self):
solubility_dataset = dc.data.tests.load_solubility_data()
solubility_dataset = load_solubility_data()
X, y, _, _ = (solubility_dataset.X, solubility_dataset.y,
solubility_dataset.w, solubility_dataset.ids)
solubility_dataset = dc.data.NumpyDataset.from_DiskDataset(
Expand Down Expand Up @@ -769,12 +798,12 @@ def test_make_pytorch_dataset_from_images(self):
@unittest.skipIf(PYTORCH_IMPORT_FAILED, 'PyTorch is not installed')
def test_make_pytorch_dataset_from_disk(self):
"""Test creating a PyTorch Dataset from a DiskDataset."""
dataset = dc.data.tests.load_solubility_data()
dataset = load_solubility_data()
self._validate_pytorch_dataset(dataset)

def test_dataframe(self):
"""Test converting between Datasets and DataFrames."""
dataset = dc.data.tests.load_solubility_data()
dataset = load_solubility_data()

# A round trip from Dataset to DataFrame to Dataset should produce identical arrays.

Expand Down

0 comments on commit 036edfe

Please sign in to comment.