Skip to content

Commit

Permalink
Changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Bharath Ramsundar authored and Bharath Ramsundar committed Jul 31, 2020
1 parent de59c02 commit cb8bb5d
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 63 deletions.
11 changes: 5 additions & 6 deletions deepchem/data/datasets.py
Expand Up @@ -1051,7 +1051,6 @@ def __init__(self, data_dir: str) -> None:
Location on disk of an existing `DiskDataset`.
"""
self.data_dir = data_dir
self.legacy_metadata = legacy_metadata

logger.info("Loading dataset from disk.")
self.tasks, self.metadata_df = self.load_metadata()
Expand Down Expand Up @@ -1114,7 +1113,7 @@ def create_dataset(shard_generator: Iterable[Batch],
DiskDataset._save_metadata(tasks, metadata_df, data_dir)
time2 = time.time()
logger.info("TIMING: dataset construction took %0.3f s" % (time2 - time1))
return DiskDataset(data_dir, legacy_metadata)
return DiskDataset(data_dir)

def load_metadata(self) -> Tuple[List[str], pd.DataFrame]:
"""Helper method that loads metadata from disk."""
Expand Down Expand Up @@ -2193,23 +2192,23 @@ def get_shape(self) -> Tuple[Shape, Shape, Shape, Shape]:
for shard_num in range(n_rows):
row = self.metadata_df.iloc[shard_num]
if row['X_shape'] is not None:
shard_X_shape = make_tuple(row['X_shape'])
shard_X_shape = make_tuple(str(row['X_shape']))
else:
shard_X_shape = tuple()
if n_tasks > 0:
if row['y_shape'] is not None:
shard_y_shape = make_tuple(row['y_shape'])
shard_y_shape = make_tuple(str(row['y_shape']))
else:
shard_y_shape = tuple()
if row['w_shape'] is not None:
shard_w_shape = make_tuple(row['w_shape'])
shard_w_shape = make_tuple(str(row['w_shape']))
else:
shard_w_shape = tuple()
else:
shard_y_shape = tuple()
shard_w_shape = tuple()
if row['ids_shape'] is not None:
shard_ids_shape = make_tuple(row['ids_shape'])
shard_ids_shape = make_tuple(str(row['ids_shape']))
else:
shard_ids_shape = tuple()
if shard_num == 0:
Expand Down
116 changes: 59 additions & 57 deletions deepchem/models/tests/test_generalize.py
Expand Up @@ -122,63 +122,65 @@ def model_builder(model_dir):
assert score > .5


#def test_sklearn_classification():
# """Test that sklearn models can learn on simple classification datasets."""
# np.random.seed(123)
# dataset = sklearn.datasets.load_digits(n_class=2)
# X, y = dataset.data, dataset.target

# frac_train = .7
# n_samples = len(X)
# n_train = int(frac_train*n_samples)
# X_train, y_train = X[:n_train], y[:n_train]
# X_test, y_test = X[n_train:], y[n_train:]
# train_dataset = dc.data.NumpyDataset(X_train, y_train)
# test_dataset = dc.data.NumpyDataset(X_test, y_test)

# classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
# sklearn_model = LogisticRegression()
# model = dc.models.SklearnModel(sklearn_model)

# # Fit trained model
# model.fit(train_dataset)
# model.save()

# # Eval model on test
# scores = model.evaluate(test_dataset, [classification_metric])
# assert scores[classification_metric.name] > .5

#def test_sklearn_multitask_classification():
# """Test that sklearn models can learn on simple multitask classification."""
# np.random.seed(123)
# n_tasks = 4
# tasks = range(n_tasks)
# dataset = sklearn.datasets.load_digits(n_class=2)
# X, y = dataset.data, dataset.target
# y = np.reshape(y, (len(y), 1))
# y = np.hstack([y] * n_tasks)
#
# frac_train = .7
# n_samples = len(X)
# n_train = int(frac_train*n_samples)
# X_train, y_train = X[:n_train], y[:n_train]
# X_test, y_test = X[n_train:], y[n_train:]
# train_dataset = dc.data.DiskDataset.from_numpy(X_train, y_train)
# test_dataset = dc.data.DiskDataset.from_numpy(X_test, y_test)

# classification_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
# def model_builder(model_dir):
# sklearn_model = LogisticRegression()
# return dc.models.SklearnModel(sklearn_model, model_dir)
# model = dc.models.SingletaskToMultitask(tasks, model_builder)

# # Fit trained model
# model.fit(train_dataset)
# model.save()
# # Eval model on test
# scores = model.evaluate(test_dataset, [classification_metric])
# for score in scores[classification_metric.name]:
# assert score > .5
def test_sklearn_classification():
    """Test that sklearn models can learn on simple classification datasets."""
    np.random.seed(123)
    digits = sklearn.datasets.load_digits(n_class=2)
    features, labels = digits.data, digits.target

    # 70/30 ordered train/test split.
    split_idx = int(.7 * len(features))
    train_dataset = dc.data.NumpyDataset(features[:split_idx], labels[:split_idx])
    test_dataset = dc.data.NumpyDataset(features[split_idx:], labels[split_idx:])

    metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
    model = dc.models.SklearnModel(LogisticRegression())

    # Train and persist the model.
    model.fit(train_dataset)
    model.save()

    # A trained model should beat random guessing on held-out data.
    scores = model.evaluate(test_dataset, [metric])
    assert scores[metric.name] > .5


def test_sklearn_multitask_classification():
    """Test that sklearn models can learn on simple multitask classification."""
    np.random.seed(123)
    n_tasks = 4
    tasks = range(n_tasks)
    digits = sklearn.datasets.load_digits(n_class=2)
    features = digits.data
    # Replicate the binary labels across every task column.
    labels = np.hstack([np.reshape(digits.target, (-1, 1))] * n_tasks)

    # 70/30 ordered train/test split.
    split_idx = int(.7 * len(features))
    train_dataset = dc.data.DiskDataset.from_numpy(features[:split_idx],
                                                   labels[:split_idx])
    test_dataset = dc.data.DiskDataset.from_numpy(features[split_idx:],
                                                  labels[split_idx:])

    metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

    def build_model(model_dir):
        # One independent logistic-regression model per task.
        return dc.models.SklearnModel(LogisticRegression(), model_dir)

    model = dc.models.SingletaskToMultitask(tasks, build_model)

    # Train, persist, and evaluate on the held-out split.
    model.fit(train_dataset)
    model.save()
    scores = model.evaluate(test_dataset, [metric])
    assert scores['roc_auc_score'] > 0.5


def test_xgboost_regression():
Expand Down

0 comments on commit cb8bb5d

Please sign in to comment.