Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BBBC5 Cell Counting Loader Addition #3798

Merged
merged 10 commits into from
Feb 2, 2024
2 changes: 1 addition & 1 deletion deepchem/molnet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
from deepchem.molnet.load_function.bbbc_datasets import load_bbbc001, load_bbbc002, load_bbbc003, load_bbbc004
from deepchem.molnet.load_function.bbbc_datasets import load_bbbc001, load_bbbc002, load_bbbc003, load_bbbc004, load_bbbc005
from deepchem.molnet.load_function.bbbp_datasets import load_bbbp
from deepchem.molnet.load_function.cell_counting_datasets import load_cell_counting
from deepchem.molnet.load_function.chembl_datasets import load_chembl
Expand Down
103 changes: 103 additions & 0 deletions deepchem/molnet/load_function/bbbc_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
from deepchem.data import Dataset
from typing import List, Optional, Tuple, Union
import zipfile
import numpy as np
import pandas as pd

Expand All @@ -26,8 +27,13 @@

BBBC4_TASKS = ["cell-count"]

# BBBC005 resources. The BBBC005 loader visible in this change only references
# BBBC5_IMAGE_URL and BBBC5_TASKS.
# NOTE(review): BBBC5_FOREGROUND_URL is presumably the ground-truth archive
# (going by its file name); it is not used by the visible loader -- confirm
# whether it is needed.
BBBC5_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_images.zip'
BBBC5_FOREGROUND_URL = 'https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip'
BBBC5_TASKS = ["cell-count"]


class _BBBC001_Loader(_MolnetLoader):
"""BBBC001 cell count dataset loader"""

def create_dataset(self) -> Dataset:
"""Creates a dataset from BBBC001 images and cell counts as labels"""
Expand Down Expand Up @@ -91,6 +97,7 @@ def load_bbbc001(


class _BBBC002_Loader(_MolnetLoader):
"""BBBC002 cell count dataset loader"""

def create_dataset(self) -> Dataset:
"""Creates a dataset from BBBC002 images and cell counts as labels"""
Expand Down Expand Up @@ -487,3 +494,99 @@ def load_bbbc004(
**kwargs)

return loader.load_dataset('bbbc004', reload)


class _BBBC005_Loader(_MolnetLoader):
    """BBBC005 cell count dataset loader"""

    def create_dataset(self) -> Dataset:
        """Creates a dataset from BBBC005 images and cell counts as labels

        The image archive is downloaded into ``self.data_dir`` if it is not
        already present. No separate label file is read: the label of each
        image is parsed from its file name inside the archive.

        Returns
        -------
        Dataset
            The dataset produced by ``dc.data.ImageLoader.create_dataset`` from
            the zip archive and the array of parsed cell counts.
        """
        dataset_file = os.path.join(self.data_dir, "BBBC005_v1_images.zip")
        if not os.path.exists(dataset_file):
            dc.utils.data_utils.download_url(url=BBBC5_IMAGE_URL,
                                             dest_dir=self.data_dir)

        # Only the archive listing is needed here; the images themselves are
        # read later by the ImageLoader.
        with zipfile.ZipFile(dataset_file, 'r') as zip_ref:
            file_list = zip_ref.namelist()

        # Select image entries by extension instead of discarding the first
        # entry by position: a directory entry (name ending in '/') has an
        # empty basename and is filtered out here anyway, while a positional
        # slice would silently drop a label if the archive ever started with
        # an image.
        basenames = (name.split('/')[-1] for name in file_list)

        # The cell count is the third '_'-separated field of the basename with
        # its leading character stripped.
        # NOTE(review): assumes that field has the form '<letter><count>' --
        # confirm against the archive's naming scheme.
        labels = [
            int(basename.split('_')[2][1:])
            for basename in basenames
            if basename.split('.')[-1] == 'TIF'
        ]
        lbx = np.array(labels, dtype=np.int32)

        # NOTE(review): sorting is disabled presumably so that images stay in
        # the archive order in which the labels were collected -- confirm
        # against ImageLoader's handling of zip inputs.
        loader = dc.data.ImageLoader(sorting=False)
        return loader.create_dataset(inputs=(dataset_file, lbx),
                                     in_memory=False)


def load_bbbc005(
    splitter: Union[dc.splits.Splitter, str, None] = 'index',
    transformers: List[Union[TransformerGenerator, str]] = [],
    reload: bool = True,
    data_dir: Optional[str] = None,
    save_dir: Optional[str] = None,
    **kwargs
) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
    """Load BBBC005 dataset

    BBBC005 consists of 19,200 synthetically generated images of fluorescent
    cell populations. Every image was simulated for a given cell count, with a
    clustering probability of 25% and a CCD noise variance of 0.0001. Focus
    blur was simulated by applying varying Gaussian filters to the images. Each
    image is of size 520x696 and the ground truth labels are the cell counts.
    Full details about this dataset are present at
    https://data.broadinstitute.org/bbbc/BBBC005/.

    Parameters
    ----------
    splitter: Splitter or str
        the splitter to use for splitting the data into training, validation,
        and test sets. Alternatively you can pass one of the names from
        dc.molnet.splitters as a shortcut. If this is None, all the data
        will be included in a single dataset.
    transformers: list of TransformerGenerators or strings
        the Transformers to apply to the data. Each one is specified by a
        TransformerGenerator or, as a shortcut, one of the names from
        dc.molnet.transformers.
    reload: bool
        if True, the first call for a particular featurizer and splitter will
        cache the datasets to disk, and subsequent calls will reload the cached
        datasets.
    data_dir: str
        a directory to save the raw data in
    save_dir: str
        a directory to save the dataset in

    Returns
    -------
    tuple
        ``(tasks, datasets, transformers)`` as returned by the loader's
        ``load_dataset`` call.

    Examples
    --------
    Importing necessary modules

    >> import deepchem as dc
    >> import numpy as np

    We will now load the BBBC005 dataset with cell counts as labels.

    >> loader = dc.molnet.load_bbbc005()
    >> tasks, dataset, transformers = loader
    >> train, val, test = dataset

    We now have a dataset with a total of 19,200 samples with cell counts in
    the range of 1-100. The images are of size 520x696. The labels are cell
    counts. We have a train-val-test split of 80:10:10. We can verify this as
    follows:

    >> train.X.shape
    (15360, 520, 696)
    >> train.y.shape
    (15360,)
    """
    # The loader constructor takes a featurizer, but it is not actually used
    # for this dataset, so an empty user-defined one serves as a placeholder.
    placeholder_featurizer = dc.feat.UserDefinedFeaturizer([])
    bbbc005_loader: _MolnetLoader = _BBBC005_Loader(placeholder_featurizer,
                                                    splitter, transformers,
                                                    BBBC5_TASKS, data_dir,
                                                    save_dir, **kwargs)
    return bbbc005_loader.load_dataset('bbbc005', reload)
92 changes: 92 additions & 0 deletions deepchem/molnet/load_function/tests/tests_bbbc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
"""
Tests for BBBC Loaders.
"""

import unittest
import deepchem as dc


class TestBBBCLoader(unittest.TestCase):
    """
    Shape checks for the train/valid/test splits returned by the BBBC loaders.
    """

    @staticmethod
    def _check_split(split, x_shape, y_shape, w_shape, ids_shape):
        """
        Assert the shapes of X, y, w and ids of one split, in that order.
        """
        assert split.X.shape == x_shape
        assert split.y.shape == y_shape
        assert split.w.shape == w_shape
        assert split.ids.shape == ids_shape

    def test_bbbc001(self):
        """
        BBBC001 with default arguments: 512x512 images split 4/1/1.
        """
        _, (train, val, test), _ = dc.molnet.load_bbbc001()
        self._check_split(train, (4, 512, 512), (4,), (4,), (4,))
        self._check_split(val, (1, 512, 512), (1,), (1,), (1,))
        self._check_split(test, (1, 512, 512), (1,), (1,), (1,))

    def test_bbbc002(self):
        """
        BBBC002 with default arguments: 512x512 images split 40/5/5.
        """
        _, (train, val, test), _ = dc.molnet.load_bbbc002()
        self._check_split(train, (40, 512, 512), (40,), (40,), (40,))
        self._check_split(val, (5, 512, 512), (5,), (5,), (5,))
        self._check_split(test, (5, 512, 512), (5,), (5,), (5,))

    def test_bbbc004_segmentation(self):
        """
        BBBC004 with segmentation masks as labels: 950x950 images split
        16/2/2, labels of shape 950x950x3 and weights of shape (n, 1).
        """
        _, (train, val, test), _ = dc.molnet.load_bbbc004(
            load_segmentation_masks=True)
        self._check_split(train, (16, 950, 950), (16, 950, 950, 3), (16, 1),
                          (16,))
        self._check_split(val, (2, 950, 950), (2, 950, 950, 3), (2, 1), (2,))
        self._check_split(test, (2, 950, 950), (2, 950, 950, 3), (2, 1), (2,))

    def test_bbbc004_counts(self):
        """
        BBBC004 with cell counts as labels: 950x950 images split 16/2/2.
        """
        _, (train, val, test), _ = dc.molnet.load_bbbc004()
        self._check_split(train, (16, 950, 950), (16,), (16,), (16,))
        self._check_split(val, (2, 950, 950), (2,), (2,), (2,))
        self._check_split(test, (2, 950, 950), (2,), (2,), (2,))
2 changes: 2 additions & 0 deletions docs/source/api_reference/moleculenet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ BBBC Datasets

.. autofunction:: deepchem.molnet.load_bbbc004

.. autofunction:: deepchem.molnet.load_bbbc005

BBBP Datasets
-------------
BBBP stands for Blood-Brain-Barrier Penetration
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ BBBC (BBBC001),Images of HT29 colon cancer cells,Images,6,`ref <https://data.bro
BBBC (BBBC002),Images of Drosophila Kc167 cells,Images,50,`ref <https://data.broadinstitute.org/bbbc/BBBC002/>`_
BBBC (BBBC003),DIC Images of Mouse Embryos,Images,15,`ref <https://data.broadinstitute.org/bbbc/BBBC003/>`_
BBBC (BBBC004),Synthetic Images of clustered nuclei,Images,20,`ref <https://data.broadinstitute.org/bbbc/BBBC004/>`_
BBBC (BBBC005),Synthetic Images of fluorescent cell populations,Images,19200,`ref <https://data.broadinstitute.org/bbbc/BBBC005/>`_
BBBP,Blood-Brain Barrier Penetration designed for the modeling and prediction of barrier permeability,Binary labels on permeability properties,2000,`ref <https://pubs.rsc.org/en/content/articlehtml/2018/sc/c7sc02664a>`_
Cell Counting,Synthetic emulations of fluorescence microscopic images of bacterial cells,Images,200,`ref <http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html.>`_
ChEMBL (set = ‘sparse’),A sparse subset of ChEMBL with activity data for one target,Molecules,244 245,`ref <https://www.ebi.ac.uk/chembl/.>`_
Expand Down
Loading