deepchem · rbharath · Feb 2, 2024 · Jan 5, 2024 · Jan 22, 2024 · Jan 22, 2024
diff --git a/deepchem/molnet/__init__.py b/deepchem/molnet/__init__.py
@@ -1,5 +1,5 @@
 from deepchem.molnet.load_function.bace_datasets import load_bace_classification, load_bace_regression
-from deepchem.molnet.load_function.bbbc_datasets import load_bbbc001, load_bbbc002, load_bbbc003, load_bbbc004
+from deepchem.molnet.load_function.bbbc_datasets import load_bbbc001, load_bbbc002, load_bbbc003, load_bbbc004, load_bbbc005
 from deepchem.molnet.load_function.bbbp_datasets import load_bbbp
 from deepchem.molnet.load_function.cell_counting_datasets import load_cell_counting
 from deepchem.molnet.load_function.chembl_datasets import load_chembl

diff --git a/deepchem/molnet/load_function/bbbc_datasets.py b/deepchem/molnet/load_function/bbbc_datasets.py
@@ -8,6 +8,7 @@
 from deepchem.molnet.load_function.molnet_loader import TransformerGenerator, _MolnetLoader
 from deepchem.data import Dataset
 from typing import List, Optional, Tuple, Union
+import zipfile
 import numpy as np
 import pandas as pd
 
@@ -26,8 +27,13 @@
 
 BBBC4_TASKS = ["cell-count"]
 
+BBBC5_IMAGE_URL = 'https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_images.zip'
+BBBC5_FOREGROUND_URL = 'https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip'
+BBBC5_TASKS = ["cell-count"]
+
 
 class _BBBC001_Loader(_MolnetLoader):
+    """BBBC001 cell count dataset loader"""
 
     def create_dataset(self) -> Dataset:
         """Creates a dataset from BBBC001 images and cell counts as labels"""
@@ -91,6 +97,7 @@ def load_bbbc001(
 
 
 class _BBBC002_Loader(_MolnetLoader):
+    """BBBC002 cell count dataset loader"""
 
     def create_dataset(self) -> Dataset:
         """Creates a dataset from BBBC002 images and cell counts as labels"""
@@ -487,3 +494,99 @@ def load_bbbc004(
                                  **kwargs)
 
     return loader.load_dataset('bbbc004', reload)
+
+
+class _BBBC005_Loader(_MolnetLoader):
+    """BBBC005 cell count dataset loader"""
+
+    def create_dataset(self):
+        dataset_file = os.path.join(self.data_dir, "BBBC005_v1_images.zip")
+        if not os.path.exists(dataset_file):
+            dc.utils.data_utils.download_url(url=BBBC5_IMAGE_URL,
+                                             dest_dir=self.data_dir)
+
+        labels = []
+
+        # Read the zip file
+        with zipfile.ZipFile(dataset_file, 'r') as zip_ref:
+            file_list = zip_ref.namelist()
+
+        file_list = file_list[1:]
+
+        # Get the labels from filenames
+        for filename in file_list:
+            if filename.split('/')[-1].split('.')[-1] == 'TIF':
+                labels.append(int(filename.split('/')[-1].split('_')[2][1:]))
+
+        lbx = np.array(labels, dtype=np.int32)
+
+        loader = dc.data.ImageLoader(sorting=False)
+        return loader.create_dataset(inputs=(dataset_file, lbx),
+                                     in_memory=False)
+
+
+def load_bbbc005(
+    splitter: Union[dc.splits.Splitter, str, None] = 'index',
+    transformers: List[Union[TransformerGenerator, str]] = [],
+    reload: bool = True,
+    data_dir: Optional[str] = None,
+    save_dir: Optional[str] = None,
+    **kwargs
+) -> Tuple[List[str], Tuple[Dataset, ...], List[dc.trans.Transformer]]:
+    """Load BBBC005 dataset
+
+    This dataset contains data corresponding to 19,200 samples of synthetically generated
+    fluorescent cell population images. These images were simulated for a given cell count
+    with a clustering probability of 25% and a CCD noise variance of 0.0001. Focus blur
+    was simulated by applying varying Gaussian filters to the images. Each image is of
+    size 520x696. Ground truth labels contain cell counts for this dataset. Full details
+    about this dataset are present at
+    https://data.broadinstitute.org/bbbc/BBBC005/.
+
+    Parameters
+    ----------
+    splitter: Splitter or str
+        the splitter to use for splitting the data into training, validation, and
+        test sets.  Alternatively you can pass one of the names from
+        dc.molnet.splitters as a shortcut.  If this is None, all the data
+        will be included in a single dataset.
+    transformers: list of TransformerGenerators or strings
+        the Transformers to apply to the data.  Each one is specified by a
+        TransformerGenerator or, as a shortcut, one of the names from
+        dc.molnet.transformers.
+    reload: bool
+        if True, the first call for a particular featurizer and splitter will cache
+        the datasets to disk, and subsequent calls will reload the cached datasets.
+    data_dir: str
+        a directory to save the raw data in
+    save_dir: str
+        a directory to save the dataset in
+
+    Examples
+    --------
+    Importing necessary modules
+
+    >> import deepchem as dc
+    >> import numpy as np
+
+    We will now load the BBBC005 dataset with cell counts as labels.
+
+    >> loader = dc.molnet.load_bbbc005()
+    >> tasks, dataset, transformers = loader
+    >> train, val, test = dataset
+
+    We now have a dataset with a total of 19,200 samples with cell counts in
+    the range of 1-100. The images are of size 520x696. The labels are cell
+    counts. We have a train-val-test split of 80:10:10. We can verify this as follows:
+
+    >> train.X.shape
+    (15360, 520, 696)
+    >> train.y.shape
+    (15360,)
+    """
+    featurizer = dc.feat.UserDefinedFeaturizer([])  # Not actually used
+    loader: _MolnetLoader
+    loader = _BBBC005_Loader(featurizer, splitter, transformers, BBBC5_TASKS,
+                             data_dir, save_dir, **kwargs)
+
+    return loader.load_dataset('bbbc005', reload)
diff --git a/deepchem/molnet/load_function/tests/tests_bbbc.py b/deepchem/molnet/load_function/tests/tests_bbbc.py
@@ -0,0 +1,92 @@
+"""
+Tests for BBBC Loaders.
+"""
+
+import unittest
+import deepchem as dc
+
+
+class TestBBBCLoader(unittest.TestCase):
+    """
+    Test BBBC Loaders
+    """
+
+    def test_bbbc001(self):
+        """
+        Test loading BBBC001
+        """
+        loader = dc.molnet.load_bbbc001()
+        tasks, dataset, transformers = loader
+        train, val, test = dataset
+        assert train.X.shape == (4, 512, 512)
+        assert train.y.shape == (4,)
+        assert train.w.shape == (4,)
+        assert train.ids.shape == (4,)
+        assert val.X.shape == (1, 512, 512)
+        assert val.y.shape == (1,)
+        assert val.w.shape == (1,)
+        assert val.ids.shape == (1,)
+        assert test.X.shape == (1, 512, 512)
+        assert test.y.shape == (1,)
+        assert test.w.shape == (1,)
+        assert test.ids.shape == (1,)
+
+    def test_bbbc002(self):
+        """
+        Test loading BBBC002
+        """
+        loader = dc.molnet.load_bbbc002()
+        tasks, dataset, transformers = loader
+        train, val, test = dataset
+        assert train.X.shape == (40, 512, 512)
+        assert train.y.shape == (40,)
+        assert train.w.shape == (40,)
+        assert train.ids.shape == (40,)
+        assert val.X.shape == (5, 512, 512)
+        assert val.y.shape == (5,)
+        assert val.w.shape == (5,)
+        assert val.ids.shape == (5,)
+        assert test.X.shape == (5, 512, 512)
+        assert test.y.shape == (5,)
+        assert test.w.shape == (5,)
+        assert test.ids.shape == (5,)
+
+    def test_bbbc004_segmentation(self):
+        """
+        Test loading BBBC004 Segmentation Masks as labels
+        """
+        loader = dc.molnet.load_bbbc004(load_segmentation_masks=True)
+        tasks, dataset, transformers = loader
+        train, val, test = dataset
+        assert train.X.shape == (16, 950, 950)
+        assert train.y.shape == (16, 950, 950, 3)
+        assert train.w.shape == (16, 1)
+        assert train.ids.shape == (16,)
+        assert val.X.shape == (2, 950, 950)
+        assert val.y.shape == (2, 950, 950, 3)
+        assert val.w.shape == (2, 1)
+        assert val.ids.shape == (2,)
+        assert test.X.shape == (2, 950, 950)
+        assert test.y.shape == (2, 950, 950, 3)
+        assert test.w.shape == (2, 1)
+        assert test.ids.shape == (2,)
+
+    def test_bbbc004_counts(self):
+        """
+        Test loading BBBC004 Cell Counts as labels
+        """
+        loader = dc.molnet.load_bbbc004()
+        tasks, dataset, transformers = loader
+        train, val, test = dataset
+        assert train.X.shape == (16, 950, 950)
+        assert train.y.shape == (16,)
+        assert train.w.shape == (16,)
+        assert train.ids.shape == (16,)
+        assert val.X.shape == (2, 950, 950)
+        assert val.y.shape == (2,)
+        assert val.w.shape == (2,)
+        assert val.ids.shape == (2,)
+        assert test.X.shape == (2, 950, 950)
+        assert test.y.shape == (2,)
+        assert test.w.shape == (2,)
+        assert test.ids.shape == (2,)
diff --git a/docs/source/api_reference/moleculenet.rst b/docs/source/api_reference/moleculenet.rst
@@ -70,6 +70,8 @@ BBBC Datasets
 
 .. autofunction:: deepchem.molnet.load_bbbc004
 
+.. autofunction:: deepchem.molnet.load_bbbc005
+
 BBBP Datasets
 -------------
 BBBP stands for Blood-Brain-Barrier Penetration

diff --git a/docs/source/api_reference/moleculenet_datasets_description.csv b/docs/source/api_reference/moleculenet_datasets_description.csv
@@ -5,6 +5,7 @@ BBBC (BBBC001),Images of HT29 colon cancer cells,Images,6,`ref <https://data.bro
 BBBC (BBBC002),Images of Drosophilia Kc167 cells,Images,50,`ref <https://data.broadinstitute.org/bbbc/BBBC002/>`_
 BBBC (BBBC003),DIC Images of Mouse Embryos,Images,15,`ref <https://data.broadinstitute.org/bbbc/BBBC003/>`_
 BBBC (BBBC004),Synthetic Images of clustered nuclei,Images,20,`ref <https://data.broadinstitute.org/bbbc/BBBC004/>`_
+BBBC (BBBC005),Synthetic Images of clustered nuclei,Images,19200,`ref <https://data.broadinstitute.org/bbbc/BBBC005/>`_
 BBBP,Blood-Brain Barrier Penetration designed for the modeling and prediction of barrier permeability,Binary labels on permeability properties,2000,`ref <https://pubs.rsc.org/en/content/articlehtml/2018/sc/c7sc02664a>`_
 Cell Counting,Synthetic emulations of fluorescence microscopic images of bacterial cells,Images,200,`ref <http://www.robots.ox.ac.uk/~vgg/research/counting/index_org.html.>`_
 ChEMBL (set = ‘sparse’),A sparse subset of ChEMBL with activity data for one target,Molecules,244 245,`ref <https://www.ebi.ac.uk/chembl/.>`_