constantinpape · constantinpape · Jun 24, 2024 · Jun 21, 2024
diff --git a/scripts/datasets/medical/check_piccolo.py b/scripts/datasets/medical/check_piccolo.py
@@ -0,0 +1,20 @@
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets.medical import get_piccolo_loader
+
+
+# Directory that is passed as `path` to the PICCOLO loader below.
+# NOTE: this is a machine-specific absolute path; adjust it to your local setup.
+ROOT = "/media/anwai/ANWAI/data/piccolo"
+
+
+def check_piccolo():
+    loader = get_piccolo_loader(
+        path=ROOT,
+        patch_shape=(512, 512),
+        batch_size=2,
+        split="train",
+        resize_inputs=True,
+    )
+    check_loader(loader, 8)
+
+
+# Run the loader check only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    check_piccolo()
diff --git a/torch_em/data/datasets/medical/__init__.py b/torch_em/data/datasets/medical/__init__.py
@@ -12,6 +12,7 @@
 from .oimhs import get_oimhs_dataset, get_oimhs_loader
 from .osic_pulmofib import get_osic_pulmofib_dataset, get_osic_pulmofib_loader
 from .papila import get_papila_dataset, get_papila_loader
+from .piccolo import get_piccolo_dataset, get_piccolo_loader
 from .plethora import get_plethora_dataset, get_plethora_loader
 from .sa_med2d import get_sa_med2d_dataset, get_sa_med2d_loader
 from .sega import get_sega_dataset, get_sega_loader

diff --git a/torch_em/data/datasets/medical/piccolo.py b/torch_em/data/datasets/medical/piccolo.py
@@ -0,0 +1,105 @@
+import os
+from glob import glob
+from natsort import natsorted
+from typing import Union, Tuple, Literal
+
+import torch_em
+
+from .. import util
+
+
+def get_piccolo_data(path, download):
+    """The database is located at:
+    - https://www.biobancovasco.bioef.eus/en/Sample-and-data-e-catalog/Databases/PD178-PICCOLO-EN1.html
+
+    Follow the instructions below to get access to the dataset.
+    - Visit the attached website above
+    - Fill up the access request form: https://labur.eus/EzJUN
+    - Send an email to Basque Biobank at solicitudes.biobancovasco@bioef.eus, requesting access to the dataset.
+    - The team will request you to follow-up with some formalities.
+    - Then, you will gain access to the ".rar" file.
+    - Finally, provide the path where the rar file is stored, and you should be good to go.
+    """
+    if download:
+        raise NotImplementedError(
+            "Automatic download is not possible for this dataset. See 'get_piccolo_data' for details."
+        )
+
+    data_dir = os.path.join(path, r"piccolo dataset-release0.1")
+    if os.path.exists(data_dir):
+        return data_dir
+
+    rar_file = os.path.join(path, r"piccolo dataset_widefield-release0.1.rar")
+    if not os.path.exists(rar_file):
+        raise FileNotFoundError(
+            "You must download the PICCOLO dataset from the Basque Biobank, see 'get_piccolo_data' for details."
+        )
+
+    util.unzip_rarfile(rar_path=rar_file, dst=path, remove=False)
+    return data_dir
+
+
+def _get_piccolo_paths(path, split, download):
+    data_dir = get_piccolo_data(path=path, download=download)
+
+    split_dir = os.path.join(data_dir, split)
+
+    image_paths = natsorted(glob(os.path.join(split_dir, "polyps", "*")))
+    gt_paths = natsorted(glob(os.path.join(split_dir, "masks", "*")))
+
+    return image_paths, gt_paths
+
+
+def get_piccolo_dataset(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    split: Literal["train", "validation", "test"],
+    resize_inputs: bool = False,
+    download: bool = False,
+    **kwargs
+):
+    """Dataset for polyp segmentation in narrow band imaging colonoscopy images.
+
+    This dataset is from Sánchez-Peralta et al. - https://doi.org/10.3390/app10238501.
+    To access the dataset, see `get_piccolo_data` for details.
+
+    Please cite it if you use this data in a publication.
+    """
+    image_paths, gt_paths = _get_piccolo_paths(path=path, split=split, download=download)
+
+    if resize_inputs:
+        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
+        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
+            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
+        )
+
+    dataset = torch_em.default_segmentation_dataset(
+        raw_paths=image_paths,
+        raw_key=None,
+        label_paths=gt_paths,
+        label_key=None,
+        patch_shape=patch_shape,
+        is_seg_dataset=False,
+        **kwargs
+    )
+    return dataset
+
+
+def get_piccolo_loader(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    batch_size: int,
+    split: Literal["train", "validation", "test"],
+    resize_inputs: bool = False,
+    download: bool = False,
+    **kwargs
+):
+    """Dataloader for polyp segmentation in narrow band imaging colonoscopy images.
+    See `get_piccolo_dataset` for details.
+    """
+    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
+    dataset = get_piccolo_dataset(
+        path=path, patch_shape=patch_shape, split=split, resize_inputs=resize_inputs, download=download, **ds_kwargs
+    )
+    loader = torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
+    return loader
diff --git a/torch_em/data/datasets/util.py b/torch_em/data/datasets/util.py
@@ -221,6 +221,15 @@ def unzip_tarfile(tar_path, dst, remove=True):
         os.remove(tar_path)
 
 
+def unzip_rarfile(rar_path, dst, remove=True):
+    import rarfile
+    with rarfile.RarFile(rar_path) as f:
+        f.extractall(path=dst)
+
+    if remove:
+        os.remove(rar_path)
+
+
 def unzip(zip_path, dst, remove=True):
     with zipfile.ZipFile(zip_path, "r") as f:
         f.extractall(dst)