diff --git a/scripts/datasets/medical/check_piccolo.py b/scripts/datasets/medical/check_piccolo.py
new file mode 100644
index 00000000..7d43313f
--- /dev/null
+++ b/scripts/datasets/medical/check_piccolo.py
@@ -0,0 +1,21 @@
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets.medical import get_piccolo_loader
+
+
+ROOT = "/media/anwai/ANWAI/data/piccolo"
+
+
+def check_piccolo():
+    """Run `check_loader` on a loader for the PICCOLO train split."""
+    loader = get_piccolo_loader(
+        path=ROOT,
+        patch_shape=(512, 512),
+        batch_size=2,
+        split="train",
+        resize_inputs=True,
+    )
+    check_loader(loader, 8)
+
+
+if __name__ == "__main__":
+    check_piccolo()
diff --git a/torch_em/data/datasets/medical/__init__.py b/torch_em/data/datasets/medical/__init__.py
index 84c77d76..9e3528c0 100644
--- a/torch_em/data/datasets/medical/__init__.py
+++ b/torch_em/data/datasets/medical/__init__.py
@@ -12,6 +12,7 @@
 from .oimhs import get_oimhs_dataset, get_oimhs_loader
 from .osic_pulmofib import get_osic_pulmofib_dataset, get_osic_pulmofib_loader
 from .papila import get_papila_dataset, get_papila_loader
+from .piccolo import get_piccolo_dataset, get_piccolo_loader
 from .plethora import get_plethora_dataset, get_plethora_loader
 from .sa_med2d import get_sa_med2d_dataset, get_sa_med2d_loader
 from .sega import get_sega_dataset, get_sega_loader
diff --git a/torch_em/data/datasets/medical/piccolo.py b/torch_em/data/datasets/medical/piccolo.py
new file mode 100644
index 00000000..0b8738f5
--- /dev/null
+++ b/torch_em/data/datasets/medical/piccolo.py
@@ -0,0 +1,149 @@
+import os
+from glob import glob
+from natsort import natsorted
+from typing import Union, Tuple, Literal
+
+import torch_em
+
+from .. import util
+
+
+def get_piccolo_data(path, download):
+    """Return the PICCOLO data folder in `path`, extracting the manually downloaded archive if needed.
+
+    The database is located at:
+    - https://www.biobancovasco.bioef.eus/en/Sample-and-data-e-catalog/Databases/PD178-PICCOLO-EN1.html
+
+    Follow the instructions below to get access to the dataset.
+    - Visit the attached website above
+    - Fill out the access request form: https://labur.eus/EzJUN
+    - Send an email to Basque Biobank at solicitudes.biobancovasco@bioef.eus, requesting access to the dataset.
+    - The team will request you to follow-up with some formalities.
+    - Then, you will gain access to the ".rar" file.
+    - Finally, provide the path where the rar file is stored, and you should be good to go.
+
+    Args:
+        path: Folder containing the rar file or the already extracted data.
+        download: Must be False; automatic download is not supported.
+
+    Returns:
+        The path of the extracted dataset folder.
+
+    Raises:
+        NotImplementedError: If `download` is True.
+        FileNotFoundError: If neither the extracted folder nor the rar file exists in `path`.
+    """
+    if download:
+        raise NotImplementedError(
+            "Automatic download is not possible for this dataset. See 'get_piccolo_data' for details."
+        )
+
+    data_dir = os.path.join(path, r"piccolo dataset-release0.1")
+    if os.path.exists(data_dir):
+        return data_dir
+
+    rar_file = os.path.join(path, r"piccolo dataset_widefield-release0.1.rar")
+    if not os.path.exists(rar_file):
+        raise FileNotFoundError(
+            "You must download the PICCOLO dataset from the Basque Biobank, see 'get_piccolo_data' for details."
+        )
+
+    # remove=False keeps the manually obtained archive on disk after extraction.
+    util.unzip_rarfile(rar_path=rar_file, dst=path, remove=False)
+    return data_dir
+
+
+def _get_piccolo_paths(path, split, download):
+    """Collect the naturally sorted image and mask paths of one split.
+
+    Raises:
+        ValueError: If `split` is not one of 'train', 'validation' or 'test'.
+        RuntimeError: If no images are found or the number of images and masks differs.
+    """
+    if split not in ("train", "validation", "test"):
+        raise ValueError(f"'{split}' is not a valid split. Choose from 'train', 'validation' or 'test'.")
+
+    data_dir = get_piccolo_data(path=path, download=download)
+
+    split_dir = os.path.join(data_dir, split)
+
+    image_paths = natsorted(glob(os.path.join(split_dir, "polyps", "*")))
+    gt_paths = natsorted(glob(os.path.join(split_dir, "masks", "*")))
+
+    # Fail early with a clear message if the split folder is missing or incomplete.
+    if len(image_paths) == 0 or len(image_paths) != len(gt_paths):
+        raise RuntimeError(
+            f"Expected matching, non-empty image and mask folders in '{split_dir}', "
+            f"but found {len(image_paths)} images and {len(gt_paths)} masks."
+        )
+
+    return image_paths, gt_paths
+
+
+def get_piccolo_dataset(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    split: Literal["train", "validation", "test"],
+    resize_inputs: bool = False,
+    download: bool = False,
+    **kwargs
+):
+    """Dataset for polyp segmentation in narrow band imaging colonoscopy images.
+
+    This dataset is from Sánchez-Peralta et al. - https://doi.org/10.3390/app10238501.
+    To access the dataset, see `get_piccolo_data` for details.
+
+    Please cite it if you use this data in a publication.
+
+    Args:
+        path: Folder containing the rar file or the already extracted data.
+        patch_shape: The patch shape passed on to the dataset.
+        split: The data split to use, one of 'train', 'validation' or 'test'.
+        resize_inputs: Whether to update `kwargs` via `util.update_kwargs_for_resize_trafo` (RGB inputs).
+        download: Must be False; automatic download is not supported.
+        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
+
+    Returns:
+        The dataset created by `torch_em.default_segmentation_dataset`.
+    """
+    image_paths, gt_paths = _get_piccolo_paths(path=path, split=split, download=download)
+
+    if resize_inputs:
+        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": True}
+        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
+            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
+        )
+
+    dataset = torch_em.default_segmentation_dataset(
+        raw_paths=image_paths,
+        raw_key=None,
+        label_paths=gt_paths,
+        label_key=None,
+        patch_shape=patch_shape,
+        is_seg_dataset=False,
+        **kwargs
+    )
+    return dataset
+
+
+def get_piccolo_loader(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    batch_size: int,
+    split: Literal["train", "validation", "test"],
+    resize_inputs: bool = False,
+    download: bool = False,
+    **kwargs
+):
+    """Dataloader for polyp segmentation in narrow band imaging colonoscopy images.
+    See `get_piccolo_dataset` for details.
+
+    `kwargs` are split by `util.split_kwargs` into arguments for the dataset
+    and arguments for `torch_em.get_data_loader`.
+    """
+    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
+    dataset = get_piccolo_dataset(
+        path=path, patch_shape=patch_shape, split=split, resize_inputs=resize_inputs, download=download, **ds_kwargs
+    )
+    loader = torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
+    return loader
diff --git a/torch_em/data/datasets/util.py b/torch_em/data/datasets/util.py
index 2164b660..8a0d2c35 100644
--- a/torch_em/data/datasets/util.py
+++ b/torch_em/data/datasets/util.py
@@ -221,6 +221,19 @@ def unzip_tarfile(tar_path, dst, remove=True):
         os.remove(tar_path)
 
 
+def unzip_rarfile(rar_path, dst, remove=True):
+    """Extract the rar archive at `rar_path` into `dst` and optionally delete the archive.
+
+    Requires the `rarfile` package, which is imported inside the function.
+    """
+    import rarfile
+    with rarfile.RarFile(rar_path) as f:
+        f.extractall(path=dst)
+
+    if remove:
+        os.remove(rar_path)
+
+
 def unzip(zip_path, dst, remove=True):
     with zipfile.ZipFile(zip_path, "r") as f:
         f.extractall(dst)