diff --git a/scripts/datasets/medical/check_cbis_ddsm.py b/scripts/datasets/medical/check_cbis_ddsm.py
new file mode 100644
index 00000000..ca305183
--- /dev/null
+++ b/scripts/datasets/medical/check_cbis_ddsm.py
@@ -0,0 +1,26 @@
+from torch_em.data import MinInstanceSampler
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets.medical import get_cbis_ddsm_loader
+
+
+# NOTE: machine-specific location of the dataset, adapt it before running this script.
+ROOT = "/media/anwai/ANWAI/data/cbis_ddsm"
+
+
+def check_cbis_ddsm():
+    """Create the CBIS-DDSM loader for the "Train" split and pass it to `check_loader`."""
+    loader = get_cbis_ddsm_loader(
+        path=ROOT,
+        patch_shape=(512, 512),
+        batch_size=2,
+        split="Train",
+        task=None,
+        tumour_type=None,
+        resize_inputs=True,
+        sampler=MinInstanceSampler()
+    )
+    check_loader(loader, 8)
+
+
+if __name__ == "__main__":
+    check_cbis_ddsm()
diff --git a/torch_em/data/datasets/medical/__init__.py b/torch_em/data/datasets/medical/__init__.py
index f2ad34c7..56ba358d 100644
--- a/torch_em/data/datasets/medical/__init__.py
+++ b/torch_em/data/datasets/medical/__init__.py
@@ -2,6 +2,7 @@
 from .btcv import get_btcv_dataset, get_btcv_loader
 from .busi import get_busi_dataset, get_busi_loader
 from .camus import get_camus_dataset, get_camus_loader
+from .cbis_ddsm import get_cbis_ddsm_dataset, get_cbis_ddsm_loader
 from .cholecseg8k import get_cholecseg8k_dataset, get_cholecseg8k_loader
 from .drive import get_drive_dataset, get_drive_loader
 from .duke_liver import get_duke_liver_dataset, get_duke_liver_loader
diff --git a/torch_em/data/datasets/medical/cbis_ddsm.py b/torch_em/data/datasets/medical/cbis_ddsm.py
new file mode 100644
index 00000000..81a5830b
--- /dev/null
+++ b/torch_em/data/datasets/medical/cbis_ddsm.py
@@ -0,0 +1,158 @@
+import os
+from glob import glob
+from natsort import natsorted
+from typing import Union, Tuple, Literal, Optional
+
+import torch_em
+
+from .. import util
+
+
+def get_cbis_ddsm_data(path, split, task, tumour_type, download):
+    """Make sure the CBIS-DDSM data is available and return the folder pattern of the selected subset.
+
+    Args:
+        path: Folder where the data is stored, or where it is downloaded and unzipped to.
+        split: The data split, either "Train" or "Test".
+        task: Either "Calc" or "Mass". None selects both via a "*" wildcard.
+        tumour_type: Either "MALIGNANT" or "BENIGN". None selects both via a "*" wildcard.
+        download: Forwarded to `util.download_source_kaggle`.
+
+    Returns:
+        The result of joining `path`, "DATA", `task`, `split` and `tumour_type`, which may contain "*" wildcards.
+
+    Raises:
+        ValueError: If `split`, `task` or `tumour_type` has an unsupported value.
+    """
+    # Validate with exceptions rather than `assert`, which is stripped when running with `python -O`.
+    # The checks run first so that invalid arguments do not create the output folder.
+    if split not in ("Train", "Test"):
+        raise ValueError(f"Invalid split: {split!r}. Choose from 'Train' or 'Test'.")
+    if task is not None and task not in ("Calc", "Mass"):
+        raise ValueError(f"Invalid task: {task!r}. Choose from 'Calc', 'Mass' or None.")
+    if tumour_type is not None and tumour_type not in ("MALIGNANT", "BENIGN"):
+        raise ValueError(f"Invalid tumour type: {tumour_type!r}. Choose from 'MALIGNANT', 'BENIGN' or None.")
+
+    os.makedirs(path, exist_ok=True)
+
+    data_dir = os.path.join(path, "DATA")
+    selection = os.path.join(
+        data_dir, "*" if task is None else task, split, "*" if tumour_type is None else tumour_type
+    )
+
+    # Nothing to do if the "DATA" folder is already present.
+    if os.path.exists(data_dir):
+        return selection
+
+    util.download_source_kaggle(
+        path=path, dataset_name="mohamedbenticha/cbis-ddsm/", download=download,
+    )
+    util.unzip(zip_path=os.path.join(path, "cbis-ddsm.zip"), dst=path)
+    return selection
+
+
+def _get_cbis_ddsm_paths(path, split, task, tumour_type, download):
+    """Collect the naturally sorted image and mask paths of the selected subset.
+
+    Raises:
+        RuntimeError: If no images are found or if the number of images and masks differs.
+    """
+    data_dir = get_cbis_ddsm_data(
+        path=path, split=split, task=task, tumour_type=tumour_type, download=download
+    )
+
+    image_paths = natsorted(glob(os.path.join(data_dir, "*_FULL_*.png")))
+    gt_paths = natsorted(glob(os.path.join(data_dir, "*_MASK_*.png")))
+
+    # An empty match would satisfy the length comparison below, so it has to be rejected explicitly.
+    if not image_paths:
+        raise RuntimeError(f"Could not find any images matching '{data_dir}'.")
+    if len(image_paths) != len(gt_paths):
+        raise RuntimeError(
+            f"The number of images ({len(image_paths)}) and masks ({len(gt_paths)}) does not match."
+        )
+
+    return image_paths, gt_paths
+
+
+def get_cbis_ddsm_dataset(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    split: Literal["Train", "Test"],
+    task: Optional[Literal["Calc", "Mass"]] = None,
+    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
+    resize_inputs: bool = False,
+    download: bool = False,
+    **kwargs
+):
+    """Dataset for segmentation of calcification and mass in mammography.
+
+    This dataset is a preprocessed version of https://www.cancerimagingarchive.net/collection/cbis-ddsm/ available
+    at https://www.kaggle.com/datasets/mohamedbenticha/cbis-ddsm/data. The related publication is:
+    - https://doi.org/10.1038/sdata.2017.177
+
+    Please cite it if you use this dataset in a publication.
+
+    Args:
+        path: Folder where the data is stored, or where it is downloaded and unzipped to.
+        patch_shape: The patch shape passed on to the segmentation dataset.
+        split: The data split, either "Train" or "Test".
+        task: Either "Calc" or "Mass". By default (None) both are used.
+        tumour_type: Either "MALIGNANT" or "BENIGN". By default (None) both are used.
+        resize_inputs: If True, `kwargs` and `patch_shape` are updated by `util.update_kwargs_for_resize_trafo`.
+        download: Forwarded to the kaggle download helper.
+        kwargs: Additional keyword arguments for `torch_em.default_segmentation_dataset`.
+
+    Returns:
+        The dataset created by `torch_em.default_segmentation_dataset`.
+    """
+    image_paths, gt_paths = _get_cbis_ddsm_paths(
+        path=path, split=split, task=task, tumour_type=tumour_type, download=download
+    )
+
+    if resize_inputs:
+        resize_kwargs = {"patch_shape": patch_shape, "is_rgb": False}
+        kwargs, patch_shape = util.update_kwargs_for_resize_trafo(
+            kwargs=kwargs, patch_shape=patch_shape, resize_inputs=resize_inputs, resize_kwargs=resize_kwargs
+        )
+
+    dataset = torch_em.default_segmentation_dataset(
+        raw_paths=image_paths,
+        raw_key=None,
+        label_paths=gt_paths,
+        label_key=None,
+        patch_shape=patch_shape,
+        is_seg_dataset=False,
+        **kwargs
+    )
+    return dataset
+
+
+def get_cbis_ddsm_loader(
+    path: Union[os.PathLike, str],
+    patch_shape: Tuple[int, int],
+    batch_size: int,
+    split: Literal["Train", "Test"],
+    task: Optional[Literal["Calc", "Mass"]] = None,
+    tumour_type: Optional[Literal["MALIGNANT", "BENIGN"]] = None,
+    resize_inputs: bool = False,
+    download: bool = False,
+    **kwargs
+):
+    """Dataloader for segmentation of calcification and mass in mammography. See `get_cbis_ddsm_dataset` for details.
+
+    `kwargs` are split by `util.split_kwargs` into arguments for the dataset and for `torch_em.get_data_loader`.
+    """
+    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
+    dataset = get_cbis_ddsm_dataset(
+        path=path,
+        patch_shape=patch_shape,
+        split=split,
+        task=task,
+        tumour_type=tumour_type,
+        resize_inputs=resize_inputs,
+        download=download,
+        **ds_kwargs
+    )
+    loader = torch_em.get_data_loader(dataset=dataset, batch_size=batch_size, **loader_kwargs)
+    return loader