From 6a9096244e4862bfeb8d8a8a1ad410fe7561f23a Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 17:32:28 +0200
Subject: [PATCH 1/9] Add Processing for MoNuSeg Inputs

---
 torch_em/data/datasets/monuseg.py | 114 ++++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 7 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index b9a7723f..7feebba0 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -1,10 +1,21 @@
 import os
+import shutil
+import numpy as np
+from tqdm import tqdm
+from glob import glob
+from xml.dom import minidom
+
+import imageio.v2 as imageio
+from skimage.draw import polygon
+
 import torch_em
+from torch_em.data.datasets import util

-from . import util

 URL = "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA"
-CHECKSUM = ""
+# TODO: add labeled test set (monuseg) - https://drive.google.com/file/d/1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw
+
+CHECKSUM = "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742"


 # TODO separate via organ
@@ -15,16 +26,85 @@ def _download_monuseg(path, download):
     if os.path.exists(im_path) and os.path.exists(label_path):
         return

-    raise NotImplementedError("Download and post-processing for the monuseg data is not yet implemented.")
-
     os.makedirs(path, exist_ok=True)
     zip_path = os.path.join(path, "monuseg.zip")
     util.download_source_gdrive(zip_path, URL, download=download, checksum=CHECKSUM)

+    _process_monuseg(path)
+
+
+def generate_labeled_array(shape, xml_file):
+    """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb
+
+    Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white
+    shape: The image shape on which the bit mask will be made
+    xml_file: path relative to the current working directory where the xml file is present
+
+    Returns:
+        An image of given shape with region inside contour being white.
+    """
+    # DOM object created by the minidom parser
+    xDoc = minidom.parse(xml_file)
+
+    # List of all Region tags
+    regions = xDoc.getElementsByTagName('Region')
+
+    # List which will store the vertices for each region
+    xy = []
+    for region in regions:
+        # Loading all the vertices in the region
+        vertices = region.getElementsByTagName('Vertex')
+
+        # The vertices of a region will be stored in an array
+        vw = np.zeros((len(vertices), 2))
+
+        for index, vertex in enumerate(vertices):
+            # Storing the values of x and y coordinate after conversion
+            vw[index][0] = float(vertex.getAttribute('X'))
+            vw[index][1] = float(vertex.getAttribute('Y'))
+
+        # Append the vertices of a region
+        xy.append(np.int32(vw))
+
+    # Creating a completely black image
+    mask = np.zeros(shape, np.float32)
+
+    for i, contour in enumerate(xy):
+        r, c = polygon(np.array(contour)[:, 1], np.array(contour)[:, 0], shape=shape)
+        mask[r, c] = i
+    return mask
+

-# TODO
-def _process_monuseg():
-    pass
+def _process_monuseg(path):
+    util.unzip(os.path.join(path, "monuseg.zip"), path)
+
+    # sorting the images into the expected dir;
+    # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
+    root_img_save_dir = os.path.join(path, "images")
+    root_label_save_dir = os.path.join(path, "labels")
+
+    os.makedirs(root_img_save_dir, exist_ok=True)
+    os.makedirs(root_label_save_dir, exist_ok=True)
+
+    all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
+    all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
+    assert len(all_img_dir) == len(all_xml_label_dir)
+
+    for img_path, xml_label_path in tqdm(zip(all_img_dir, all_xml_label_dir),
+                                         desc="Converting inputs to the expected format",
+                                         total=len(all_img_dir)):
+        desired_label_shape = imageio.imread(img_path).shape[:-1]
+
+        img_id = os.path.split(img_path)[-1]
+        dst = os.path.join(root_img_save_dir, img_id)
+        shutil.move(src=img_path, dst=dst)
+
+        _label = generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
+        _fileid = img_id.split(".")[0]
+        imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)
+
+    shutil.rmtree(glob(os.path.join(path, "MoNuSeg*"))[0])
+    shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])


 def get_monuseg_dataset(
@@ -56,3 +136,23 @@ def get_monuseg_loader(
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader
+
+
+def main():
+    path = "/scratch/usr/nimanwai/data/monuseg/"
+    patch_shape = (512, 512)
+
+    loader = get_monuseg_loader(
+        path=path,
+        patch_shape=patch_shape,
+        batch_size=2,
+        download=True
+    )
+
+    print("Length of loader: ", len(loader))
+
+    breakpoint()
+
+
+if __name__ == "__main__":
+    main()

From 3175f74e3f64babb586bd436c0c5c7e20539b48e Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 17:37:41 +0200
Subject: [PATCH 2/9] Update __init__.py

---
 torch_em/data/datasets/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch_em/data/datasets/__init__.py b/torch_em/data/datasets/__init__.py
index 68ec8241..8ff94f0a 100644
--- a/torch_em/data/datasets/__init__.py
+++ b/torch_em/data/datasets/__init__.py
@@ -11,8 +11,7 @@
 from .lizard import get_lizard_loader, get_lizard_dataset
 from .lucchi import get_lucchi_loader, get_lucchi_dataset
 from .mitoem import get_mitoem_loader, get_mitoem_dataset
-# monuseg is only partially implemented
-# from .monuseg import get_monuseg_loader, get_monuseg_dataset
+from .monuseg import get_monuseg_loader, get_monuseg_dataset
 from .mouse_embryo import get_mouse_embryo_loader, get_mouse_embryo_dataset
 from .neurips_cell_seg import (
     get_neurips_cellseg_supervised_loader, get_neurips_cellseg_supervised_dataset,

From fe900bb8d60fb4df85b95aa9a4b1c70ea4746b8d Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 18:18:20 +0200
Subject: [PATCH 3/9] Add monuseg test split

---
 torch_em/data/datasets/monuseg.py | 62 +++++++++++++++++++------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 7feebba0..938ff37a 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -12,25 +12,32 @@
 from torch_em.data.datasets import util


-URL = "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA"
-# TODO: add labeled test set (monuseg) - https://drive.google.com/file/d/1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw
+URL = {
+    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
+    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw"
+}

-CHECKSUM = "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742"
+CHECKSUM = {
+    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
+    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
+}


 # TODO separate via organ
-def _download_monuseg(path, download):
+def _download_monuseg(path, download, split):
+    assert split in ["train", "test"], "The split choices in MoNuSeg dataset are train/test, please choose from them"
+
     # check if we have extracted the images and labels already
-    im_path = os.path.join(path, "images")
-    label_path = os.path.join(path, "labels")
+    im_path = os.path.join(path, "images", split)
+    label_path = os.path.join(path, "labels", split)
     if os.path.exists(im_path) and os.path.exists(label_path):
         return

     os.makedirs(path, exist_ok=True)
-    zip_path = os.path.join(path, "monuseg.zip")
-    util.download_source_gdrive(zip_path, URL, download=download, checksum=CHECKSUM)
+    zip_path = os.path.join(path, f"monuseg_{split}.zip")
+    util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split])

-    _process_monuseg(path)
+    _process_monuseg(path, split)


 def generate_labeled_array(shape, xml_file):
@@ -75,23 +82,28 @@ def generate_labeled_array(shape, xml_file):
     return mask


-def _process_monuseg(path):
-    util.unzip(os.path.join(path, "monuseg.zip"), path)
+def _process_monuseg(path, split):
+    util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)

     # sorting the images into the expected dir;
     # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
-    root_img_save_dir = os.path.join(path, "images")
-    root_label_save_dir = os.path.join(path, "labels")
+    root_img_save_dir = os.path.join(path, "images", split)
+    root_label_save_dir = os.path.join(path, "labels", split)

     os.makedirs(root_img_save_dir, exist_ok=True)
     os.makedirs(root_label_save_dir, exist_ok=True)

-    all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
-    all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
+    if split == "train":
+        all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
+        all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
+    else:
+        all_img_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.tif")))
+        all_xml_label_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.xml")))
+
     assert len(all_img_dir) == len(all_xml_label_dir)

     for img_path, xml_label_path in tqdm(zip(all_img_dir, all_xml_label_dir),
-                                         desc="Converting inputs to the expected format",
+                                         desc=f"Converting {split} split to the expected format",
                                          total=len(all_img_dir)):
         desired_label_shape = imageio.imread(img_path).shape[:-1]

@@ -104,16 +116,19 @@ def _process_monuseg(path):
     imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

     shutil.rmtree(glob(os.path.join(path, "MoNuSeg*"))[0])
-    shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])
+    if split == "train":
+        shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])


 def get_monuseg_dataset(
-    path, patch_shape, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
 ):
-    _download_monuseg(path, download)
+    """Dataset from https://monuseg.grand-challenge.org/Data/
+    """
+    _download_monuseg(path, download, split)

-    image_path = os.path.join(path, "images")
-    label_path = os.path.join(path, "labels")
+    image_path = os.path.join(path, "images", split)
+    label_path = os.path.join(path, "labels", split)

     kwargs, _ = util.add_instance_label_transform(
         kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
@@ -125,13 +140,13 @@ def get_monuseg_dataset(
 # TODO implement selecting organ
 def get_monuseg_loader(
-    path, patch_shape, batch_size, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, batch_size, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
 ):
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
     dataset = get_monuseg_dataset(
-        path, patch_shape, download=download,
+        path, patch_shape, split, download=download,
         offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader
@@ -145,6 +160,7 @@ def main():
     loader = get_monuseg_loader(
         path=path,
         patch_shape=patch_shape,
+        split="test",
         batch_size=2,
         download=True
     )

From e1d73071f3b964025fe4da3e75846ea990d5d48b Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 19:06:19 +0200
Subject: [PATCH 4/9] Move xml to array conversion fn to util

---
 torch_em/data/datasets/monuseg.py | 47 +----------------------------
 torch_em/data/datasets/util.py    | 50 +++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 938ff37a..d61a37d3 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -1,12 +1,9 @@
 import os
 import shutil
-import numpy as np
 from tqdm import tqdm
 from glob import glob
-from xml.dom import minidom

 import imageio.v2 as imageio
-from skimage.draw import polygon

 import torch_em
 from torch_em.data.datasets import util
@@ -40,48 +37,6 @@ def _download_monuseg(path, download, split):
     _process_monuseg(path, split)


-def generate_labeled_array(shape, xml_file):
-    """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb
-
-    Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white
-    shape: The image shape on which the bit mask will be made
-    xml_file: path relative to the current working directory where the xml file is present
-
-    Returns:
-        An image of given shape with region inside contour being white.
-    """
-    # DOM object created by the minidom parser
-    xDoc = minidom.parse(xml_file)
-
-    # List of all Region tags
-    regions = xDoc.getElementsByTagName('Region')
-
-    # List which will store the vertices for each region
-    xy = []
-    for region in regions:
-        # Loading all the vertices in the region
-        vertices = region.getElementsByTagName('Vertex')
-
-        # The vertices of a region will be stored in an array
-        vw = np.zeros((len(vertices), 2))
-
-        for index, vertex in enumerate(vertices):
-            # Storing the values of x and y coordinate after conversion
-            vw[index][0] = float(vertex.getAttribute('X'))
-            vw[index][1] = float(vertex.getAttribute('Y'))
-
-        # Append the vertices of a region
-        xy.append(np.int32(vw))
-
-    # Creating a completely black image
-    mask = np.zeros(shape, np.float32)
-
-    for i, contour in enumerate(xy):
-        r, c = polygon(np.array(contour)[:, 1], np.array(contour)[:, 0], shape=shape)
-        mask[r, c] = i
-    return mask
-
-
 def _process_monuseg(path, split):
     util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)

@@ -66,7 +21,7 @@ def _process_monuseg(path, split):
         dst = os.path.join(root_img_save_dir, img_id)
         shutil.move(src=img_path, dst=dst)

-        _label = generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
+        _label = util.generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
         _fileid = img_id.split(".")[0]
         imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

diff --git a/torch_em/data/datasets/util.py b/torch_em/data/datasets/util.py
index 44ac4206..f485b3b7 100644
--- a/torch_em/data/datasets/util.py
+++ b/torch_em/data/datasets/util.py
@@ -2,8 +2,13 @@ import os
 import hashlib
 import zipfile
-from shutil import copyfileobj
+import numpy as np
+from tqdm import tqdm
 from warnings import warn
+from xml.dom import minidom
+from shutil import copyfileobj
+
+from skimage.draw import polygon

 import torch
 import torch_em
@@ -14,7 +19,6 @@
 except ImportError:
     gdown = None

-from tqdm import tqdm

 BIOIMAGEIO_IDS = {
     "covid_if": "ilastik/covid_if_training_data",
@@ -158,3 +162,45 @@ def add_instance_label_transform(
         kwargs = update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
         label_dtype = torch.float32
     return kwargs, label_dtype
+
+
+def generate_labeled_array(shape, xml_file):
+    """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb
+
+    Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white
+    shape: The image shape on which the bit mask will be made
+    xml_file: path relative to the current working directory where the xml file is present
+
+    Returns:
+        An image of given shape with region inside contour being white.
+    """
+    # DOM object created by the minidom parser
+    xDoc = minidom.parse(xml_file)
+
+    # List of all Region tags
+    regions = xDoc.getElementsByTagName('Region')
+
+    # List which will store the vertices for each region
+    xy = []
+    for region in regions:
+        # Loading all the vertices in the region
+        vertices = region.getElementsByTagName('Vertex')
+
+        # The vertices of a region will be stored in an array
+        vw = np.zeros((len(vertices), 2))
+
+        for index, vertex in enumerate(vertices):
+            # Storing the values of x and y coordinate after conversion
+            vw[index][0] = float(vertex.getAttribute('X'))
+            vw[index][1] = float(vertex.getAttribute('Y'))
+
+        # Append the vertices of a region
+        xy.append(np.int32(vw))
+
+    # Creating a completely black image
+    mask = np.zeros(shape, np.float32)
+
+    for i, contour in enumerate(xy):
+        r, c = polygon(np.array(contour)[:, 1], np.array(contour)[:, 0], shape=shape)
+        mask[r, c] = i
+    return mask

From abf985ccd67572f3ce8f56e542efc3615a5a42e8 Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 19:08:58 +0200
Subject: [PATCH 5/9] Rename xml to array function

---
 torch_em/data/datasets/monuseg.py | 2 +-
 torch_em/data/datasets/util.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index d61a37d3..9cbf0c73 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -66,7 +66,7 @@ def _process_monuseg(path, split):
         dst = os.path.join(root_img_save_dir, img_id)
         shutil.move(src=img_path, dst=dst)

-        _label = util.generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
+        _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path)
         _fileid = img_id.split(".")[0]
         imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

diff --git a/torch_em/data/datasets/util.py b/torch_em/data/datasets/util.py
index f485b3b7..f4987d9b 100644
--- a/torch_em/data/datasets/util.py
+++ b/torch_em/data/datasets/util.py
@@ -164,7 +164,7 @@ def add_instance_label_transform(
     return kwargs, label_dtype


-def generate_labeled_array(shape, xml_file):
+def generate_labeled_array_from_xml(shape, xml_file):
     """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb

     Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white

From 4089ca833f04b025d657e0247d2f6dccdc680211 Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 23:08:14 +0200
Subject: [PATCH 6/9] Add script for checking monuseg

---
 scripts/datasets/check_monuseg.py | 28 ++++++++++++++++++++++++++++
 torch_em/data/datasets/monuseg.py | 23 +----------------------
 2 files changed, 29 insertions(+), 22 deletions(-)
 create mode 100644 scripts/datasets/check_monuseg.py

diff --git a/scripts/datasets/check_monuseg.py b/scripts/datasets/check_monuseg.py
new file mode 100644
index 00000000..193bb2fc
--- /dev/null
+++ b/scripts/datasets/check_monuseg.py
@@ -0,0 +1,28 @@
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets import get_monuseg_loader
+
+
+MONUSEG_ROOT = "/scratch/usr/nimanwai/data/monuseg"
+
+
+def check_monuseg():
+    train_loader = get_monuseg_loader(
+        path=MONUSEG_ROOT,
+        patch_shape=(512, 512),
+        batch_size=2,
+        split="train",
+        download=True
+    )
+    test_loader = get_monuseg_loader(
+        path=MONUSEG_ROOT,
+        patch_shape=(512, 512),
+        batch_size=1,
+        split="test",
+        download=True
+    )
+    check_loader(train_loader, 15, instance_labels=True, rgb=False)
+    check_loader(test_loader, 15, instance_labels=True, rgb=False)
+
+
+if __name__ == "__main__":
+    check_monuseg()
diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 9cbf0c73..59df6245 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -3,7 +3,7 @@
 from tqdm import tqdm
 from glob import glob

-import imageio.v2 as imageio
+import imageio.v3 as imageio

 import torch_em
 from torch_em.data.datasets import util
@@ -106,24 +106,3 @@ def get_monuseg_loader(
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader
-
-
-def main():
-    path = "/scratch/usr/nimanwai/data/monuseg/"
-    patch_shape = (512, 512)
-
-    loader = get_monuseg_loader(
-        path=path,
-        patch_shape=patch_shape,
-        split="test",
-        batch_size=2,
-        download=True
-    )
-
-    print("Length of loader: ", len(loader))
-
-    breakpoint()
-
-
-if __name__ == "__main__":
-    main()

From 0cf2dda14e98f5166ea8a26da45c6e3e0435bf2f Mon Sep 17 00:00:00 2001
From: anwai98
Date: Thu, 19 Oct 2023 21:33:02 +0200
Subject: [PATCH 7/9] Add organ-level splits for monuseg

---
 scripts/datasets/check_monuseg.py |  8 +++---
 torch_em/data/datasets/monuseg.py | 43 +++++++++++++++++++++++++------
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/scripts/datasets/check_monuseg.py b/scripts/datasets/check_monuseg.py
index 193bb2fc..f4e0dd9a 100644
--- a/scripts/datasets/check_monuseg.py
+++ b/scripts/datasets/check_monuseg.py
@@ -11,8 +11,11 @@ def check_monuseg():
         patch_shape=(512, 512),
         batch_size=2,
         split="train",
-        download=True
+        download=True,
+        organ_type=None
     )
+    check_loader(train_loader, 8, instance_labels=True, rgb=True, plt=True, save_path="./monuseg_train.png")
+
     test_loader = get_monuseg_loader(
         path=MONUSEG_ROOT,
         patch_shape=(512, 512),
@@ -20,8 +23,7 @@ def check_monuseg():
         split="test",
         download=True
     )
-    check_loader(train_loader, 15, instance_labels=True, rgb=False)
-    check_loader(test_loader, 15, instance_labels=True, rgb=False)
+    check_loader(test_loader, 8, instance_labels=True, rgb=True, plt=True, save_path="./monuseg_test.png")


 if __name__ == "__main__":
diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 59df6245..adb7dcb2 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -2,6 +2,7 @@
 import shutil
 from tqdm import tqdm
 from glob import glob
+from typing import List, Optional

 import imageio.v3 as imageio
@@ -19,8 +20,22 @@
     "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
 }

+# here's the description: https://drive.google.com/file/d/1xYyQ31CHFRnvTCTuuHdconlJCMk2SK7Z/view?usp=sharing
+ORGAN_SPLITS = {
+    "breast": ["TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
+               "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1"],
+    "kidney": ["TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
+               "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1"],
+    "liver": ["TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
+              "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1"],
+    "prostate": ["TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
+                 "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1"],
+    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
+    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
+    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"]
+}
+

-# TODO separate via organ
 def _download_monuseg(path, download, split):
     assert split in ["train", "test"], "The split choices in MoNuSeg dataset are train/test, please choose from them"

@@ -91,32 +106,44 @@ def _process_monuseg(path, split):


 def get_monuseg_dataset(
-    path, patch_shape, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False,
+    offsets=None, boundaries=False, binary=False, **kwargs
 ):
     """Dataset from https://monuseg.grand-challenge.org/Data/
     """
     _download_monuseg(path, download, split)

-    image_path = os.path.join(path, "images", split)
-    label_path = os.path.join(path, "labels", split)
+    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
+    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))
+
+    if split == "train" and organ_type is not None:
+        # get all patients for multiple organ selection
+        all_organ_splits = sum([ORGAN_SPLITS[o][:] for o in organ_type], [])
+
+        image_paths = [
+            _path for _path in image_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
+        ]
+        label_paths = [
+            _path for _path in label_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
+        ]

     kwargs, _ = util.add_instance_label_transform(
         kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
     )
     return torch_em.default_segmentation_dataset(
-        image_path, "*.tif", label_path, "*.tif", patch_shape, is_seg_dataset=False, **kwargs
+        image_paths, None, label_paths, None, patch_shape, is_seg_dataset=False, **kwargs
     )


-# TODO implement selecting organ
 def get_monuseg_loader(
-    path, patch_shape, batch_size, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False,
+    **kwargs
 ):
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
     dataset = get_monuseg_dataset(
-        path, patch_shape, split, download=download,
+        path, patch_shape, split, organ_type=organ_type, download=download,
         offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader

From 374fac16094f8e8d0747015a0767f10adac8db2e Mon Sep 17 00:00:00 2001
From: anwai98
Date: Thu, 19 Oct 2023 22:24:01 +0200
Subject: [PATCH 8/9] Check organ splits for train loader

---
 scripts/datasets/check_monuseg.py | 2 +-
 torch_em/data/datasets/monuseg.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/datasets/check_monuseg.py b/scripts/datasets/check_monuseg.py
index f4e0dd9a..b999fdaa 100644
--- a/scripts/datasets/check_monuseg.py
+++ b/scripts/datasets/check_monuseg.py
@@ -12,7 +12,7 @@ def check_monuseg():
         batch_size=2,
         split="train",
         download=True,
-        organ_type=None
+        organ_type=["colon", "breast"]
     )
     check_loader(train_loader, 8, instance_labels=True, rgb=True, plt=True, save_path="./monuseg_train.png")

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index adb7dcb2..b32cee68 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -103,7 +103,7 @@ def get_monuseg_dataset(

     if split == "train" and organ_type is not None:
         # get all patients for multiple organ selection
-        all_organ_splits = sum([ORGAN_SPLITS[o][:] for o in organ_type], [])
+        all_organ_splits = sum([ORGAN_SPLITS[o] for o in organ_type], [])

         image_paths = [
             _path for _path in image_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
         ]
         label_paths = [
             _path for _path in label_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
         ]

From 41d2fbc2879c79e26f3248dcb5f616b76b9c4962 Mon Sep 17 00:00:00 2001
From: anwai98
Date: Fri, 20 Oct 2023 10:55:58 +0200
Subject: [PATCH 9/9] Raise error for organ splits in test

---
 torch_em/data/datasets/monuseg.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index b32cee68..2f2556d3 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -2,6 +2,7 @@ import os
 import shutil
 from tqdm import tqdm
 from glob import glob
+from pathlib import Path
 from typing import List, Optional

 import imageio.v3 as imageio
@@ -103,14 +104,14 @@ def get_monuseg_dataset(

     if split == "train" and organ_type is not None:
         # get all patients for multiple organ selection
-        all_organ_splits = sum([ORGAN_SPLITS[o] for o in organ_type], [])
-
-        image_paths = [
-            _path for _path in image_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
-        ]
-        label_paths = [
-            _path for _path in label_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
-        ]
+        all_organ_splits = sum([ORGAN_SPLITS[_o] for _o in organ_type], [])
+
+        image_paths = [_path for _path in image_paths if Path(_path).stem in all_organ_splits]
+        label_paths = [_path for _path in label_paths if Path(_path).stem in all_organ_splits]
+
+    elif split == "test" and organ_type is not None:
+        # we don't have organ splits in the test dataset
+        raise ValueError("The test split does not have any organ information, please pass `organ_type=None`")

     kwargs, _ = util.add_instance_label_transform(
         kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets