From 6a9096244e4862bfeb8d8a8a1ad410fe7561f23a Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 17:32:28 +0200
Subject: [PATCH 1/9] Add Processing for MoNuSeg Inputs

---
 torch_em/data/datasets/monuseg.py | 114 ++++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 7 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index b9a7723f..7feebba0 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -1,10 +1,21 @@
 import os
+import shutil
+import numpy as np
+from tqdm import tqdm
+from glob import glob
+from xml.dom import minidom
+
+import imageio.v2 as imageio
+from skimage.draw import polygon
+
 import torch_em
+from torch_em.data.datasets import util

-from . import util

 URL = "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA"
-CHECKSUM = ""
+# TODO: add labeled test set (monuseg) - https://drive.google.com/file/d/1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw
+
+CHECKSUM = "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742"


 # TODO separate via organ
@@ -15,16 +26,85 @@ def _download_monuseg(path, download):
     if os.path.exists(im_path) and os.path.exists(label_path):
         return

-    raise NotImplementedError("Download and post-processing for the monuseg data is not yet implemented.")
-
     os.makedirs(path, exist_ok=True)
     zip_path = os.path.join(path, "monuseg.zip")
     util.download_source_gdrive(zip_path, URL, download=download, checksum=CHECKSUM)

+    _process_monuseg(path)
+
+
+def generate_labeled_array(shape, xml_file):
+    """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb
+
+    Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white
+    shape: The image shape on which the bit mask will be made
+    xml_file: path relative to the current working directory where the xml file is present
+
+    Returns:
+        An image of given shape with region inside contour being white.
+    """
+    # DOM object created by the minidom parser
+    xDoc = minidom.parse(xml_file)
+
+    # List of all Region tags
+    regions = xDoc.getElementsByTagName('Region')
+
+    # List which will store the vertices for each region
+    xy = []
+    for region in regions:
+        # Loading all the vertices in the region
+        vertices = region.getElementsByTagName('Vertex')
+
+        # The vertices of a region will be stored in an array
+        vw = np.zeros((len(vertices), 2))
+
+        for index, vertex in enumerate(vertices):
+            # Storing the values of x and y coordinate after conversion
+            vw[index][0] = float(vertex.getAttribute('X'))
+            vw[index][1] = float(vertex.getAttribute('Y'))
+
+        # Append the vertices of a region
+        xy.append(np.int32(vw))
+
+    # Creating a completely black image
+    mask = np.zeros(shape, np.float32)
+
+    for i, contour in enumerate(xy):
+        r, c = polygon(np.array(contour)[:, 1], np.array(contour)[:, 0], shape=shape)
+        mask[r, c] = i
+    return mask
+

-# TODO
-def _process_monuseg():
-    pass
+def _process_monuseg(path):
+    util.unzip(os.path.join(path, "monuseg.zip"), path)
+
+    # sorting the images into the expected dir;
+    # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
+    root_img_save_dir = os.path.join(path, "images")
+    root_label_save_dir = os.path.join(path, "labels")
+
+    os.makedirs(root_img_save_dir, exist_ok=True)
+    os.makedirs(root_label_save_dir, exist_ok=True)
+
+    all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
+    all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
+    assert len(all_img_dir) == len(all_xml_label_dir)
+
+    for img_path, xml_label_path in tqdm(zip(all_img_dir, all_xml_label_dir),
+                                         desc="Converting inputs to the expected format",
+                                         total=len(all_img_dir)):
+        desired_label_shape = imageio.imread(img_path).shape[:-1]
+
+        img_id = os.path.split(img_path)[-1]
+        dst = os.path.join(root_img_save_dir, img_id)
+        shutil.move(src=img_path, dst=dst)
+
+        _label = generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
+        _fileid = img_id.split(".")[0]
+        imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)
+
+    shutil.rmtree(glob(os.path.join(path, "MoNuSeg*"))[0])
+    shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])


 def get_monuseg_dataset(
@@ -56,3 +136,23 @@ def get_monuseg_loader(
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader
+
+
+def main():
+    path = "/scratch/usr/nimanwai/data/monuseg/"
+    patch_shape = (512, 512)
+
+    loader = get_monuseg_loader(
+        path=path,
+        patch_shape=patch_shape,
+        batch_size=2,
+        download=True
+    )
+
+    print("Length of loader: ", len(loader))
+
+    breakpoint()
+
+
+if __name__ == "__main__":
+    main()

From 3175f74e3f64babb586bd436c0c5c7e20539b48e Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 17:37:41 +0200
Subject: [PATCH 2/9] Update __init__.py

---
 torch_em/data/datasets/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/torch_em/data/datasets/__init__.py b/torch_em/data/datasets/__init__.py
index 68ec8241..8ff94f0a 100644
--- a/torch_em/data/datasets/__init__.py
+++ b/torch_em/data/datasets/__init__.py
@@ -11,8 +11,7 @@
 from .lizard import get_lizard_loader, get_lizard_dataset
 from .lucchi import get_lucchi_loader, get_lucchi_dataset
 from .mitoem import get_mitoem_loader, get_mitoem_dataset
-# monuseg is only partially implemented
-# from .monuseg import get_monuseg_loader, get_monuseg_dataset
+from .monuseg import get_monuseg_loader, get_monuseg_dataset
 from .mouse_embryo import get_mouse_embryo_loader, get_mouse_embryo_dataset
 from .neurips_cell_seg import (
     get_neurips_cellseg_supervised_loader, get_neurips_cellseg_supervised_dataset,

From fe900bb8d60fb4df85b95aa9a4b1c70ea4746b8d Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 18:18:20 +0200
Subject: [PATCH 3/9] Add monuseg test split

---
 torch_em/data/datasets/monuseg.py | 62 +++++++++++++++++++------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 7feebba0..938ff37a 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -12,25 +12,32 @@
 from torch_em.data.datasets import util


-URL = "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA"
-# TODO: add labeled test set (monuseg) - https://drive.google.com/file/d/1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw
+URL = {
+    "train": "https://drive.google.com/uc?export=download&id=1ZgqFJomqQGNnsx7w7QBzQQMVA16lbVCA",
+    "test": "https://drive.google.com/uc?export=download&id=1NKkSQ5T0ZNQ8aUhh0a8Dt2YKYCQXIViw"
+}

-CHECKSUM = "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742"
+CHECKSUM = {
+    "train": "25d3d3185bb2970b397cafa72eb664c9b4d24294aee382e7e3df9885affce742",
+    "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
+}


 # TODO separate via organ
-def _download_monuseg(path, download):
+def _download_monuseg(path, download, split):
+    assert split in ["train", "test"], "The split choices in MoNuSeg dataset are train/test, please choose from them"
+
     # check if we have extracted the images and labels already
-    im_path = os.path.join(path, "images")
-    label_path = os.path.join(path, "labels")
+    im_path = os.path.join(path, "images", split)
+    label_path = os.path.join(path, "labels", split)
     if os.path.exists(im_path) and os.path.exists(label_path):
         return

     os.makedirs(path, exist_ok=True)
-    zip_path = os.path.join(path, "monuseg.zip")
-    util.download_source_gdrive(zip_path, URL, download=download, checksum=CHECKSUM)
+    zip_path = os.path.join(path, f"monuseg_{split}.zip")
+    util.download_source_gdrive(zip_path, URL[split], download=download, checksum=CHECKSUM[split])

-    _process_monuseg(path)
+    _process_monuseg(path, split)


 def generate_labeled_array(shape, xml_file):
@@ -75,23 +82,28 @@ def generate_labeled_array(shape, xml_file):
     return mask


-def _process_monuseg(path):
-    util.unzip(os.path.join(path, "monuseg.zip"), path)
+def _process_monuseg(path, split):
+    util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)

     # sorting the images into the expected dir;
     # converting the label xml files to numpy arrays (of same dimension as input images) in the expected dir
-    root_img_save_dir = os.path.join(path, "images")
-    root_label_save_dir = os.path.join(path, "labels")
+    root_img_save_dir = os.path.join(path, "images", split)
+    root_label_save_dir = os.path.join(path, "labels", split)

     os.makedirs(root_img_save_dir, exist_ok=True)
     os.makedirs(root_label_save_dir, exist_ok=True)

-    all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
-    all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
+    if split == "train":
+        all_img_dir = sorted(glob(os.path.join(path, "*", "Tissue*", "*")))
+        all_xml_label_dir = sorted(glob(os.path.join(path, "*", "Annotations", "*")))
+    else:
+        all_img_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.tif")))
+        all_xml_label_dir = sorted(glob(os.path.join(path, "MoNuSegTestData", "*.xml")))
+
     assert len(all_img_dir) == len(all_xml_label_dir)

     for img_path, xml_label_path in tqdm(zip(all_img_dir, all_xml_label_dir),
-                                         desc="Converting inputs to the expected format",
+                                         desc=f"Converting {split} split to the expected format",
                                          total=len(all_img_dir)):
         desired_label_shape = imageio.imread(img_path).shape[:-1]

@@ -104,16 +116,19 @@ def _process_monuseg(path):
     imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

     shutil.rmtree(glob(os.path.join(path, "MoNuSeg*"))[0])
-    shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])
+    if split == "train":
+        shutil.rmtree(glob(os.path.join(path, "__MACOSX"))[0])


 def get_monuseg_dataset(
-    path, patch_shape, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
 ):
-    _download_monuseg(path, download)
+    """Dataset from https://monuseg.grand-challenge.org/Data/
+    """
+    _download_monuseg(path, download, split)

-    image_path = os.path.join(path, "images")
-    label_path = os.path.join(path, "labels")
+    image_path = os.path.join(path, "images", split)
+    label_path = os.path.join(path, "labels", split)

     kwargs, _ = util.add_instance_label_transform(
         kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
@@ -125,13 +140,13 @@ def get_monuseg_dataset(
 # TODO implement selecting organ
 def get_monuseg_loader(
-    path, patch_shape, batch_size, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, batch_size, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
 ):
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
     dataset = get_monuseg_dataset(
-        path, patch_shape, download=download,
+        path, patch_shape, split, download=download,
         offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader
@@ -145,6 +160,7 @@ def main():
     loader = get_monuseg_loader(
         path=path,
         patch_shape=patch_shape,
+        split="test",
         batch_size=2,
         download=True
     )

From e1d73071f3b964025fe4da3e75846ea990d5d48b Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 19:06:19 +0200
Subject: [PATCH 4/9] Move xml to array conversion fn to util

---
 torch_em/data/datasets/monuseg.py | 47 +----------------------------
 torch_em/data/datasets/util.py    | 50 +++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 938ff37a..d61a37d3 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -1,12 +1,9 @@
 import os
 import shutil
-import numpy as np
 from tqdm import tqdm
 from glob import glob
-from xml.dom import minidom

 import imageio.v2 as imageio
-from skimage.draw import polygon

 import torch_em
 from torch_em.data.datasets import util
@@ -40,48 +37,6 @@ def _download_monuseg(path, download, split):
     _process_monuseg(path, split)


-def generate_labeled_array(shape, xml_file):
-    """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb
-
-    Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white
-    shape: The image shape on which the bit mask will be made
-    xml_file: path relative to the current working directory where the xml file is present
-
-    Returns:
-        An image of given shape with region inside contour being white.
-    """
-    # DOM object created by the minidom parser
-    xDoc = minidom.parse(xml_file)
-
-    # List of all Region tags
-    regions = xDoc.getElementsByTagName('Region')
-
-    # List which will store the vertices for each region
-    xy = []
-    for region in regions:
-        # Loading all the vertices in the region
-        vertices = region.getElementsByTagName('Vertex')
-
-        # The vertices of a region will be stored in an array
-        vw = np.zeros((len(vertices), 2))
-
-        for index, vertex in enumerate(vertices):
-            # Storing the values of x and y coordinate after conversion
-            vw[index][0] = float(vertex.getAttribute('X'))
-            vw[index][1] = float(vertex.getAttribute('Y'))
-
-        # Append the vertices of a region
-        xy.append(np.int32(vw))
-
-    # Creating a completely black image
-    mask = np.zeros(shape, np.float32)
-
-    for i, contour in enumerate(xy):
-        r, c = polygon(np.array(contour)[:, 1], np.array(contour)[:, 0], shape=shape)
-        mask[r, c] = i
-    return mask
-
-
 def _process_monuseg(path, split):
     util.unzip(os.path.join(path, f"monuseg_{split}.zip"), path)

@@ -66,7 +21,7 @@ def _process_monuseg(path, split):
         dst = os.path.join(root_img_save_dir, img_id)
         shutil.move(src=img_path, dst=dst)

-        _label = generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
+        _label = util.generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
         _fileid = img_id.split(".")[0]
         imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

diff --git a/torch_em/data/datasets/util.py b/torch_em/data/datasets/util.py
index 44ac4206..f485b3b7 100644
--- a/torch_em/data/datasets/util.py
+++ b/torch_em/data/datasets/util.py
@@ -2,8 +2,13 @@ import os
 import hashlib
 import zipfile
-from shutil import copyfileobj
+import numpy as np
+from tqdm import tqdm
 from warnings import warn
+from xml.dom import minidom
+from shutil import copyfileobj
+
+from skimage.draw import polygon

 import torch
 import torch_em
@@ -14,7 +19,6 @@
 except ImportError:
     gdown = None

-from tqdm import tqdm

 BIOIMAGEIO_IDS = {
     "covid_if": "ilastik/covid_if_training_data",
@@ -158,3 +162,45 @@ def add_instance_label_transform(
         kwargs = update_kwargs(kwargs, "label_transform", label_transform, msg=msg)
         label_dtype = torch.float32
     return kwargs, label_dtype
+
+
+def generate_labeled_array(shape, xml_file):
+    """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb
+
+    Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white
+    shape: The image shape on which the bit mask will be made
+    xml_file: path relative to the current working directory where the xml file is present
+
+    Returns:
+        An image of given shape with region inside contour being white.
+    """
+    # DOM object created by the minidom parser
+    xDoc = minidom.parse(xml_file)
+
+    # List of all Region tags
+    regions = xDoc.getElementsByTagName('Region')
+
+    # List which will store the vertices for each region
+    xy = []
+    for region in regions:
+        # Loading all the vertices in the region
+        vertices = region.getElementsByTagName('Vertex')
+
+        # The vertices of a region will be stored in an array
+        vw = np.zeros((len(vertices), 2))
+
+        for index, vertex in enumerate(vertices):
+            # Storing the values of x and y coordinate after conversion
+            vw[index][0] = float(vertex.getAttribute('X'))
+            vw[index][1] = float(vertex.getAttribute('Y'))
+
+        # Append the vertices of a region
+        xy.append(np.int32(vw))
+
+    # Creating a completely black image
+    mask = np.zeros(shape, np.float32)
+
+    for i, contour in enumerate(xy):
+        r, c = polygon(np.array(contour)[:, 1], np.array(contour)[:, 0], shape=shape)
+        mask[r, c] = i
+    return mask

From abf985ccd67572f3ce8f56e542efc3615a5a42e8 Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 19:08:58 +0200
Subject: [PATCH 5/9] Rename xml to array function

---
 torch_em/data/datasets/monuseg.py | 2 +-
 torch_em/data/datasets/util.py    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index d61a37d3..9cbf0c73 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -66,7 +66,7 @@ def _process_monuseg(path, split):
         dst = os.path.join(root_img_save_dir, img_id)
         shutil.move(src=img_path, dst=dst)

-        _label = util.generate_labeled_array(shape=desired_label_shape, xml_file=xml_label_path)
+        _label = util.generate_labeled_array_from_xml(shape=desired_label_shape, xml_file=xml_label_path)
         _fileid = img_id.split(".")[0]
         imageio.imwrite(os.path.join(root_label_save_dir, f"{_fileid}.tif"), _label)

diff --git a/torch_em/data/datasets/util.py b/torch_em/data/datasets/util.py
index f485b3b7..f4987d9b 100644
--- a/torch_em/data/datasets/util.py
+++ b/torch_em/data/datasets/util.py
@@ -164,7 +164,7 @@ def add_instance_label_transform(
     return kwargs, label_dtype


-def generate_labeled_array(shape, xml_file):
+def generate_labeled_array_from_xml(shape, xml_file):
     """Function taken from: https://github.com/rshwndsz/hover-net/blob/master/lightning_hovernet.ipynb

     Given image shape and path to annotations (xml file), generate a bit mask with the region inside a contour being white

From 4089ca833f04b025d657e0247d2f6dccdc680211 Mon Sep 17 00:00:00 2001
From: anwai98
Date: Wed, 18 Oct 2023 23:08:14 +0200
Subject: [PATCH 6/9] Add script for checking monuseg

---
 scripts/datasets/check_monuseg.py | 28 ++++++++++++++++++++++++++++
 torch_em/data/datasets/monuseg.py | 23 +----------------------
 2 files changed, 29 insertions(+), 22 deletions(-)
 create mode 100644 scripts/datasets/check_monuseg.py

diff --git a/scripts/datasets/check_monuseg.py b/scripts/datasets/check_monuseg.py
new file mode 100644
index 00000000..193bb2fc
--- /dev/null
+++ b/scripts/datasets/check_monuseg.py
@@ -0,0 +1,28 @@
+from torch_em.util.debug import check_loader
+from torch_em.data.datasets import get_monuseg_loader
+
+
+MONUSEG_ROOT = "/scratch/usr/nimanwai/data/monuseg"
+
+
+def check_monuseg():
+    train_loader = get_monuseg_loader(
+        path=MONUSEG_ROOT,
+        patch_shape=(512, 512),
+        batch_size=2,
+        split="train",
+        download=True
+    )
+    test_loader = get_monuseg_loader(
+        path=MONUSEG_ROOT,
+        patch_shape=(512, 512),
+        batch_size=1,
+        split="test",
+        download=True
+    )
+    check_loader(train_loader, 15, instance_labels=True, rgb=False)
+    check_loader(test_loader, 15, instance_labels=True, rgb=False)
+
+
+if __name__ == "__main__":
+    check_monuseg()
diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 9cbf0c73..59df6245 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -3,7 +3,7 @@
 from tqdm import tqdm
 from glob import glob

-import imageio.v2 as imageio
+import imageio.v3 as imageio

 import torch_em
 from torch_em.data.datasets import util
@@ -106,24 +106,3 @@ def get_monuseg_loader(
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader
-
-
-def main():
-    path = "/scratch/usr/nimanwai/data/monuseg/"
-    patch_shape = (512, 512)
-
-    loader = get_monuseg_loader(
-        path=path,
-        patch_shape=patch_shape,
-        split="test",
-        batch_size=2,
-        download=True
-    )
-
-    print("Length of loader: ", len(loader))
-
-    breakpoint()
-
-
-if __name__ == "__main__":
-    main()

From 0cf2dda14e98f5166ea8a26da45c6e3e0435bf2f Mon Sep 17 00:00:00 2001
From: anwai98
Date: Thu, 19 Oct 2023 21:33:02 +0200
Subject: [PATCH 7/9] Add organ-level splits for monuseg

---
 scripts/datasets/check_monuseg.py |  8 +++---
 torch_em/data/datasets/monuseg.py | 43 +++++++++++++++++++++++++------
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/scripts/datasets/check_monuseg.py b/scripts/datasets/check_monuseg.py
index 193bb2fc..f4e0dd9a 100644
--- a/scripts/datasets/check_monuseg.py
+++ b/scripts/datasets/check_monuseg.py
@@ -11,8 +11,11 @@ def check_monuseg():
         patch_shape=(512, 512),
         batch_size=2,
         split="train",
-        download=True
+        download=True,
+        organ_type=None
     )
+    check_loader(train_loader, 8, instance_labels=True, rgb=True, plt=True, save_path="./monuseg_train.png")
+
     test_loader = get_monuseg_loader(
         path=MONUSEG_ROOT,
         patch_shape=(512, 512),
@@ -20,8 +23,7 @@ def check_monuseg():
         split="test",
         download=True
     )
-    check_loader(train_loader, 15, instance_labels=True, rgb=False)
-    check_loader(test_loader, 15, instance_labels=True, rgb=False)
+    check_loader(test_loader, 8, instance_labels=True, rgb=True, plt=True, save_path="./monuseg_test.png")


 if __name__ == "__main__":
diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index 59df6245..adb7dcb2 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -2,6 +2,7 @@
 import shutil
 from tqdm import tqdm
 from glob import glob
+from typing import List, Optional

 import imageio.v3 as imageio
@@ -19,8 +20,22 @@
     "test": "13e522387ae8b1bcc0530e13ff9c7b4d91ec74959ef6f6e57747368d7ee6f88a"
 }

+# here's the description: https://drive.google.com/file/d/1xYyQ31CHFRnvTCTuuHdconlJCMk2SK7Z/view?usp=sharing
+ORGAN_SPLITS = {
+    "breast": ["TCGA-A7-A13E-01Z-00-DX1", "TCGA-A7-A13F-01Z-00-DX1", "TCGA-AR-A1AK-01Z-00-DX1",
+               "TCGA-AR-A1AS-01Z-00-DX1", "TCGA-E2-A1B5-01Z-00-DX1", "TCGA-E2-A14V-01Z-00-DX1"],
+    "kidney": ["TCGA-B0-5711-01Z-00-DX1", "TCGA-HE-7128-01Z-00-DX1", "TCGA-HE-7129-01Z-00-DX1",
+               "TCGA-HE-7130-01Z-00-DX1", "TCGA-B0-5710-01Z-00-DX1", "TCGA-B0-5698-01Z-00-DX1"],
+    "liver": ["TCGA-18-5592-01Z-00-DX1", "TCGA-38-6178-01Z-00-DX1", "TCGA-49-4488-01Z-00-DX1",
+              "TCGA-50-5931-01Z-00-DX1", "TCGA-21-5784-01Z-00-DX1", "TCGA-21-5786-01Z-00-DX1"],
+    "prostate": ["TCGA-G9-6336-01Z-00-DX1", "TCGA-G9-6348-01Z-00-DX1", "TCGA-G9-6356-01Z-00-DX1",
+                 "TCGA-G9-6363-01Z-00-DX1", "TCGA-CH-5767-01Z-00-DX1", "TCGA-G9-6362-01Z-00-DX1"],
+    "bladder": ["TCGA-DK-A2I6-01A-01-TS1", "TCGA-G2-A2EK-01A-02-TSB"],
+    "colon": ["TCGA-AY-A8YK-01A-01-TS1", "TCGA-NH-A8F7-01A-01-TS1"],
+    "stomach": ["TCGA-KB-A93J-01A-01-TS1", "TCGA-RD-A8N9-01A-01-TS1"]
+}
+

-# TODO separate via organ
 def _download_monuseg(path, download, split):
     assert split in ["train", "test"], "The split choices in MoNuSeg dataset are train/test, please choose from them"

@@ -91,32 +106,44 @@ def _process_monuseg(path, split):


 def get_monuseg_dataset(
-    path, patch_shape, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, split, organ_type: Optional[List[str]] = None, download=False,
+    offsets=None, boundaries=False, binary=False, **kwargs
 ):
     """Dataset from https://monuseg.grand-challenge.org/Data/
     """
     _download_monuseg(path, download, split)

-    image_path = os.path.join(path, "images", split)
-    label_path = os.path.join(path, "labels", split)
+    image_paths = sorted(glob(os.path.join(path, "images", split, "*")))
+    label_paths = sorted(glob(os.path.join(path, "labels", split, "*")))
+
+    if split == "train" and organ_type is not None:
+        # get all patients for multiple organ selection
+        all_organ_splits = sum([ORGAN_SPLITS[o][:] for o in organ_type], [])
+
+        image_paths = [
+            _path for _path in image_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
+        ]
+        label_paths = [
+            _path for _path in label_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
+        ]

     kwargs, _ = util.add_instance_label_transform(
         kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets
     )
     return torch_em.default_segmentation_dataset(
-        image_path, "*.tif", label_path, "*.tif", patch_shape, is_seg_dataset=False, **kwargs
+        image_paths, None, label_paths, None, patch_shape, is_seg_dataset=False, **kwargs
     )


-# TODO implement selecting organ
 def get_monuseg_loader(
-    path, patch_shape, batch_size, split, download=False, offsets=None, boundaries=False, binary=False, **kwargs
+    path, patch_shape, batch_size, split, organ_type=None, download=False, offsets=None, boundaries=False, binary=False,
+    **kwargs
 ):
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
     dataset = get_monuseg_dataset(
-        path, patch_shape, split, download=download,
+        path, patch_shape, split, organ_type=organ_type, download=download,
         offsets=offsets, boundaries=boundaries, binary=binary, **ds_kwargs
     )
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
     return loader

From 374fac16094f8e8d0747015a0767f10adac8db2e Mon Sep 17 00:00:00 2001
From: anwai98
Date: Thu, 19 Oct 2023 22:24:01 +0200
Subject: [PATCH 8/9] Check organ splits for train loader

---
 scripts/datasets/check_monuseg.py | 2 +-
 torch_em/data/datasets/monuseg.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/datasets/check_monuseg.py b/scripts/datasets/check_monuseg.py
index f4e0dd9a..b999fdaa 100644
--- a/scripts/datasets/check_monuseg.py
+++ b/scripts/datasets/check_monuseg.py
@@ -12,7 +12,7 @@ def check_monuseg():
         batch_size=2,
         split="train",
         download=True,
-        organ_type=None
+        organ_type=["colon", "breast"]
     )
     check_loader(train_loader, 8, instance_labels=True, rgb=True, plt=True, save_path="./monuseg_train.png")

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index adb7dcb2..b32cee68 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -103,7 +103,7 @@ def get_monuseg_dataset(

     if split == "train" and organ_type is not None:
         # get all patients for multiple organ selection
-        all_organ_splits = sum([ORGAN_SPLITS[o][:] for o in organ_type], [])
+        all_organ_splits = sum([ORGAN_SPLITS[o] for o in organ_type], [])

         image_paths = [
             _path for _path in image_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
         ]
         label_paths = [
             _path for _path in label_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
         ]

From 41d2fbc2879c79e26f3248dcb5f616b76b9c4962 Mon Sep 17 00:00:00 2001
From: anwai98
Date: Fri, 20 Oct 2023 10:55:58 +0200
Subject: [PATCH 9/9] Raise error for organ splits in test

---
 torch_em/data/datasets/monuseg.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/torch_em/data/datasets/monuseg.py b/torch_em/data/datasets/monuseg.py
index b32cee68..2f2556d3 100644
--- a/torch_em/data/datasets/monuseg.py
+++ b/torch_em/data/datasets/monuseg.py
@@ -2,6 +2,7 @@ import os
 import shutil
 from tqdm import tqdm
 from glob import glob
+from pathlib import Path
 from typing import List, Optional

 import imageio.v3 as imageio
@@ -103,14 +104,14 @@ def get_monuseg_dataset(

     if split == "train" and organ_type is not None:
         # get all patients for multiple organ selection
-        all_organ_splits = sum([ORGAN_SPLITS[o] for o in organ_type], [])
-
-        image_paths = [
-            _path for _path in image_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
-        ]
-        label_paths = [
-            _path for _path in label_paths if os.path.split(_path)[-1].split(".")[0] in all_organ_splits
-        ]
+        all_organ_splits = sum([ORGAN_SPLITS[_o] for _o in organ_type], [])
+
+        image_paths = [_path for _path in image_paths if Path(_path).stem in all_organ_splits]
+        label_paths = [_path for _path in label_paths if Path(_path).stem in all_organ_splits]
+
+    elif split == "test" and organ_type is not None:
+        # we don't have organ splits in the test dataset
+        raise ValueError("The test split does not have any organ information, please pass `organ_type=None`")

     kwargs, _ = util.add_instance_label_transform(
         kwargs, add_binary_target=True, binary=binary, boundaries=boundaries, offsets=offsets