From 4485838e9584390415e5071b7cec93076db64728 Mon Sep 17 00:00:00 2001
From: Anwai Archit
Date: Sun, 17 Mar 2024 23:03:24 +0100
Subject: [PATCH 1/3] Add DynamicNuclearNet data - from DeepCell

---
 scripts/datasets/check_dynamicnuclearnet.py | 29 +++++++
 torch_em/data/datasets/__init__.py          |  1 +
 torch_em/data/datasets/dynamicnuclearnet.py | 88 +++++++++++++++++++++
 3 files changed, 118 insertions(+)
 create mode 100644 scripts/datasets/check_dynamicnuclearnet.py
 create mode 100644 torch_em/data/datasets/dynamicnuclearnet.py

diff --git a/scripts/datasets/check_dynamicnuclearnet.py b/scripts/datasets/check_dynamicnuclearnet.py
new file mode 100644
index 00000000..ccdaa70a
--- /dev/null
+++ b/scripts/datasets/check_dynamicnuclearnet.py
@@ -0,0 +1,29 @@
+import numpy as np
+from torch_em.transform.raw import standardize, normalize_percentile
+
+from torch_em.data.datasets import get_dynamicnuclearnet_loader
+from torch_em.util.debug import check_loader
+
+DYNAMICNUCLEARNET_ROOT = "/home/anwai/data/deepcell/"
+
+
+def raw_trafo(raw):
+    raw = normalize_percentile(raw, axis=(1, 2))
+    raw = np.mean(raw, axis=0)
+    raw = standardize(raw)
+    return raw
+
+
+# NOTE: the DynamicNuclearNet data cannot be downloaded automatically.
+# you need to download it yourself from https://datasets.deepcell.org/data
+def check_dynamicnuclearnet():
+    # set this path to where you have downloaded the dynamicnuclearnet data
+    loader = get_dynamicnuclearnet_loader(
+        DYNAMICNUCLEARNET_ROOT, "train",
+        patch_shape=(512, 512), batch_size=2, download=True
+    )
+    check_loader(loader, 10, instance_labels=True, rgb=False)
+
+
+if __name__ == "__main__":
+    check_dynamicnuclearnet()
diff --git a/torch_em/data/datasets/__init__.py b/torch_em/data/datasets/__init__.py
index fd9fc7bf..e63eb6bc 100644
--- a/torch_em/data/datasets/__init__.py
+++ b/torch_em/data/datasets/__init__.py
@@ -6,6 +6,7 @@
 from .ctc import get_ctc_segmentation_loader, get_ctc_segmentation_dataset
 from .deepbacs import get_deepbacs_loader, get_deepbacs_dataset
 from .dsb import get_dsb_loader, get_dsb_dataset
+from .dynamicnuclearnet import get_dynamicnuclearnet_loader, get_dynamicnuclearnet_dataset
 from .hpa import get_hpa_segmentation_loader, get_hpa_segmentation_dataset
 from .isbi2012 import get_isbi_loader, get_isbi_dataset
 from .kasthuri import get_kasthuri_loader, get_kasthuri_dataset
diff --git a/torch_em/data/datasets/dynamicnuclearnet.py b/torch_em/data/datasets/dynamicnuclearnet.py
new file mode 100644
index 00000000..5f815e0d
--- /dev/null
+++ b/torch_em/data/datasets/dynamicnuclearnet.py
@@ -0,0 +1,88 @@
+import os
+from tqdm import tqdm
+from glob import glob
+
+import z5py
+import numpy as np
+import pandas as pd
+
+import torch_em
+
+from . import util
+
+
+# Automatic download is currently not possible, because of authentication
+URL = None  # TODO: here - https://datasets.deepcell.org/data
+
+
+def _create_split(path, split):
+    split_file = os.path.join(path, "DynamicNuclearNet-segmentation-v1_0", f"{split}.npz")
+    split_folder = os.path.join(path, split)
+    os.makedirs(split_folder, exist_ok=True)
+    data = np.load(split_file, allow_pickle=True)
+
+    x, y = data["X"], data["y"]
+    metadata = data["meta"]
+    metadata = pd.DataFrame(metadata[1:], columns=metadata[0])
+
+    for i, (im, label) in tqdm(enumerate(zip(x, y)), total=len(x), desc=f"Creating files for {split}-split"):
+        out_path = os.path.join(split_folder, f"image_{i:04}.zarr")
+        image_channel = im[..., 0]
+        label_channel = label[..., 0]
+        chunks = image_channel.shape
+        with z5py.File(out_path, "a") as f:
+            f.create_dataset("raw", data=image_channel, compression="gzip", chunks=chunks)
+            f.create_dataset("labels", data=label_channel, compression="gzip", chunks=chunks)
+
+    os.remove(split_file)
+
+
+def _create_dataset(path, zip_path):
+    util.unzip(zip_path, path, remove=False)
+    splits = ["train", "val", "test"]
+    assert all(
+        [os.path.exists(os.path.join(path, "DynamicNuclearNet-segmentation-v1_0", f"{split}.npz")) for split in splits]
+    )
+    for split in splits:
+        _create_split(path, split)
+
+
+def get_dynamicnuclearnet_dataset(
+    path, split, patch_shape, download=False, **kwargs
+):
+    """TODO"""
+    splits = ["train", "val", "test"]
+    assert split in splits
+
+    # check if the dataset exists already
+    zip_path = os.path.join(path, "DynamicNuclearNet-segmentation-v1_0.zip")
+    if all([os.path.exists(os.path.join(path, split)) for split in splits]):  # yes it does
+        pass
+    elif os.path.exists(zip_path):  # no it does not, but we have the zip there and can unpack it
+        _create_dataset(path, zip_path)
+    else:
+        raise RuntimeError(
+            "We do not support automatic download for the dynamic nuclear net dataset yet. "
+            f"Please download the dataset from https://datasets.deepcell.org/data and put it here: {zip_path}"
+        )
+
+    split_folder = os.path.join(path, split)
+    assert os.path.exists(split_folder)
+    data_path = glob(os.path.join(split_folder, "*.zarr"))
+    assert len(data_path) > 0
+
+    raw_key, label_key = "raw", "labels"
+
+    return torch_em.default_segmentation_dataset(
+        data_path, raw_key, data_path, label_key, patch_shape, is_seg_dataset=True, ndim=2, **kwargs
+    )
+
+
+def get_dynamicnuclearnet_loader(
+    path, split, patch_shape, batch_size, download, **kwargs
+):
+    """TODO"""
+    ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
+    dataset = get_dynamicnuclearnet_dataset(path, split, patch_shape, download, **ds_kwargs)
+    loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
+    return loader

From 3fc1fc05b81274e09e41c567121df6f88852470d Mon Sep 17 00:00:00 2001
From: Anwai Archit
Date: Sun, 17 Mar 2024 23:05:00 +0100
Subject: [PATCH 2/3] Remove raw transform

---
 scripts/datasets/check_dynamicnuclearnet.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/scripts/datasets/check_dynamicnuclearnet.py b/scripts/datasets/check_dynamicnuclearnet.py
index ccdaa70a..1b4c1256 100644
--- a/scripts/datasets/check_dynamicnuclearnet.py
+++ b/scripts/datasets/check_dynamicnuclearnet.py
@@ -1,17 +1,8 @@
-import numpy as np
-from torch_em.transform.raw import standardize, normalize_percentile
-
-from torch_em.data.datasets import get_dynamicnuclearnet_loader
 from torch_em.util.debug import check_loader
-
-DYNAMICNUCLEARNET_ROOT = "/home/anwai/data/deepcell/"
+from torch_em.data.datasets import get_dynamicnuclearnet_loader
 
 
-def raw_trafo(raw):
-    raw = normalize_percentile(raw, axis=(1, 2))
-    raw = np.mean(raw, axis=0)
-    raw = standardize(raw)
-    return raw
+DYNAMICNUCLEARNET_ROOT = "/home/anwai/data/deepcell/"
 
 
 # NOTE: the DynamicNuclearNet data cannot be downloaded automatically.

From ca0a7612e5b83037dac6e699ac083305b28869e4 Mon Sep 17 00:00:00 2001
From: Anwai Archit
Date: Sun, 17 Mar 2024 23:15:59 +0100
Subject: [PATCH 3/3] Add docstring for the dataset

---
 torch_em/data/datasets/dynamicnuclearnet.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/torch_em/data/datasets/dynamicnuclearnet.py b/torch_em/data/datasets/dynamicnuclearnet.py
index 5f815e0d..a4e831df 100644
--- a/torch_em/data/datasets/dynamicnuclearnet.py
+++ b/torch_em/data/datasets/dynamicnuclearnet.py
@@ -50,7 +50,10 @@ def _create_dataset(path, zip_path):
 def get_dynamicnuclearnet_dataset(
     path, split, patch_shape, download=False, **kwargs
 ):
-    """TODO"""
+    """Dataset for the segmentation of cell nuclei imaged with fluorescence microscopy.
+
+    This dataset is from the publication https://doi.org/10.1101/803205.
+    Please cite it if you use this dataset for a publication."""
     splits = ["train", "val", "test"]
     assert split in splits
 
@@ -81,7 +84,9 @@ def get_dynamicnuclearnet_dataset(
 def get_dynamicnuclearnet_loader(
     path, split, patch_shape, batch_size, download, **kwargs
 ):
-    """TODO"""
+    """Dataloader for the segmentation of cell nuclei for 5 different cell lines, imaged with fluorescence microscopy.
+    See `get_dynamicnuclearnet_dataset` for details.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
     dataset = get_dynamicnuclearnet_dataset(path, split, patch_shape, download, **ds_kwargs)
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)