From 9dbb258d0d0fd191e3485f5ff98f07efccfbb6d7 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@informatik.uni-goettingen.de>
Date: Sat, 15 Jul 2023 22:11:51 +0200
Subject: [PATCH] Add doc strings to datasets to provide the correct dataset
 references

---
 torch_em/data/datasets/axondeepseg.py      |  7 ++
 torch_em/data/datasets/covid_if.py         |  7 ++
 torch_em/data/datasets/cremi.py            | 80 ++++++++++++----------
 torch_em/data/datasets/deepbacs.py         |  7 ++
 torch_em/data/datasets/dsb.py              |  7 ++
 torch_em/data/datasets/hpa.py              |  7 ++
 torch_em/data/datasets/isbi2012.py         |  8 ++-
 torch_em/data/datasets/kasthuri.py         |  6 ++
 torch_em/data/datasets/livecell.py         | 14 +++-
 torch_em/data/datasets/lizard.py           |  6 ++
 torch_em/data/datasets/lucchi.py           |  6 ++
 torch_em/data/datasets/mitoem.py           |  6 ++
 torch_em/data/datasets/mouse_embryo.py     |  5 ++
 torch_em/data/datasets/neurips_cell_seg.py | 10 +++
 torch_em/data/datasets/nuc_mm.py           |  6 ++
 torch_em/data/datasets/plantseg.py         |  6 ++
 torch_em/data/datasets/platynereis.py      | 24 +++++++
 torch_em/data/datasets/snemi.py            | 56 ++++++++-------
 torch_em/data/datasets/sponge_em.py        |  6 ++
 torch_em/data/datasets/tissuenet.py        |  8 +++
 torch_em/data/datasets/uro_cell.py         |  6 ++
 torch_em/data/datasets/vnc.py              |  6 ++
 22 files changed, 226 insertions(+), 68 deletions(-)

diff --git a/torch_em/data/datasets/axondeepseg.py b/torch_em/data/datasets/axondeepseg.py
index 0fae333d..856a0852 100644
--- a/torch_em/data/datasets/axondeepseg.py
+++ b/torch_em/data/datasets/axondeepseg.py
@@ -127,6 +127,11 @@ def _require_axondeepseg_data(path, name, download):
 def get_axondeepseg_dataset(
     path, name, patch_shape, download=False, one_hot_encoding=False, data_fraction=None, split=None, **kwargs
 ):
+    """Dataset for the segmentation of myelinated axons in EM.
+
+    This dataset is from the publication https://doi.org/10.1038/s41598-018-22181-4.
+    Please cite it if you use this dataset for a publication.
+    """
     if isinstance(name, str):
         name = [name]
     assert isinstance(name, (tuple, list))
@@ -168,6 +173,8 @@ def get_axondeepseg_loader(
     download=False, one_hot_encoding=False,
     data_fraction=None, split=None, **kwargs
 ):
+    """Dataloader for the segmentation of myelinated axons. See 'get_axondeepseg_dataset' for details.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
     dataset = get_axondeepseg_dataset(
         path, name, patch_shape, download=download, one_hot_encoding=one_hot_encoding,
diff --git a/torch_em/data/datasets/covid_if.py b/torch_em/data/datasets/covid_if.py
index d2885989..22d3d53d 100644
--- a/torch_em/data/datasets/covid_if.py
+++ b/torch_em/data/datasets/covid_if.py
@@ -25,6 +25,11 @@ def get_covid_if_dataset(
     path, patch_shape, sample_range=None, target="cells", download=False,
     offsets=None, boundaries=False, binary=False, **kwargs
 ):
+    """Dataset for the cells and nuclei in immunofluorescence.
+
+    This dataset is from the publication https://doi.org/10.1002/bies.202000257.
+    Please cite it if you use this dataset for a publication.
+    """
     available_targets = ("cells", "nuclei")
     # TODO also support infected_cells
     # available_targets = ("cells", "nuclei", "infected_cells")
@@ -63,6 +68,8 @@ def get_covid_if_loader(
     path, patch_shape, batch_size, sample_range=None, target="cells", download=False,
     offsets=None, boundaries=False, binary=False, **kwargs
 ):
+    """Dataloader for the segmentation of cells and nuclei in immunofluorescence. See 'get_covid_if_dataset'.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
     dataset = get_covid_if_dataset(
         path, patch_shape, sample_range=sample_range, target=target, download=download,
diff --git a/torch_em/data/datasets/cremi.py b/torch_em/data/datasets/cremi.py
index e035eba8..d0a07c3f 100644
--- a/torch_em/data/datasets/cremi.py
+++ b/torch_em/data/datasets/cremi.py
@@ -25,44 +25,6 @@
 
 
 # TODO add support for realigned volumes
-def get_cremi_loader(
-    path,
-    patch_shape,
-    batch_size,
-    samples=("A", "B", "C"),
-    use_realigned=False,
-    download=False,
-    offsets=None,
-    boundaries=False,
-    rois={},
-    defect_augmentation_kwargs={
-        "p_drop_slice": 0.025,
-        "p_low_contrast": 0.025,
-        "p_deform_slice": 0.0,
-        "deformation_mode": "compress",
-    },
-    **kwargs,
-):
-    """
-    """
-    dataset_kwargs, loader_kwargs = util.split_kwargs(
-        torch_em.default_segmentation_dataset, **kwargs
-    )
-    ds = get_cremi_dataset(
-        path=path,
-        patch_shape=patch_shape,
-        samples=samples,
-        use_realigned=use_realigned,
-        download=download,
-        offsets=offsets,
-        boundaries=boundaries,
-        rois=rois,
-        defect_augmentation_kwargs=defect_augmentation_kwargs,
-        **dataset_kwargs,
-    )
-    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
-
-
 def get_cremi_dataset(
     path,
     patch_shape,
@@ -80,6 +42,10 @@ def get_cremi_dataset(
     },
     **kwargs,
 ):
+    """Dataset for the segmentation of neurons in EM.
+
+    This dataset is from the CREMI challenge: https://cremi.org/.
+    """
     assert len(patch_shape) == 3
     if rois is not None:
         assert isinstance(rois, dict)
@@ -132,3 +98,41 @@ def get_cremi_dataset(
     )
 
     return torch_em.default_segmentation_dataset(data_paths, raw_key, data_paths, label_key, patch_shape, **kwargs)
+
+
+def get_cremi_loader(
+    path,
+    patch_shape,
+    batch_size,
+    samples=("A", "B", "C"),
+    use_realigned=False,
+    download=False,
+    offsets=None,
+    boundaries=False,
+    rois={},
+    defect_augmentation_kwargs={
+        "p_drop_slice": 0.025,
+        "p_low_contrast": 0.025,
+        "p_deform_slice": 0.0,
+        "deformation_mode": "compress",
+    },
+    **kwargs,
+):
+    """Dataloader for the segmentation of neurons in EM. See 'get_cremi_dataset' for details.
+    """
+    dataset_kwargs, loader_kwargs = util.split_kwargs(
+        torch_em.default_segmentation_dataset, **kwargs
+    )
+    ds = get_cremi_dataset(
+        path=path,
+        patch_shape=patch_shape,
+        samples=samples,
+        use_realigned=use_realigned,
+        download=download,
+        offsets=offsets,
+        boundaries=boundaries,
+        rois=rois,
+        defect_augmentation_kwargs=defect_augmentation_kwargs,
+        **dataset_kwargs,
+    )
+    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
diff --git a/torch_em/data/datasets/deepbacs.py b/torch_em/data/datasets/deepbacs.py
index 25d5647b..32bd582b 100644
--- a/torch_em/data/datasets/deepbacs.py
+++ b/torch_em/data/datasets/deepbacs.py
@@ -43,6 +43,11 @@ def _get_paths(path, bac_type, split):
 def get_deepbacs_dataset(
     path, split, patch_shape, bac_type="mixed", download=False, **kwargs
 ):
+    """Dataset for the segmentation of bacteria in light microscopy.
+
+    This dataset is from the publication https://doi.org/10.1038/s42003-022-03634-z.
+    Please cite it if you use this dataset for a publication.
+    """
     assert split in ("train", "test")
     bac_types = list(URLS.keys())
     assert bac_type in bac_types, f"{bac_type} is not in expected bacteria types: {bac_types}"
@@ -60,6 +65,8 @@ def get_deepbacs_dataset(
 
 
 def get_deepbacs_loader(path, split, patch_shape, batch_size, bac_type="mixed", download=False, **kwargs):
+    """Dataloader for the segmentation of bacteria in light microscopy. See 'get_deepbacs_dataset' for details.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
     dataset = get_deepbacs_dataset(path, split, patch_shape, bac_type=bac_type, download=download, **ds_kwargs)
     loader = torch_em.get_data_loader(dataset, batch_size, **loader_kwargs)
diff --git a/torch_em/data/datasets/dsb.py b/torch_em/data/datasets/dsb.py
index fd244285..559d573c 100644
--- a/torch_em/data/datasets/dsb.py
+++ b/torch_em/data/datasets/dsb.py
@@ -38,6 +38,11 @@ def get_dsb_dataset(
     offsets=None, boundaries=False, binary=False,
     source="reduced", **kwargs
 ):
+    """Dataset for the segmentation of nuclei in light microscopy.
+
+    This dataset is from the publication https://doi.org/10.1038/s41592-019-0612-7.
+    Please cite it if you use this dataset for a publication.
+    """
     assert split in ("test", "train"), split
     _download_dsb(path, source, download)
 
@@ -58,6 +63,8 @@ def get_dsb_loader(
     offsets=None, boundaries=False, binary=False,
     source="reduced", **kwargs
 ):
+    """Dataloader for the segmentation of nuclei in light microscopy. See 'get_dsb_dataset' for details.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/hpa.py b/torch_em/data/datasets/hpa.py
index dae1b0e1..dc54062f 100644
--- a/torch_em/data/datasets/hpa.py
+++ b/torch_em/data/datasets/hpa.py
@@ -312,6 +312,11 @@ def get_hpa_segmentation_dataset(
     channels=["microtubules", "protein", "nuclei", "er"],
     download=False, n_workers_preproc=8, **kwargs
 ):
+    """Dataset for the segmentation of cells in light microscopy.
+
+    This dataset is from the publication https://doi.org/10.1038/s41592-019-0658-6.
+    Please cite it if you use this dataset for a publication.
+    """
     data_is_complete = _check_data(path)
     if not data_is_complete:
         _download_hpa_data(path, "segmentation", download)
@@ -336,6 +341,8 @@ def get_hpa_segmentation_loader(
     channels=["microtubules", "protein", "nuclei", "er"],
     download=False, n_workers_preproc=8, **kwargs
 ):
+    """Dataloader for the segmentation of cells in light microscopy. See 'get_hpa_segmentation_dataset' for details.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/isbi2012.py b/torch_em/data/datasets/isbi2012.py
index 309b3554..b3f390fd 100644
--- a/torch_em/data/datasets/isbi2012.py
+++ b/torch_em/data/datasets/isbi2012.py
@@ -9,6 +9,11 @@ def get_isbi_dataset(
     path, patch_shape, download=False, offsets=None, boundaries=False,
     use_original_labels=False, **kwargs
 ):
+    """Dataset for the segmentation of neurons in EM.
+
+    This dataset is from the publication https://doi.org/10.3389/fnana.2015.00142.
+    Please cite it if you use this dataset for a publication.
+    """
     if not path.endswith(".h5"):
         raise ValueError("Isbi path must be a hdf5 file.")
     assert len(patch_shape) == 3
@@ -32,8 +37,7 @@ def get_isbi_loader(
     use_original_labels=False,
     **kwargs
 ):
-    """
-    """
+    """Dataloader for the segmentation of neurons in EM. See 'get_isbi_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/kasthuri.py b/torch_em/data/datasets/kasthuri.py
index f1a6c21a..76b30869 100644
--- a/torch_em/data/datasets/kasthuri.py
+++ b/torch_em/data/datasets/kasthuri.py
@@ -81,6 +81,11 @@ def _require_kasthuri_data(path, download):
 
 
 def get_kasthuri_dataset(path, split, patch_shape, download=False, **kwargs):
+    """Dataset for the segmentation of mitochondria in EM.
+
+    This dataset is from the publication https://doi.org/10.48550/arXiv.1812.06024.
+    Please cite it if you use this dataset for a publication.
+    """
     assert split in ("train", "test")
     _require_kasthuri_data(path, download)
     data_path = os.path.join(path, f"kasthuri_{split}.h5")
@@ -90,6 +95,7 @@ def get_kasthuri_dataset(path, split, patch_shape, download=False, **kwargs):
 
 
 def get_kasthuri_loader(path, split, patch_shape, batch_size, download=False, **kwargs):
+    """Dataloader for the segmentation of mitochondria in EM. See 'get_kasthuri_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/livecell.py b/torch_em/data/datasets/livecell.py
index e4c63b42..df50c843 100644
--- a/torch_em/data/datasets/livecell.py
+++ b/torch_em/data/datasets/livecell.py
@@ -149,6 +149,11 @@ def get_livecell_dataset(
     offsets=None, boundaries=False, binary=False,
     cell_types=None, label_path=None, label_dtype=torch.int64, **kwargs
 ):
+    """Dataset for the segmentation of cells in phase-contrast microscopy.
+
+    This dataset is from the publication https://doi.org/10.1038/s41592-021-01249-6.
+    Please cite it if you use this dataset for a publication.
+    """
     assert split in ("train", "val", "test")
     if cell_types is not None:
         assert isinstance(cell_types, (list, tuple)),\
@@ -169,9 +174,12 @@ def get_livecell_dataset(
     return dataset
 
 
-def get_livecell_loader(path, split, patch_shape, batch_size, download=False,
-                        offsets=None, boundaries=False, binary=False,
-                        cell_types=None, label_path=None, label_dtype=torch.int64, **kwargs):
+def get_livecell_loader(
+    path, split, patch_shape, batch_size, download=False,
+    offsets=None, boundaries=False, binary=False,
+    cell_types=None, label_path=None, label_dtype=torch.int64, **kwargs
+):
+    """Dataloader for the segmentation of cells in phase-contrast microscopy. See 'get_livecell_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
     dataset = get_livecell_dataset(
         path, split, patch_shape, download=download, offsets=offsets, boundaries=boundaries, binary=binary,
diff --git a/torch_em/data/datasets/lizard.py b/torch_em/data/datasets/lizard.py
index 7cc1b8cc..30384e91 100644
--- a/torch_em/data/datasets/lizard.py
+++ b/torch_em/data/datasets/lizard.py
@@ -80,6 +80,11 @@ def _require_lizard_data(path, download):
 
 
 def get_lizard_dataset(path, patch_shape, download=False, **kwargs):
+    """Dataset for the segmentation of nuclei in histopathology.
+
+    This dataset is from the publication https://doi.org/10.48550/arXiv.2108.11195.
+    Please cite it if you use this dataset for a publication.
+    """
     _require_lizard_data(path, download)
 
     data_paths = glob(os.path.join(path, "*.h5"))
@@ -96,6 +101,7 @@ def get_lizard_dataset(path, patch_shape, download=False, **kwargs):
 # TODO implement selecting different tissue types
 # TODO implement train / val / test split (is pre-defined in a csv)
 def get_lizard_loader(path, patch_shape, batch_size, download=False, **kwargs):
+    """Dataloader for the segmentation of nuclei in histopathology. See 'get_lizard_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/lucchi.py b/torch_em/data/datasets/lucchi.py
index ea3116a9..981c4eef 100644
--- a/torch_em/data/datasets/lucchi.py
+++ b/torch_em/data/datasets/lucchi.py
@@ -78,6 +78,11 @@ def _require_lucchi_data(path, download):
 
 
 def get_lucchi_dataset(path, split, patch_shape, download=False, **kwargs):
+    """Dataset for the segmentation of mitochondria in EM.
+
+    This dataset is from the publication https://doi.org/10.48550/arXiv.1812.06024.
+    Please cite it if you use this dataset for a publication.
+    """
     assert split in ("train", "test")
     _require_lucchi_data(path, download)
     data_path = os.path.join(path, f"lucchi_{split}.h5")
@@ -87,6 +92,7 @@ def get_lucchi_dataset(path, split, patch_shape, download=False, **kwargs):
 
 
 def get_lucchi_loader(path, split, patch_shape, batch_size, download=False, **kwargs):
+    """Dataloader for the segmentation of mitochondria in EM. See 'get_lucchi_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/mitoem.py b/torch_em/data/datasets/mitoem.py
index e8ff86c1..b78128b1 100644
--- a/torch_em/data/datasets/mitoem.py
+++ b/torch_em/data/datasets/mitoem.py
@@ -135,6 +135,11 @@ def get_mitoem_dataset(
     binary=False,
     **kwargs,
 ):
+    """Dataset for the segmentation of mitochondria in EM.
+
+    This dataset is from the publication https://doi.org/10.1007/978-3-030-59722-1_7.
+    Please cite it if you use this dataset for a publication.
+    """
     assert len(patch_shape) == 3
     if isinstance(splits, str):
         splits = [splits]
@@ -175,6 +180,7 @@ def get_mitoem_loader(
     binary=False,
     **kwargs,
 ):
+    """Dataloader for the segmentation of mitochondria in EM. See 'get_mitoem_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/mouse_embryo.py b/torch_em/data/datasets/mouse_embryo.py
index 10f0ef77..fb6839dd 100644
--- a/torch_em/data/datasets/mouse_embryo.py
+++ b/torch_em/data/datasets/mouse_embryo.py
@@ -30,6 +30,10 @@ def get_mouse_embryo_dataset(
     binary=False,
     **kwargs,
 ):
+    """Dataset for the segmentation of cells and nuclei in confocal microscopy.
+
+    This dataset is stored on zenodo: https://zenodo.org/record/6546550.
+    """
     assert name in ("membrane", "nuclei")
     assert split in ("train", "val")
     assert len(patch_shape) == 3
@@ -62,6 +66,7 @@ def get_mouse_embryo_loader(
     binary=False,
     **kwargs,
 ):
+    """Dataloader for the segmentation of cells and nuclei in confocal microscopy. See 'get_mouse_embryo_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/neurips_cell_seg.py b/torch_em/data/datasets/neurips_cell_seg.py
index 27a1660b..9a55bb26 100644
--- a/torch_em/data/datasets/neurips_cell_seg.py
+++ b/torch_em/data/datasets/neurips_cell_seg.py
@@ -75,6 +75,10 @@ def get_neurips_cellseg_supervised_dataset(
     sampler=None,
     val_fraction=0.1,
 ):
+    """Dataset for the segmentation of cells in light microscopy.
+
+    This dataset is part of the NeurIPS Cell Segmentation challenge: https://neurips22-cellseg.grand-challenge.org/.
+    """
     assert split in ("train", "val", None), split
     image_paths, label_paths = _get_image_and_label_paths(root, split, val_fraction)
 
@@ -111,6 +115,7 @@ def get_neurips_cellseg_supervised_loader(
     val_fraction=0.1,
     **loader_kwargs
 ):
+    """Dataloader for the segmentation of cells in light microscopy. See 'get_neurips_cellseg_supervised_dataset'."""
     ds = get_neurips_cellseg_supervised_dataset(
         root, split, patch_shape, make_rgb=make_rgb, label_transform=label_transform,
         label_transform2=label_transform2, raw_transform=raw_transform, transform=transform,
@@ -157,6 +162,10 @@ def get_neurips_cellseg_unsupervised_dataset(
     use_images=True,
     use_wholeslide=True,
 ):
+    """Dataset for the segmentation of cells in light microscopy.
+
+    This dataset is part of the NeurIPS Cell Segmentation challenge: https://neurips22-cellseg.grand-challenge.org/.
+    """
     if raw_transform is None:
         trafo = to_rgb if make_rgb else None
         raw_transform = torch_em.transform.get_raw_transform(augmentation2=trafo)
@@ -196,6 +205,7 @@ def get_neurips_cellseg_unsupervised_loader(
     use_wholeslide=True,
     **loader_kwargs,
 ):
+    """Dataloader for the segmentation of cells in light microscopy. See 'get_neurips_cellseg_unsupervised_dataset'."""
     ds = get_neurips_cellseg_unsupervised_dataset(
         root, patch_shape, make_rgb=make_rgb, raw_transform=raw_transform, transform=transform,
         dtype=dtype, sampler=sampler, use_images=use_images, use_wholeslide=use_wholeslide
diff --git a/torch_em/data/datasets/nuc_mm.py b/torch_em/data/datasets/nuc_mm.py
index 03b8ebd2..bf1c41fd 100644
--- a/torch_em/data/datasets/nuc_mm.py
+++ b/torch_em/data/datasets/nuc_mm.py
@@ -44,6 +44,11 @@ def _require_dataset(path, sample):
 
 
 def get_nuc_mm_dataset(path, sample, split, patch_shape, download=False, **kwargs):
+    """Dataset for the segmentation of nuclei in EM and X-Ray.
+
+    This dataset is from the publication https://doi.org/10.1007/978-3-030-87193-2_16.
+    Please cite it if you use this dataset for a publication.
+    """
     assert sample in ("mouse", "zebrafish")
     assert split in ("train", "val")
 
@@ -61,6 +66,7 @@ def get_nuc_mm_dataset(path, sample, split, patch_shape, download=False, **kwarg
 
 
 def get_nuc_mm_loader(path, sample, split, patch_shape, batch_size, download=False, **kwargs):
+    """Dataloader for the segmentation of nuclei in EM and X-Ray. See 'get_nuc_mm_dataset' for details."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/plantseg.py b/torch_em/data/datasets/plantseg.py
index 2af977c0..1a85f1f2 100644
--- a/torch_em/data/datasets/plantseg.py
+++ b/torch_em/data/datasets/plantseg.py
@@ -71,6 +71,11 @@ def get_plantseg_dataset(
     binary=False,
     **kwargs,
 ):
+    """Dataset for the segmentation of plant cells in confocal and light-sheet microscopy.
+
+    This dataset is from the publication https://doi.org/10.7554/eLife.57613.
+    Please cite it if you use this dataset for a publication.
+    """
     assert len(patch_shape) == 3
     data_path = _require_plantseg_data(path, download, name, split)
 
@@ -99,6 +104,7 @@ def get_plantseg_loader(
     binary=False,
     **kwargs,
 ):
+    """Dataloader for the segmentation of cells in confocal and light-sheet microscopy. See 'get_plantseg_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/platynereis.py b/torch_em/data/datasets/platynereis.py
index 43dd3768..95d51c38 100644
--- a/torch_em/data/datasets/platynereis.py
+++ b/torch_em/data/datasets/platynereis.py
@@ -44,6 +44,11 @@ def _check_data(path, prefix, extension, n_files):
 
 
 def get_platynereis_cuticle_dataset(path, patch_shape, sample_ids=None, download=False, **kwargs):
+    """Dataset for the segmentation of cuticle in EM.
+
+    This dataset is from the publication https://doi.org/10.1016/j.cell.2021.07.017.
+    Please cite it if you use this dataset for a publication.
+    """
     cuticle_root = os.path.join(path, "cuticle")
 
     ext = ".n5"
@@ -64,6 +69,7 @@ def get_platynereis_cuticle_dataset(path, patch_shape, sample_ids=None, download
 def get_platynereis_cuticle_loader(
     path, patch_shape, batch_size, sample_ids=None, download=False, **kwargs
 ):
+    """Dataloader for the segmentation of cuticle in EM. See 'get_platynereis_cuticle_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
@@ -78,6 +84,11 @@ def get_platynereis_cilia_dataset(
     offsets=None, boundaries=False, binary=False,
     download=False, **kwargs
 ):
+    """Dataset for the segmentation of cilia in EM.
+
+    This dataset is from the publication https://doi.org/10.1016/j.cell.2021.07.017.
+    Please cite it if you use this dataset for a publication.
+    """
     assert split in ("train", "val")
     cilia_root = os.path.join(path, "cilia")
 
@@ -102,6 +113,7 @@ def get_platynereis_cilia_loader(
     offsets=None, boundaries=False, binary=False,
     download=False, **kwargs
 ):
+    """Dataloader for the segmentation of cilia in EM. See 'get_platynereis_cilia_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
@@ -119,6 +131,11 @@ def get_platynereis_cell_dataset(
     offsets=None, boundaries=False,
     download=False, **kwargs
 ):
+    """Dataset for the segmentation of cells in EM.
+
+    This dataset is from the publication https://doi.org/10.1016/j.cell.2021.07.017.
+    Please cite it if you use this dataset for a publication.
+    """
     cell_root = os.path.join(path, "membrane")
 
     prefix = "train_data_membrane_"
@@ -157,6 +174,7 @@ def get_platynereis_cell_loader(
     offsets=None, boundaries=False,
     download=False, **kwargs
 ):
+    """Dataloader for the segmentation of cells in EM. See 'get_platynereis_cell_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
@@ -173,6 +191,11 @@ def get_platynereis_nuclei_dataset(
     offsets=None, boundaries=False, binary=False,
     download=False, **kwargs,
 ):
+    """Dataset for the segmentation of nuclei in EM.
+
+    This dataset is from the publication https://doi.org/10.1016/j.cell.2021.07.017.
+    Please cite it if you use this dataset for a publication.
+    """
     nuc_root = os.path.join(path, "nuclei")
     prefix = "train_data_nuclei_"
     ext = ".h5"
@@ -211,6 +234,7 @@ def get_platynereis_nuclei_loader(
     offsets=None, boundaries=False, binary=False,
     download=False, **kwargs
 ):
+    """Dataloader for the segmentation of nuclei in EM. See 'get_platynereis_nuclei_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/snemi.py b/torch_em/data/datasets/snemi.py
index 3b9148f6..ca1255f2 100644
--- a/torch_em/data/datasets/snemi.py
+++ b/torch_em/data/datasets/snemi.py
@@ -13,31 +13,6 @@
 }
 
 
-def get_snemi_loader(
-    path,
-    patch_shape,
-    batch_size,
-    sample="train",
-    download=False,
-    offsets=None,
-    boundaries=False,
-    **kwargs,
-):
-    ds_kwargs, loader_kwargs = util.split_kwargs(
-        torch_em.default_segmentation_dataset, **kwargs
-    )
-    ds = get_snemi_dataset(
-        path=path,
-        patch_shape=patch_shape,
-        sample=sample,
-        download=download,
-        offsets=offsets,
-        boundaries=boundaries,
-        **ds_kwargs,
-    )
-    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
-
-
 def get_snemi_dataset(
     path,
     patch_shape,
@@ -47,6 +22,11 @@ def get_snemi_dataset(
     boundaries=False,
     **kwargs,
 ):
+    """Dataset for the segmentation of neurons in EM.
+
+    This dataset is from the publication https://doi.org/10.1016/j.cell.2015.06.054.
+    Please cite it if you use this dataset for a publication.
+    """
     assert len(patch_shape) == 3
     os.makedirs(path, exist_ok=True)
 
@@ -62,3 +42,29 @@ def get_snemi_dataset(
     raw_key = "volumes/raw"
     label_key = "volumes/labels/neuron_ids"
     return torch_em.default_segmentation_dataset(data_path, raw_key, data_path, label_key, patch_shape, **kwargs)
+
+
+def get_snemi_loader(
+    path,
+    patch_shape,
+    batch_size,
+    sample="train",
+    download=False,
+    offsets=None,
+    boundaries=False,
+    **kwargs,
+):
+    """Dataloader for the segmentation of neurons in EM. See 'get_snemi_dataset'."""
+    ds_kwargs, loader_kwargs = util.split_kwargs(
+        torch_em.default_segmentation_dataset, **kwargs
+    )
+    ds = get_snemi_dataset(
+        path=path,
+        patch_shape=patch_shape,
+        sample=sample,
+        download=download,
+        offsets=offsets,
+        boundaries=boundaries,
+        **ds_kwargs,
+    )
+    return torch_em.get_data_loader(ds, batch_size=batch_size, **loader_kwargs)
diff --git a/torch_em/data/datasets/sponge_em.py b/torch_em/data/datasets/sponge_em.py
index 15de1b57..f5553584 100644
--- a/torch_em/data/datasets/sponge_em.py
+++ b/torch_em/data/datasets/sponge_em.py
@@ -16,6 +16,11 @@ def _require_sponge_em_data(path, download):
 
 
 def get_sponge_em_dataset(path, mode, patch_shape, sample_ids=None, download=False, **kwargs):
+    """Dataset for the segmentation of sponge cells and organelles in EM.
+
+    This dataset is from the publication https://doi.org/10.1126/science.abj2949.
+    Please cite it if you use this dataset for a publication.
+    """
     assert mode in ("semantic", "instances")
 
     n_files = len(glob(os.path.join(path, "*.h5")))
@@ -34,6 +39,7 @@ def get_sponge_em_dataset(path, mode, patch_shape, sample_ids=None, download=Fal
 
 
 def get_sponge_em_loader(path, mode, patch_shape, batch_size, sample_ids=None, download=False, **kwargs):
+    """Dataloader for the segmentation of sponge cells and organelles in EM. See 'get_sponge_em_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/tissuenet.py b/torch_em/data/datasets/tissuenet.py
index 7d41f2f3..f6098eee 100644
--- a/torch_em/data/datasets/tissuenet.py
+++ b/torch_em/data/datasets/tissuenet.py
@@ -53,6 +53,11 @@ def _create_dataset(path, zip_path):
 def get_tissuenet_dataset(
     path, split, patch_shape, raw_channel, label_channel, download=False, **kwargs
 ):
+    """Dataset for the segmentation of cells in tissue imaged with light microscopy.
+
+    This dataset is from the publication https://doi.org/10.1038/s41587-021-01094-0.
+    Please cite it if you use this dataset for a publication.
+    """
     assert raw_channel in ("nucleus", "cell", "rgb")
     assert label_channel in ("nucleus", "cell")
 
@@ -90,6 +95,9 @@ def get_tissuenet_dataset(
 def get_tissuenet_loader(
     path, split, patch_shape, batch_size, raw_channel, label_channel, download=False, **kwargs
 ):
+    """Dataloader for the segmentation of cells in tissue imaged with light microscopy.
+    See 'get_tissuenet_dataset' for details.
+    """
     ds_kwargs, loader_kwargs = util.split_kwargs(torch_em.default_segmentation_dataset, **kwargs)
     dataset = get_tissuenet_dataset(
         path, split, patch_shape, raw_channel, label_channel, download, **ds_kwargs
diff --git a/torch_em/data/datasets/uro_cell.py b/torch_em/data/datasets/uro_cell.py
index 332c2e07..fb8e92ad 100644
--- a/torch_em/data/datasets/uro_cell.py
+++ b/torch_em/data/datasets/uro_cell.py
@@ -85,6 +85,11 @@ def get_uro_cell_dataset(
     binary=False,
     **kwargs
 ):
+    """Dataset for the segmentation of mitochondria and other organelles in EM.
+
+    This dataset is from the publication https://doi.org/10.1016/j.compbiomed.2020.103693.
+    Please cite it if you use this dataset for a publication.
+    """
     assert target in ("fv", "golgi", "lyso", "mito")
     _require_urocell_data(path, download)
     paths, label_key = _get_paths(path, target)
@@ -132,6 +137,7 @@ def get_uro_cell_loader(
     binary=False,
     **kwargs
 ):
+    """Dataloader for the segmentation of mitochondria and other organelles in EM. See 'get_uro_cell_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )
diff --git a/torch_em/data/datasets/vnc.py b/torch_em/data/datasets/vnc.py
index be623296..676dc132 100644
--- a/torch_em/data/datasets/vnc.py
+++ b/torch_em/data/datasets/vnc.py
@@ -59,6 +59,11 @@ def get_vnc_mito_dataset(
     download=False,
     **kwargs
 ):
+    """Dataset for the segmentation of mitochondria in EM.
+
+    This dataset is from https://doi.org/10.6084/m9.figshare.856713.v1.
+    Please cite it if you use this dataset for a publication.
+    """
     _get_vnc_data(path, download)
     data_path = os.path.join(path, "vnc_train.h5")
 
@@ -81,6 +86,7 @@ def get_vnc_mito_loader(
     download=False,
     **kwargs
 ):
+    """Dataloader for the segmentation of mitochondria in EM. See 'get_vnc_mito_dataset'."""
     ds_kwargs, loader_kwargs = util.split_kwargs(
         torch_em.default_segmentation_dataset, **kwargs
     )