Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/galaxy/config/sample/datatypes_conf.xml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,7 @@
<!-- rdeval types -->
<datatype extension="rd" type="galaxy.datatypes.binary:Binary" mimetype="application/octet-stream" display_in_upload="true" subclass="true" description="Rdeval read sketch"/>
<datatype extension="safetensors" type="galaxy.datatypes.binary:Safetensors" mimetype="application/octet-stream" display_in_upload="true" description="A simple format for storing tensors safely (as opposed to pickle) and that is still fast (zero-copy)" description_url="https://huggingface.co/docs/safetensors/index"/>
<datatype extension="spatialdata.zip" type="galaxy.datatypes.binary:SpatialData" mimetype="application/octet-stream" display_in_upload="true" description="A data framework that comprises a FAIR storage format and a collection of python libraries for performant access, alignment, and processing of uni- and multi-modal spatial omics datasets" description_url="https://github.com/scverse/spatialdata"/>
</registration>

<sniffers>
Expand Down Expand Up @@ -1287,6 +1288,7 @@
<sniffer type="galaxy.datatypes.qiime2:QIIME2Metadata"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Artifact"/>
<sniffer type="galaxy.datatypes.qiime2:QIIME2Visualization"/>
<sniffer type="galaxy.datatypes.binary:SpatialData"/>
<sniffer type="galaxy.datatypes.binary:CompressedOMEZarrZipArchive"/>
<sniffer type="galaxy.datatypes.binary:CompressedZarrZipArchive"/>
<sniffer type="galaxy.datatypes.binary:CompressedZipArchive"/>
Expand Down
322 changes: 322 additions & 0 deletions lib/galaxy/datatypes/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,12 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
dataset.metadata.zarr_format = format_version

def sniff(self, filename: str) -> bool:
"""
>>> from galaxy.datatypes.sniff import get_test_fname
>>> fname = get_test_fname('Images.zarr.zip')
>>> CompressedZarrZipArchive().sniff(fname)
True
"""
# Check if the zip file contains a zarr store.
# In theory, the zarr store must be in the root of the zip file.
# See: https://github.com/zarr-developers/zarr-python/issues/756#issuecomment-852134901
Expand Down Expand Up @@ -4866,6 +4872,322 @@ def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> N
dataset.metadata.version = struct.unpack("<i", header_bytes[4:8])[0]


class SpatialData(CompressedZarrZipArchive):
    """
    Class for SpatialData file: https://spatialdata.scverse.org/

    SpatialData: an open and universal framework for processing spatial omics data.
    SpatialData aims at implementing a performant in-memory representation in Python
    and an on-disk representation based on the Zarr and Parquet data formats
    and following, when applicable, the OME-NGFF specification.

    The format stores multi-modal spatial omics datasets including:
    - Images (2D/3D multi-scale)
    - Labels (segmentation masks)
    - Shapes (polygons, circles)
    - Points (transcript locations, point clouds)
    - Tables (annotations)
    """

    file_ext = "spatialdata.zip"

    # Minimal metadata for elements: plain counts are stored for most element
    # types to keep the per-dataset database footprint small; element *names*
    # are only kept for tables and coordinate systems, which tools may need to
    # select by name.
    MetadataElement(
        name="images_count",
        desc="Number of SpatialData image elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="labels_count",
        desc="Number of SpatialData label elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="shapes_count",
        desc="Number of SpatialData shape elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="points_count",
        desc="Number of SpatialData point elements",
        default=0,
        readonly=True,
        visible=True,
        no_value=0,
    )

    MetadataElement(
        name="tables",
        desc="SpatialData table elements",
        default=[],
        param=metadata.SelectParameter,
        multiple=True,
        readonly=True,
        visible=True,
    )

    MetadataElement(
        name="table_shapes",
        desc="SpatialData table shapes (n_obs, n_vars)",
        default={},
        param=metadata.DictParameter,
        readonly=True,
        visible=False,
    )

    MetadataElement(
        name="coordinate_systems",
        desc="SpatialData coordinate systems",
        default=[],
        param=metadata.SelectParameter,
        multiple=True,
        readonly=True,
        visible=True,
    )

    MetadataElement(
        name="spatialdata_version",
        desc="SpatialData software version",
        default="",
        readonly=True,
        visible=True,
        no_value="",
    )

    def set_peek(self, dataset: DatasetProtocol, **kwd) -> None:
        """
        Build a tree-style peek that mimics the repr of a SpatialData object,
        listing element counts, table names/shapes and coordinate systems.
        """
        if not dataset.dataset.purged:
            peek_lines = ["SpatialData object"]

            # Show zarr format if available (set by the parent class).
            if dataset.metadata.zarr_format:
                peek_lines[0] += f" (Zarr Format v{dataset.metadata.zarr_format})"

            # Show counts for each element type; getattr guards against
            # datasets whose metadata was set before these elements existed.
            if getattr(dataset.metadata, "images_count", 0):
                peek_lines.append(f"├── Images ({dataset.metadata.images_count})")

            if getattr(dataset.metadata, "labels_count", 0):
                peek_lines.append(f"├── Labels ({dataset.metadata.labels_count})")

            if getattr(dataset.metadata, "shapes_count", 0):
                peek_lines.append(f"├── Shapes ({dataset.metadata.shapes_count})")

            if getattr(dataset.metadata, "points_count", 0):
                peek_lines.append(f"├── Points ({dataset.metadata.points_count})")

            if dataset.metadata.tables:
                peek_lines.append(f"└── Tables ({len(dataset.metadata.tables)})")
                for tbl in dataset.metadata.tables:
                    # Add (n_obs, n_vars) shape information if available.
                    if dataset.metadata.table_shapes and tbl in dataset.metadata.table_shapes:
                        shape = dataset.metadata.table_shapes[tbl]
                        peek_lines.append(f"    └── '{tbl}': AnnData {shape}")
                    else:
                        peek_lines.append(f"    └── '{tbl}'")

            # Show coordinate systems if available.
            if dataset.metadata.coordinate_systems:
                peek_lines.append("")
                peek_lines.append("with coordinate systems:")
                for cs in dataset.metadata.coordinate_systems:
                    peek_lines.append(f"    • {cs}")

            dataset.peek = "\n".join(peek_lines)
            dataset.blurb = f"SpatialData file ({nice_size(dataset.get_size())})"
            if dataset.metadata.spatialdata_version:
                dataset.blurb += f"\nVersion: {dataset.metadata.spatialdata_version}"
        else:
            dataset.peek = "file does not exist"
            dataset.blurb = "file purged from disk"

    def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
        """
        Scan the zipped zarr store and populate element counts, table names
        and shapes, coordinate-system names and the SpatialData version.

        All parsing is best-effort: any failure leaves the metadata at its
        defaults rather than failing the job.
        """
        super().set_meta(dataset, overwrite=overwrite, **kwd)
        try:
            with zipfile.ZipFile(dataset.get_file_name()) as zf:
                # Read the archive member list once; a set gives O(1)
                # membership tests for the per-table lookups below.
                names = zf.namelist()
                name_set = set(names)

                # Track elements by type.
                images = set()
                labels = set()
                shapes = set()
                points = set()
                tables = set()
                coordinate_systems = set()
                spatialdata_version = ""

                # Find the root zarr directory (the entry holding the
                # top-level .zattrs of the store).
                root_zarr = None
                for file in names:
                    if file.endswith(".zarr/.zattrs"):
                        root_zarr = file.replace("/.zattrs", "")
                        break

                # Read root attributes for version info.
                if root_zarr:
                    root_attrs_path = f"{root_zarr}/.zattrs"
                    try:
                        with zf.open(root_attrs_path) as f:
                            root_attrs = json.load(f)
                        if "spatialdata_attrs" in root_attrs:
                            spatialdata_attrs = root_attrs["spatialdata_attrs"]
                            spatialdata_version = spatialdata_attrs.get("spatialdata_software_version", "")
                    except Exception:
                        pass

                # Parse all files to extract elements and coordinate systems.
                for file in names:
                    # Extract elements based on directory structure.
                    # Expected structure: <root>.zarr/<element_type>/<element_name>/...
                    if root_zarr and file.startswith(root_zarr + "/"):
                        rel_parts = file[len(root_zarr) + 1 :].split("/")
                        if len(rel_parts) >= 2:
                            element_type = rel_parts[0]
                            element_name = rel_parts[1]

                            # Skip metadata files and empty names.
                            if element_name and not element_name.startswith("."):
                                if element_type == "images":
                                    images.add(element_name)
                                elif element_type == "labels":
                                    labels.add(element_name)
                                elif element_type == "shapes":
                                    shapes.add(element_name)
                                elif element_type == "points":
                                    points.add(element_name)
                                elif element_type == "tables":
                                    tables.add(element_name)

                    # Extract coordinate system information from .zattrs files.
                    if file.endswith(".zattrs"):
                        try:
                            with zf.open(file) as f:
                                attrs = json.load(f)

                            # Coordinate transformations directly on the element.
                            if "coordinateTransformations" in attrs:
                                transforms = attrs["coordinateTransformations"]
                                if isinstance(transforms, list):
                                    for transform in transforms:
                                        if isinstance(transform, dict) and "output" in transform:
                                            output = transform["output"]
                                            if isinstance(output, dict) and "name" in output:
                                                coordinate_systems.add(output["name"])
                                            elif isinstance(output, str):
                                                coordinate_systems.add(output)

                            # Transformations nested in OME-NGFF multiscales
                            # (images/labels).
                            if "multiscales" in attrs:
                                multiscales = attrs["multiscales"]
                                if isinstance(multiscales, list):
                                    for ms in multiscales:
                                        if isinstance(ms, dict) and "coordinateTransformations" in ms:
                                            for ct in ms["coordinateTransformations"]:
                                                if isinstance(ct, dict) and "output" in ct:
                                                    output = ct["output"]
                                                    if isinstance(output, dict) and "name" in output:
                                                        coordinate_systems.add(output["name"])
                                                    elif isinstance(output, str):
                                                        coordinate_systems.add(output)

                            # Legacy spatialdata "transform" attribute: keys
                            # are the coordinate-system names.
                            if "transform" in attrs:
                                transform_dict = attrs["transform"]
                                if isinstance(transform_dict, dict):
                                    coordinate_systems.update(transform_dict.keys())
                        except Exception:
                            pass

                # Set metadata: counts for most elements, but keep table names,
                # table shapes and coordinate-system names for tool use.
                dataset.metadata.images_count = len(images)
                dataset.metadata.labels_count = len(labels)
                dataset.metadata.shapes_count = len(shapes)
                dataset.metadata.points_count = len(points)

                dataset.metadata.tables = sorted(tables)
                table_shapes = {}
                for table_name in tables:
                    try:
                        # AnnData-on-zarr layout: the obs/var index arrays
                        # carry the table dimensions in their .zarray shape.
                        obs_index_path = f"{root_zarr}/tables/{table_name}/obs/_index/.zarray"
                        var_index_path = f"{root_zarr}/tables/{table_name}/var/_index/.zarray"
                        n_obs = None
                        n_vars = None
                        if obs_index_path in name_set:
                            with zf.open(obs_index_path) as f:
                                obs_array = json.load(f)
                            n_obs = obs_array.get("shape", [None])[0]
                        if var_index_path in name_set:
                            with zf.open(var_index_path) as f:
                                var_array = json.load(f)
                            n_vars = var_array.get("shape", [None])[0]
                        if n_obs is not None and n_vars is not None:
                            table_shapes[table_name] = (n_obs, n_vars)
                    except Exception:
                        pass
                dataset.metadata.table_shapes = table_shapes
                dataset.metadata.coordinate_systems = sorted(coordinate_systems)
                dataset.metadata.spatialdata_version = spatialdata_version
        except Exception:
            pass

    def sniff(self, filename: str) -> bool:
        """
        Check if the file is a valid SpatialData zarr archive.

        SpatialData files are Zarr archives with specific structure containing
        a root .zattrs file with spatialdata_attrs metadata and element directories
        like images/, labels/, shapes/, points/, or tables/.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('subsampled_visium.spatialdata.zip')
        >>> SpatialData().sniff(fname)
        True
        >>> fname = get_test_fname('Images.zarr.zip')
        >>> SpatialData().sniff(fname)
        False
        """
        try:
            # First, check if this is a zarr-in-zip archive at all; done
            # before opening the zip ourselves to avoid a redundant open.
            if not super().sniff(filename):
                return False

            with zipfile.ZipFile(filename) as zf:
                # Look for the root .zattrs file with spatialdata_attrs.
                # This distinguishes spatialdata from other zarr archives.
                for file in zf.namelist():
                    # The zarr store can be at the archive root (".zattrs")
                    # or one level deep ("<name>.zarr/.zattrs").
                    parts = file.split("/")
                    if file == ".zattrs" or (len(parts) == 2 and parts[0].endswith(".zarr") and parts[1] == ".zattrs"):
                        try:
                            with zf.open(file) as f:
                                attrs = json.load(f)
                            # SpatialData-specific marker attribute.
                            if "spatialdata_attrs" in attrs:
                                return True
                        except Exception:
                            pass

                return False
        except Exception:
            # Any exception during parsing means it's not a valid spatialdata file
            return False


@build_sniff_from_prefix
class Safetensors(Binary):
"""
Expand Down
Binary file added lib/galaxy/datatypes/test/Images.zarr.zip
Binary file not shown.
Binary file not shown.
Loading