Skip to content

Commit

Permalink
create metadata module, move the metadata classes there, restore prop…
Browse files Browse the repository at this point in the history
…er attributes of metadata classes
  • Loading branch information
d-v-b committed Jun 11, 2024
1 parent ee30347 commit 40ea349
Show file tree
Hide file tree
Showing 7 changed files with 303 additions and 351 deletions.
7 changes: 4 additions & 3 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
from zarr.chunk_key_encodings import ChunkKeyEncoding
from zarr.common import JSON, ChunkCoords, MemoryOrder, OpenMode, ZarrFormat
from zarr.group import AsyncGroup
from zarr.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.metadata.v2 import ArrayMetadata
from zarr.metadata.v3 import ArrayMetadata
from zarr.store import (
StoreLike,
make_store_path,
Expand Down Expand Up @@ -59,11 +60,11 @@ def _like_args(a: ArrayLike, kwargs: dict[str, Any]) -> dict[str, Any]:

if isinstance(a, AsyncArray):
new["order"] = a.order
if isinstance(a.metadata, ArrayV2Metadata):
if isinstance(a.metadata, ArrayMetadata):
new["compressor"] = a.metadata.compressor
new["filters"] = a.metadata.filters

if isinstance(a.metadata, ArrayV3Metadata):
if isinstance(a.metadata, ArrayMetadata):
new["codecs"] = a.metadata.codecs
else:
raise ValueError(f"Unsupported zarr format: {a.metadata.zarr_format}")
Expand Down
16 changes: 9 additions & 7 deletions src/zarr/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,9 @@
is_scalar,
pop_fields,
)
from zarr.metadata import ArrayMetadata, ArrayV2Metadata, ArrayV3Metadata
from zarr.metadata import ArrayMetadata
from zarr.metadata.v2 import ArrayMetadata
from zarr.metadata.v3 import ArrayMetadata
from zarr.store import StoreLike, StorePath, make_store_path
from zarr.sync import sync

Expand All @@ -69,9 +71,9 @@ def parse_array_metadata(data: Any) -> ArrayMetadata:
return data
elif isinstance(data, dict):
if data["zarr_format"] == 3:
return ArrayV3Metadata.from_dict(data)
return ArrayMetadata.from_dict(data)
elif data["zarr_format"] == 2:
return ArrayV2Metadata.from_dict(data)
return ArrayMetadata.from_dict(data)
raise TypeError


Expand Down Expand Up @@ -231,7 +233,7 @@ async def _create_v3(
else DefaultChunkKeyEncoding(separator=chunk_key_encoding[1])
)

metadata = ArrayV3Metadata(
metadata = ArrayMetadata(
shape=shape,
data_type=dtype,
chunk_grid=RegularChunkGrid(chunk_shape=chunk_shape),
Expand Down Expand Up @@ -274,7 +276,7 @@ async def _create_v2(
if dimension_separator is None:
dimension_separator = "."

metadata = ArrayV2Metadata(
metadata = ArrayMetadata(
shape=shape,
dtype=np.dtype(dtype),
chunks=chunks,
Expand Down Expand Up @@ -349,13 +351,13 @@ async def open(
zarray_dict = json.loads(zarray_bytes.to_bytes())
zattrs_dict = json.loads(zattrs_bytes.to_bytes()) if zattrs_bytes is not None else {}
zarray_dict["attributes"] = zattrs_dict
return cls(store_path=store_path, metadata=ArrayV2Metadata.from_dict(zarray_dict))
return cls(store_path=store_path, metadata=ArrayMetadata.from_dict(zarray_dict))
else:
# V3 arrays are comprised of a zarr.json object
assert zarr_json_bytes is not None
return cls(
store_path=store_path,
metadata=ArrayV3Metadata.from_dict(json.loads(zarr_json_bytes.to_bytes())),
metadata=ArrayMetadata.from_dict(json.loads(zarr_json_bytes.to_bytes())),
)

@property
Expand Down
3 changes: 2 additions & 1 deletion src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
product,
)
from zarr.indexing import BasicIndexer, SelectorTuple, c_order_iter, get_indexer, morton_order_iter
from zarr.metadata import ArrayMetadata, parse_codecs
from zarr.metadata import ArrayMetadata
from zarr.metadata.v3 import parse_codecs

if TYPE_CHECKING:
from collections.abc import Awaitable, Callable, Iterator
Expand Down
19 changes: 19 additions & 0 deletions src/zarr/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from __future__ import annotations
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from typing import Any

import numpy as np

def jsonify_dtype(
o: np.dtype[Any],
) -> str | list[tuple[str, str] | tuple[str, str, tuple[int, ...]]]:
"""
JSON serialization for a numpy dtype
"""
if isinstance(o, np.dtype):
if o.fields is None:
return o.str
else:
return o.descr
raise TypeError
158 changes: 158 additions & 0 deletions src/zarr/metadata/v2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
from __future__ import annotations
from typing import TYPE_CHECKING

from zarr.abc.metadata import Metadata
if TYPE_CHECKING:
from typing_extensions import Self
import numpy.typing as npt
from zarr.abc.codec import Codec

from zarr.abc.codec import CodecPipeline
from zarr.array_spec import ArraySpec
from zarr.buffer import Buffer, BufferPrototype
from zarr.chunk_key_encodings import parse_separator
from zarr.codecs._v2 import V2Compressor, V2Filters
from zarr.common import JSON, ZARRAY_JSON, ZATTRS_JSON, ChunkCoords, parse_dtype, parse_fill_value, parse_shapelike
from zarr.config import parse_indexing_order
from zarr.metadata import ArrayMetadata, jsonify_dtype

import numpy as np
import json
from dataclasses import dataclass, field, replace
from typing import Any, Literal

from zarr.metadata.v3 import parse_attributes


def parse_zarr_format(data: Any) -> Literal[2]:
if data == 2:
return 2
raise ValueError(f"Invalid value. Expected 2. Got {data}.")


def parse_filters(data: Any) -> list[dict[str, JSON]] | None:
return data


def parse_compressor(data: Any) -> dict[str, JSON] | None:
return data


def parse_metadata(data: ArrayMetadata) -> ArrayMetadata:
"""
Perform validation of an entire ArrayMetadata instance, raising exceptions if there
are any problems with the metadata. Returns valid metadata.
"""
if (l_chunks := len(data.chunks)) != (l_shape := len(data.shape)):
msg = (
f"The `shape` and `chunks` attributes must have the same length. "
f"`chunks` has length {l_chunks}, but `shape` has length {l_shape}."
)
raise ValueError(msg)
return data


@dataclass(frozen=True, kw_only=True)
class ArrayMetadata(Metadata):
shape: ChunkCoords
chunks: tuple[int, ...]
data_type: np.dtype[Any]
fill_value: None | int | float = 0
order: Literal["C", "F"] = "C"
filters: list[dict[str, JSON]] | None = None
dimension_separator: Literal[".", "/"] = "."
compressor: dict[str, JSON] | None = None
attributes: dict[str, JSON] = field(default_factory=dict)
zarr_format: Literal[2] = field(init=False, default=2)

def __init__(
self,
*,
shape: ChunkCoords,
dtype: npt.DTypeLike,
chunks: ChunkCoords,
fill_value: Any,
order: Literal["C", "F"],
dimension_separator: Literal[".", "/"] = ".",
compressor: dict[str, JSON] | None = None,
filters: list[dict[str, JSON]] | None = None,
attributes: dict[str, JSON] | None = None,
):
"""
Metadata for a Zarr version 2 array.
"""
shape_parsed = parse_shapelike(shape)
data_type_parsed = parse_dtype(dtype)
chunks_parsed = parse_shapelike(chunks)
compressor_parsed = parse_compressor(compressor)
order_parsed = parse_indexing_order(order)
dimension_separator_parsed = parse_separator(dimension_separator)
filters_parsed = parse_filters(filters)
fill_value_parsed = parse_fill_value(fill_value)
attributes_parsed = parse_attributes(attributes)

object.__setattr__(self, "shape", shape_parsed)
object.__setattr__(self, "dtype", data_type_parsed)
object.__setattr__(self, "chunks", chunks_parsed)
object.__setattr__(self, "compressor", compressor_parsed)
object.__setattr__(self, "order", order_parsed)
object.__setattr__(self, "dimension_separator", dimension_separator_parsed)
object.__setattr__(self, "filters", filters_parsed)
object.__setattr__(self, "fill_value", fill_value_parsed)
object.__setattr__(self, "attributes", attributes_parsed)

# ensure that the metadata document is consistent
_ = parse_metadata(self)

@property
def ndim(self) -> int:
return len(self.shape)

@property
def dtype(self) -> np.dtype[Any]:
return self.data_type

@property
def codec_pipeline(self) -> CodecPipeline:
from zarr.codecs import BatchedCodecPipeline

return BatchedCodecPipeline.from_list(
[V2Filters(self.filters or []), V2Compressor(self.compressor)]
)

def to_buffer_dict(self) -> dict[str, Buffer]:
zarray_dict = self.to_dict()
zattrs_dict = zarray_dict.pop("attributes", {})
return {
ZARRAY_JSON: Buffer.from_bytes(json.dumps(zarray_dict, default=jsonify_dtype).encode()),
ZATTRS_JSON: Buffer.from_bytes(json.dumps(zattrs_dict).encode()),
}

@classmethod
def from_dict(cls, data: dict[str, Any]) -> ArrayMetadata:
# check that the zarr_format attribute is correct
_ = parse_zarr_format(data.pop("zarr_format"))
return cls(**data)

def get_chunk_spec(
self, _chunk_coords: ChunkCoords, order: Literal["C", "F"], prototype: BufferPrototype
) -> ArraySpec:
return ArraySpec(
shape=self.chunk_grid.chunk_shape,
dtype=self.dtype,
fill_value=self.fill_value,
order=order,
prototype=prototype,
)

def encode_chunk_key(self, chunk_coords: ChunkCoords) -> str:
chunk_identifier = self.dimension_separator.join(map(str, chunk_coords))
return "0" if chunk_identifier == "" else chunk_identifier

def update_shape(self, shape: ChunkCoords) -> Self:
return replace(self, shape=shape)

def update_attributes(self, attributes: dict[str, JSON]) -> Self:
return replace(self, attributes=attributes)


Loading

0 comments on commit 40ea349

Please sign in to comment.