dClimate · abidsikder · Mar 25, 2025 · Mar 18, 2025 · Mar 20, 2025 · Mar 20, 2025
diff --git a/README.md b/README.md
@@ -7,27 +7,26 @@
 [![codecov](https://codecov.io/gh/dClimate/py-hamt/graph/badge.svg?token=M6Y4D19Y38)](https://codecov.io/gh/dClimate/py-hamt)
 
 # py-hamt
-This is a python implementation of a HAMT, adapted from [rvagg's IAMap project written in JavaScript](https://github.com/rvagg/iamap).
-Like IAMap, py-hamt abstracts over a backing storage layer which lets you store any arbitrary amount of data but returns its own ID, e.g. content-addressed systems.
+This is a python implementation of a HAMT, inspired by [rvagg's IAMap project written in JavaScript](https://github.com/rvagg/iamap).
+Like IAMap, py-hamt abstracts over a content-addressed storage system, something that can keep arbitrary values but will return its own key, like IPFS.
 
-Key differences from IAMap is that the py-hamt data structure is mutable and not asynchronous. But the key idea of abstracting over a value store is the same.
+dClimate primarily created this for storing [zarr](https://zarr.dev/) on IPFS. To see this in action, see our [data ETLs](https://github.com/dClimate/etl-scripts).
 
-dClimate created this library to use IPFS to store [zarr](https://zarr.dev/) files. To see this in action, see our [data ETLs](https://github.com/dClimate/etl-scripts).
-
-# Usage
+# Installation and Usage
 To install, since we do not publish this package to PyPI, add this library to your project directly from git.
 ```sh
 pip install 'git+https://github.com/dClimate/py-hamt'
 ```
 
-Below are some examples, but for more information see the [API documentation](https://dclimate.github.io/py-hamt/py_hamt.html). Looking at the test files, namely `test_hamt.py` is also quite helpful. You can also see this library used in notebooks for data analysis here [dClimate Jupyter Notebooks](https://github.com/dClimate/jupyter-notebooks)
+Below are some examples, but for more information see the [API documentation](https://dclimate.github.io/py-hamt/py_hamt.html). Each major item has example code. You can also see this library used in [Jupyter notebooks for data analysis](https://github.com/dClimate/jupyter-notebooks).
 
-## Basic Writing/Reading from an in memory store
+## Basic Writing/Reading
+A HAMT allows for generic key-value storage.
 ```python
-from py_hamt import HAMT, DictStore
+from py_hamt import HAMT, IPFSStore
 
-# Setup a HAMT with an in memory store
-hamt = HAMT(store=DictStore())
+# Setup a HAMT and connect it to your local ipfs node
+hamt = HAMT(store=IPFSStore())
 
 # Set and get one value
 hamt["foo"] = "bar"
@@ -48,14 +47,17 @@ print (list(hamt)) # [foo, foo2], order depends on the hash function used
 # Delete a value
 del hamt["foo"]
 assert len(hamt) == 1
+
+# Print CID of the HAMT
+print(hamt.root_node_id)
 ```
 
 ## Reading a CID from IPFS
 ```python
 from py_hamt import HAMT, IPFSStore
 from multiformats import CID
 
-# Get the CID you wish to read whether from a blog post, a smart contract, or a friend
+# A CID for data you wish to read, from a blog post, a smart contract, or a friend
 dataset_cid = "baf..."
 
 # Use the multiformats library to decode the CID into an object
@@ -70,32 +72,9 @@ hamt = HAMT(store=IPFSStore(), root_node_id=root_cid) # You can optionally pass
 ...
 ```
 
-## Partially encrypted zarrs
-```python
-from py_hamt import HAMT, IPFSStore, create_zarr_encryption_transformers
-
-ds = ... # example ds with precip and temp data variables
-encryption_key = bytes(32) # change before using, only for demonstration purposes!
-header = "sample-header".encode()
-encrypt, decrypt = create_zarr_encryption_transformers(
-    encryption_key, header, exclude_vars=["temp"]
-)
-hamt = HAMT(
-    store=IPFSStore(), transformer_encode=encrypt, transformer_decode=decrypt
-)
-ds.to_zarr(store=hamt, mode="w")
-
-print("Attempting to read and print metadata of partially encrypted zarr")
-enc_ds = xr.open_zarr(
-    store=HAMT(store=IPFSStore(), root_node_id=hamt.root_node_id, read_only=True)
-)
-print(enc_ds)
-assert enc_ds.temp.sum() == ds.temp.sum()
-try:
-    enc_ds.precip.sum()
-except:
-    print("Couldn't read encrypted variable")
-```
+For an example of how to read and write Zarr v3, check the API documentation and look at the `IPFSZarr3` class.
+
+For how to create partially encrypted zarrs, check the API documentation's `create_zarr_encryption_transformers` section.
 
 # Development Guide
 ## Setting Up
@@ -141,4 +120,4 @@ uv run pdoc py_hamt
 ```
 
 ## Managing dependencies
-Use `uv add` and `uv remove`, e.g. `uv add numpy` or `uv add --dev pytest`. For more information please see the [uv documentation](https://docs.astral.sh/uv/guides/projects/).
+Use `uv add` and `uv remove`, e.g. `uv add numpy` or `uv add pytest --group dev`. For more information please see the [uv documentation](https://docs.astral.sh/uv/guides/projects/).
diff --git a/py_hamt/__init__.py b/py_hamt/__init__.py
@@ -1,6 +1,7 @@
 from .hamt import HAMT, blake3_hashfn
 from .store import Store, DictStore, IPFSStore
 from .zarr_encryption_transformers import create_zarr_encryption_transformers
+from .ipfszarr3 import IPFSZarr3
 
 __all__ = [
     "HAMT",
@@ -9,4 +10,5 @@
     "DictStore",
     "IPFSStore",
     "create_zarr_encryption_transformers",
+    "IPFSZarr3",
 ]
diff --git a/py_hamt/ipfszarr3.py b/py_hamt/ipfszarr3.py
@@ -0,0 +1,142 @@
+from collections.abc import AsyncIterator, Iterable
+import zarr.abc.store
+import zarr.core.buffer
+from zarr.core.common import BytesLike
+
+from py_hamt.hamt import HAMT
+
+
+class IPFSZarr3(zarr.abc.store.Store):
+    """
+    While Zarr v2 can use a generic key-value map (MutableMapping) that HAMT already conforms to, Zarr v3 requires storage classes to conform to a new abstract class. IPFSZarr3 just wraps over a HAMT to provide this compatibility.
+
+    An example of how to write and read a zarr, using xarray, is provided below.
+    # Write and get CID
+    ```python
+    import xarray as xr
+    from py_hamt import IPFSStore, HAMT, IPFSZarr3
+
+    ds = ... # some xarray Dataset
+    ipfszarr3 = IPFSZarr3(HAMT(store=IPFSStore()))
+    ds.to_zarr(store=ipfszarr3)
+    print(ipfszarr3.hamt.root_node_id) # The CID of the root, which is used for reading
+    ```
+
+    # Read from CID
+    ```python
+    import xarray as xr
+    from multiformats import CID
+    from py_hamt import IPFSStore, HAMT, IPFSZarr3
+
+    cid = CID.decode("...") # the CID for the HAMT root
+    ipfszarr3 = IPFSZarr3(HAMT(store=IPFSStore(), root_node_id=cid), read_only=True)
+    ds = xr.open_zarr(store=ipfszarr3)
+    print(ds)
+    ```
+    """
+
+    hamt: HAMT
+    """The internal HAMT. It is safe to read the root CID from this once all operations are done."""
+
+    def __init__(self, hamt: HAMT, read_only: bool = False) -> None:
+        super().__init__(read_only=read_only)
+        self.hamt = hamt
+        if read_only:
+            self.hamt.make_read_only()
+        else:
+            self.hamt.enable_write()
+
+    @property
+    def read_only(self) -> bool:
+        return self.hamt.read_only
+
+    def __eq__(self, val: object) -> bool:
+        if not isinstance(val, IPFSZarr3):
+            return False
+        return self.hamt.root_node_id == val.hamt.root_node_id
+
+    async def get(
+        self,
+        key: str,
+        prototype: zarr.core.buffer.BufferPrototype,
+        byte_range: zarr.abc.store.ByteRequest | None = None,
+    ) -> zarr.core.buffer.Buffer | None:
+        if key not in self.hamt:
+            return
+        # We know this value will always be bytes since we only store bytes in the HAMT
+        val: bytes = self.hamt[key]  # type: ignore
+        return prototype.buffer.from_bytes(val)
+
+        # Hypothetical code for supporting partial reads, but there is not much point since IPFS itself doesn't support partial writes and reads
+        # Untested! If for some reason this is being uncommented and then used in the future, this needs to be tested
+        # subset: bytes
+        # match byte_range:
+        #     case None:
+        #         subset = val
+        #     case zarr.abc.store.RangeByteRequest:
+        #         subset = val[byte_range.start : byte_range.end]
+        #     case zarr.abc.store.OffsetByteRequest:
+        #         subset = val[byte_range.offset :]
+        #     case zarr.abc.store.SuffixByteRequest:
+        #         subset = val[-byte_range.suffix :]
+
+    async def get_partial_values(
+        self,
+        prototype: zarr.core.buffer.BufferPrototype,
+        key_ranges: Iterable[tuple[str, zarr.abc.store.ByteRequest | None]],
+    ) -> list[zarr.core.buffer.Buffer | None]:
+        raise NotImplementedError
+
+    async def exists(self, key: str) -> bool:
+        return key in self.hamt
+
+    @property
+    def supports_writes(self) -> bool:
+        return not self.hamt.read_only
+
+    @property
+    def supports_partial_writes(self) -> bool:
+        return False
+
+    async def set(self, key: str, value: zarr.core.buffer.Buffer) -> None:
+        self.hamt[key] = value.to_bytes()
+
+    async def set_if_not_exists(self, key: str, value: zarr.core.buffer.Buffer) -> None:
+        if key not in self.hamt:
+            await self.set(key, value)
+
+    async def set_partial_values(
+        self, key_start_values: Iterable[tuple[str, int, BytesLike]]
+    ) -> None:
+        raise NotImplementedError
+
+    @property
+    def supports_deletes(self) -> bool:
+        return not self.hamt.read_only
+
+    async def delete(self, key: str) -> None:
+        del self.hamt[key]
+
+    @property
+    def supports_listing(self) -> bool:
+        return True
+
+    async def list(self) -> AsyncIterator[str]:
+        for key in self.hamt:
+            yield key
+
+    async def list_prefix(self, prefix: str) -> AsyncIterator:
+        for key in self.hamt:
+            if key.startswith(prefix):
+                yield key
+
+    async def list_dir(self, prefix: str) -> AsyncIterator:
+        for key in self.hamt:
+            if key.startswith(prefix):
+                suffix = key[len(prefix) :]
+                first_slash = suffix.find("/")
+                if first_slash == -1:
+                    yield suffix
+                else:
+                    name = suffix[0:first_slash]
+                    yield name
diff --git a/py_hamt/zarr_encryption_transformers.py b/py_hamt/zarr_encryption_transformers.py
@@ -4,18 +4,6 @@
 from Crypto.Cipher import ChaCha20_Poly1305
 from Crypto.Random import get_random_bytes
 
-# Metadata files used in zarr v2
-_metadata_files = [
-    # top level metadata
-    ".zattrs",  # Also found within folders for variables
-    ".zgroup",
-    ".zmetadata",
-    # Found within folders for variables
-    ".zarray",
-    # important for coordinate variables, so that we can read bounds
-    "0",
-]
-
 type TransformerFN = Callable[[str, bytes], bytes]
 
 
@@ -31,18 +19,52 @@ def create_zarr_encryption_transformers(
 
     Note that the encryption key must always be 32 bytes long. A header is required by the underlying encryption algorithm. Every time a zarr chunk is encrypted, a random 24-byte nonce is generated. This is saved with the chunk for use when reading back.
 
-    Metadata within a zarr, such as ".zattrs" or ".zgroup" are always ignored, to allow for calculating an encrypted zarr's structure without necessarily having the encryption key. You may also set some variables to be entirely unencrypted with the exclude_vars argument. This allows for partially encrypted zarrs which can be loaded into xarray but the values of encrypted variables cannot be accessed (errors will be thrown).
+    zarr.json metadata files in a zarr v3 are always ignored, to allow for calculating an encrypted zarr's structure without having the encryption key.
+
+    With `exclude_vars` you may also set some variables to be unencrypted. This allows for partially encrypted zarrs which can be loaded into xarray but the values of encrypted variables cannot be accessed (errors will be thrown). You should generally include your coordinate variables along with your data variables in here.
+
+    # Example code
+    ```python
+    from py_hamt import HAMT, IPFSStore, IPFSZarr3, create_zarr_encryption_transformers
+
+    ds = ... # example xarray Dataset with precip and temp data variables
+    encryption_key = bytes(32) # change before using, only for demonstration purposes!
+    header = "sample-header".encode()
+    encrypt, decrypt = create_zarr_encryption_transformers(
+        encryption_key, header, exclude_vars=["temp"]
+    )
+    hamt = HAMT(
+        store=IPFSStore(), transformer_encode=encrypt, transformer_decode=decrypt
+    )
+    ipfszarr3 = IPFSZarr3(hamt)
+    ds.to_zarr(store=ipfszarr3, mode="w")
+
+    print("Attempting to read and print metadata of partially encrypted zarr")
+    enc_ds = xr.open_zarr(store=ipfszarr3, read_only=True)
+    print(enc_ds)
+    assert enc_ds.temp.sum() == ds.temp.sum()
+    try:
+        enc_ds.precip.sum()
+    except:
+        print("Couldn't read encrypted variable")
+    ```
     """
 
     if len(encryption_key) != 32:
         raise ValueError("Encryption key is not 32 bytes")
 
     def _should_transform(key: str) -> bool:
         p = Path(key)
-        if p.parent.name in exclude_vars:
+
+        # Find the first directory name in the path since zarr v3 chunks are stored in a nested directory structure
+        # e.g. for Path("precip/c/0/0/1") it would return "precip"
+        if p.parts[0] in exclude_vars:
             return False
-        if p.name in _metadata_files:
+
+        # Don't transform metadata files
+        if p.name == "zarr.json":
             return False
+
         return True
 
     def encrypt(key: str, val: bytes) -> bytes:

diff --git a/pyproject.toml b/pyproject.toml
@@ -10,6 +10,7 @@ dependencies = [
     "multiformats[full]>=0.3.1.post4",
     "pycryptodome>=3.21.0",
     "requests>=2.32.3",
+    "zarr",
 ]
 
 [build-system]
@@ -27,6 +28,6 @@ dev = [
     "snakeviz>=2.2.0",
     "pandas>=2.2.3",
     "numpy>=2.1.3",
-    "xarray==2024.11.0",
-    "zarr==2.18.3",
+    "xarray>=2025.1.2",
+    "pytest-asyncio>=0.25.3",
 ]