In [1]:
import datatree as dt
import xarray


# Example inspired from
# https://xarray-datatree.readthedocs.io/en/latest/hierarchical-data.html#ancestry-in-an-evolutionary-tree
# vertebrates = dt.DataTree.from_dict(
vertebrates = xarray.core.datatree.DataTree.from_dict(
    name="Vertebrae",
    d={
        "/Sharks": None,
        "/Bony Skeleton/Ray-finned Fish": None,
        "/Bony Skeleton/Four Limbs/Amphibians": None,
        "/Bony Skeleton/Four Limbs/Amniotic Egg/Hair/Primates": None,
        "/Bony Skeleton/Four Limbs/Amniotic Egg/Hair/Rodents & Rabbits": xarray.Dataset(
            {f"variable_{k}": None for k in "abc"}
        ),
        "/Bony Skeleton/Four Limbs/Amniotic Egg/Two Fenestrae/Dinosaurs": xarray.Dataset(
            {f"variable_{k}": None for k in "abc"}
        ),
        "/Bony Skeleton/Four Limbs/Amniotic Egg/Two Fenestrae/Birds": xarray.Dataset(
            {f"variable_{k}": None for k in "abc"}
        ),
    },
)

vertebrates["/Bony Skeleton"]

In [23]:
from pathlib import PurePosixPath
from typing import Any, Literal
from collections import defaultdict

# Proto 1: Use a defaultdict, few control, rely on defaultdict, no children order,
# no way to distinguish nodes.


def to_dict(
    tree: dt.DataTree[Any],
    data: bool | Literal["list", "array"] = "list",
    encoding: bool = False,
) -> dict[str, Any]:
    """
    Convert this DataTree to a dictionary following xarray naming
    conventions.

    Converts all variables and attributes to native Python objects
    Useful for converting to json. To avoid datetime incompatibility
    use decode_times=False kwarg in xarrray.open_dataset.

    Parameters
    ----------
    data : bool or {"list", "array"}, default: "list"
        Whether to include the actual data in the dictionary. When set to
        False, returns just the schema. If set to "array", returns data as
        underlying array type. If set to "list" (or True for backwards
        compatibility), returns data in lists of Python data types. Note
        that for obtaining the "list" output efficiently, use
        `ds.compute().to_dict(data="list")`.

    encoding : bool, default: False
        Whether to include the Dataset's encoding in the dictionary.

    Returns
    -------
    d : dict
        Dict with keys: "coords", "attrs", "dims", "data_vars" and optionally
        "encoding".

    See Also
    --------
    Dataset.from_dict
    DataArray.to_dict
    """

    nested_dict_factory = lambda: defaultdict(nested_dict_factory)
    nested_dict_root = nested_dict_factory()

    for node in tree.subtree:
        # print(node.path)
        parts = PurePosixPath(node.path).parts
        print(parts)

        nested_dict_current = nested_dict_root
        for part in parts[:-1]:
            nested_dict_current = nested_dict_current[part]

        # nested_dict_current = node.path
        # JSON may not preserve order of keys.
        nested_dict_current[parts[-1]] = node.to_dataset().to_dict()

    return nested_dict_root


xdt = vertebrates
result = to_dict(xdt)

import json

print(json.dumps(result, indent=4))

('/',)
('/', 'Sharks')
('/', 'Bony Skeleton')
('/', 'Bony Skeleton', 'Ray-finned Fish')
('/', 'Bony Skeleton', 'Four Limbs')
('/', 'Bony Skeleton', 'Four Limbs', 'Amphibians')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Hair')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Two Fenestrae')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Hair', 'Primates')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Hair', 'Rodents & Rabbits')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Two Fenestrae', 'Dinosaurs')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Two Fenestrae', 'Birds')
{
    "/": {
        "coords": {},
        "attrs": {},
        "dims": {},
        "data_vars": {},
        "Sharks": {
            "coords": {},
            "attrs": {},
            "dims": {},
            "data_vars": {}
        },
        "Bony Skeleton": {
            "coords": {},
            "attrs": {},
          

In [55]:
# Test 2: use the fact that .subtree:
# > yields them in depth-first order.
# This means we can, by doing a simple flat iteration (no recursion!)
# still get the the benefits of knowing the order of children
# thanks to the pre-explored tree.


from typing import Mapping, TypeVar

K = TypeVar("K")
V = TypeVar("V")
T = TypeVar("T")


def decode_numpy_dict_values(attrs: Mapping[K, V]) -> dict[K, V]:
    """Convert attribute values from numpy objects to native Python objects,
    for use in to_dict
    """
    attrs = dict(attrs)
    for k, v in attrs.items():
        if isinstance(v, np.ndarray):
            attrs[k] = v.tolist()
        elif isinstance(v, np.generic):
            attrs[k] = v.item()
    return attrs


def ensure_us_time_resolution(val):
    """Convert val out of numpy time, for use in to_dict.
    Needed because of numpy bug GH#7619"""
    if np.issubdtype(val.dtype, np.datetime64):
        val = val.astype("datetime64[us]")
    elif np.issubdtype(val.dtype, np.timedelta64):
        val = val.astype("timedelta64[us]")
    return val


def to_dict(
    tree: dt.DataTree[Any],
    data: bool | Literal["list", "array"] = "list",
    encoding: bool = False,
    detailed: bool = True,
) -> dict[str, Any]:
    """
    Convert this DataTree to a dictionary following xarray naming
    conventions.

    Converts all variables and attributes to native Python objects
    Useful for converting to json. To avoid datetime incompatibility
    use decode_times=False kwarg in xarrray.open_dataset.

    Parameters
    ----------
    data : bool or {"list", "array"}, default: "list"
        Whether to include the actual data in the dictionary. When set to
        False, returns just the schema. If set to "array", returns data as
        underlying array type. If set to "list" (or True for backwards
        compatibility), returns data in lists of Python data types. Note
        that for obtaining the "list" output efficiently, use
        `ds.compute().to_dict(data="list")`.

    encoding : bool, default: False
        Whether to include the Dataset's encoding in the dictionary.

    Returns
    -------
    d : dict
        Dict with keys: "coords", "attrs", "dims", "data_vars" and optionally
        "encoding".

    See Also
    --------
    Dataset.from_dict
    DataArray.to_dict
    """

    # Python dicts are ordered, JSON objects are NOT
    # But this is to_dict, and it can be ordered. So children will be an ordered dict and not a list.

    super_root = {"children": {}}

    for node in tree.subtree:
        # Create the dict-node
        d: dict = {
            # Addition to Dataset (Nodes (Datasets) are unnamed, DataArrays are ; Nodes become named
            # in a tree context, like DataArrays)
            "name": node.name,
            "coords": {},
            "attrs": decode_numpy_dict_values(node.attrs),
            "dims": dict(node.sizes),
            "data_vars": {},  # Is empty if tree is hollow, filled otherwise.
        }
        if detailed:  # Rename to absolute detailed. Aims independency of sub-json-objects if disabled.
            # Remain agnostic of the hierarchy.
            # The following properties are "absolute"
            # as they are full-root-tree-aware.
            d.update(
                {
                    "path": node.path,
                    "is_root": node.is_root,
                    "is_leaf": node.is_leaf,
                    "level": node.level,
                    "depth": node.depth,
                    "width": node.width,
                }
            )
        # Will be filled later (warning: JSON object do not preserve order)
        d["children"] = {}
        for k in node.coords:
            d["coords"].update(
                {k: node[k].variable.to_dict(data=data, encoding=encoding)}
            )
        for k in node.data_vars:
            d["data_vars"].update(
                {k: node[k].variable.to_dict(data=data, encoding=encoding)}
            )
        if encoding:
            d["encoding"] = dict(node.encoding)

        # Add the node to the dict-tree
        parts = PurePosixPath(node.path).parts

        current = super_root
        for part in parts[:-1]:
            current = current["children"].get(part, {})
        current["children"][parts[-1]] = d

        # Warning.
        # JSON may not preserve order of keys.

    root = super_root["children"]["/"]
    return root


xdt = vertebrates
result = to_dict(xdt)

import json

print(json.dumps(result, indent=4))

('/',)
('/', 'Sharks')
('/', 'Bony Skeleton')
('/', 'Bony Skeleton', 'Ray-finned Fish')
('/', 'Bony Skeleton', 'Four Limbs')
('/', 'Bony Skeleton', 'Four Limbs', 'Amphibians')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Hair')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Two Fenestrae')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Hair', 'Primates')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Hair', 'Rodents & Rabbits')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Two Fenestrae', 'Dinosaurs')
('/', 'Bony Skeleton', 'Four Limbs', 'Amniotic Egg', 'Two Fenestrae', 'Birds')
{
    "name": "Vertebrae",
    "coords": {},
    "attrs": {},
    "dims": {},
    "data_vars": {},
    "path": "/",
    "is_root": true,
    "is_leaf": false,
    "level": 0,
    "depth": 5,
    "width": 1,
    "children": {
        "Sharks": {
            "name": "Sharks",
            "coords": {},
            "attrs": {

In [61]:
# Test 2.2: same as Test 2. but delegate as much as possible to Dataset.to_dict


def to_dict(
    tree: dt.DataTree[Any],
    data: bool | Literal["list", "array"] = "list",
    encoding: bool = False,
    with_absolute_details: bool = False,
) -> dict[str, Any]:
    """
    Convert this DataTree to a dictionary following xarray naming
    conventions.

    Converts all variables and attributes to native Python objects
    Useful for converting to json. To avoid datetime incompatibility
    use decode_times=False kwarg in xarrray.open_dataset.

    Remarks:

    # Python dicts are ordered, JSON objects are NOT
    # But this is to_dict, and it can be ordered. So children will be an ordered dict and not a list.
    # Warning.
    # JSON may not preserve order of keys.

    # Note: data_vars will be empty if the tree is hollow, ie. only leaves carry data)
    # Addition to Dataset (Nodes (Datasets) are unnamed, DataArrays are ; Nodes become named
    # in a tree context, like DataArrays). Note: children is to DataTree as data_vars is to Dataset

    # Rename to absolute detailed. Aims independency of sub-json-objects if disabled.
    # Remain agnostic of the hierarchy.
    # The following properties are "absolute"
    # as they are full-root-tree-aware.
    
    Parameters
    ----------
    data : bool or {"list", "array"}, default: "list"
        Whether to include the actual data in the dictionary. When set to
        False, returns just the schema. If set to "array", returns data as
        underlying array type. If set to "list" (or True for backwards
        compatibility), returns data in lists of Python data types. Note
        that for obtaining the "list" output efficiently, use
        `ds.compute().to_dict(data="list")`.

    encoding : bool, default: False
        Whether to include the Dataset's encoding in the dictionary.

    with_absolute_details : bool, default: False
        Whether to include additional absolute details (like the absolue node path, or level)
        in the dictionary.

    Returns
    -------
    d : dict
        Dict with keys: "coords", "attrs", "dims", "data_vars" and optionally
        "encoding".

    See Also
    --------
    DataTree.from_dict
    Dataset.from_dict
    Dataset.to_dict
    """

    super_root_dict = {"children": {}}

    for node in tree.subtree:
        # Initial dict creation is delegated to Dataset
        node_dict = node.ds.to_dict(data=data, encoding=encoding)

        node_dict["name"] = node.name

        if with_absolute_details:
            node_dict.update(
                {
                    "path": node.path,
                    "is_root": node.is_root,
                    "is_leaf": node.is_leaf,
                    "level": node.level,
                    "depth": node.depth,
                    "width": node.width,
                }
            )

        node_dict["children"] = {}

        # Add the node to the dict-tree
        parts = PurePosixPath(node.path).parts
        current_dict = super_root_dict
        for part in parts[:-1]:
            current_dict = current_dict["children"].get(part, {})
        current_dict["children"][parts[-1]] = node_dict

    root_dict = super_root_dict["children"]["/"]
    return root_dict


xdt = vertebrates
result = to_dict(xdt)

import json

print(json.dumps(result, indent=4))

{
    "coords": {},
    "attrs": {},
    "dims": {},
    "data_vars": {},
    "name": "Vertebrae",
    "children": {
        "Sharks": {
            "coords": {},
            "attrs": {},
            "dims": {},
            "data_vars": {},
            "name": "Sharks",
            "children": {}
        },
        "Bony Skeleton": {
            "coords": {},
            "attrs": {},
            "dims": {},
            "data_vars": {},
            "name": "Bony Skeleton",
            "children": {
                "Ray-finned Fish": {
                    "coords": {},
                    "attrs": {},
                    "dims": {},
                    "data_vars": {},
                    "name": "Ray-finned Fish",
                    "children": {}
                },
                "Four Limbs": {
                    "coords": {},
                    "attrs": {},
                    "dims": {},
                    "data_vars": {},
                    "name": "Four Limbs",
             

In [2]:
xarray.show_versions()


INSTALLED VERSIONS
------------------
commit: None
python: 3.10.12 (main, Aug 15 2023, 11:50:32) [GCC 9.4.0]
python-bits: 64
OS: Linux
OS-release: 5.15.0-105-generic
machine: x86_64
processor: x86_64
byteorder: little
LC_ALL: None
LANG: en_US.UTF-8
LOCALE: ('en_US', 'UTF-8')
libhdf5: None
libnetcdf: None

xarray: 2024.5.0
pandas: 2.2.2
numpy: 1.26.4
scipy: 1.13.0
netCDF4: None
pydap: None
h5netcdf: None
h5py: None
zarr: 2.18.1
cftime: None
nc_time_axis: None
iris: None
bottleneck: None
dask: 2023.12.1
distributed: None
matplotlib: None
cartopy: None
seaborn: None
numbagg: None
fsspec: 2024.5.0
cupy: None
pint: None
sparse: None
flox: None
numpy_groupies: None
setuptools: 67.8.0
pip: 23.1.2
conda: None
pytest: None
mypy: None
IPython: 8.24.0
sphinx: None


