Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:

- uses: prefix-dev/setup-pixi@v0
with:
pixi-version: v0.63.2
pixi-version: v0.65.0
cache: true
environments: docs

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
- uses: actions/checkout@v6
- uses: prefix-dev/setup-pixi@v0
with:
pixi-version: v0.63.2
pixi-version: v0.65.0
cache: true
environments: lint
- name: Run linters
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:

- uses: prefix-dev/setup-pixi@v0
with:
pixi-version: v0.63.2
pixi-version: v0.65.0
environments: ${{ matrix.environment }}
cache: ${{ matrix.environment != 'upstream' }}
locked: ${{ matrix.environment != 'upstream' }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ jobs:

- uses: prefix-dev/setup-pixi@v0
with:
pixi-version: v0.63.2
pixi-version: v0.65.0
cache: true
environments: dist

Expand Down
1 change: 1 addition & 0 deletions doc/requirements.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dependencies:
- msgpack-python *
- pyyaml *
- netcdf4 *
- libnetcdf !=4.10.0
- scipy *
- h5netcdf *
- zarr *
Expand Down
4 changes: 2 additions & 2 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ v2.1.0 (unreleased)
of text messages for all differences in numpy, pandas, and xarray objects
- New function :func:`display_diffs` that displays differences
in Jupyter notebooks
- Fixed issue that would cause excessive RAM usage when comparing Dask arrays with
2+ dimensions using a distributed scheduler
- Fixed issues that would cause slowdowns and excessive RAM usage when comparing Dask
arrays with 2+ dimensions using a distributed scheduler
- Added support for P2P rechunk in Dask distributed


Expand Down
5,681 changes: 4,455 additions & 1,226 deletions pixi.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ distributed = "*"
msgpack-python = "*"
PyYAML = "*"
netcdf4 = "*" # netCDF engine
libnetcdf = "!=4.10.0" # Breaks coverage
scipy = "*" # netCDF engine
h5netcdf = "*" # netCDF engine
zarr = "*"
Expand Down Expand Up @@ -227,6 +228,7 @@ ipython = { cmd = "ipython", description = "Launch IPython" }

[tool.pixi.feature.jupyter.dependencies]
jupyterlab = "*"
python-graphviz = "*"

[tool.pixi.feature.jupyter.tasks]
jupyter = { cmd = "jupyter lab", description = "Launch JupyterLab" }
Expand Down
12 changes: 3 additions & 9 deletions recursive_diff/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -430,15 +430,9 @@ def _diff_dataarrays(
# Generate a bit-mask of the differences
# For Dask-backed arrays, this operation is delayed.
if lhs.dtype.kind in "iufc" and rhs.dtype.kind in "iufc":
# Both arrays are numeric
if is_dask:
import dask.array as da

isclose = da.isclose
else:
isclose = np.isclose

mask = ~isclose(lhs.data, rhs.data, rtol=rel_tol, atol=abs_tol, equal_nan=True)
mask = ~np.isclose(
lhs.data, rhs.data, rtol=rel_tol, atol=abs_tol, equal_nan=True
)
elif lhs.dtype.kind == "M" and rhs.dtype.kind == "M":
# Both arrays are datetime64
# Unlike with np.isclose(equal_nan=True), there is no
Expand Down
4 changes: 2 additions & 2 deletions recursive_diff/tests/test_diff_arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def test_diff_arrays(chunk):
"abs_delta": [1.0],
"rel_delta": [0.25],
"x": ["b"],
"y": np.asarray([1]),
"y": np.asarray([1], dtype=np.int_),
}
).set_index(["x", "y"]),
)
Expand All @@ -61,7 +61,7 @@ def test_diff_arrays(chunk):
arrays["[3][data]"],
pd.DataFrame(
{"lhs": ["bar"], "rhs": ["baz"]},
index=pd.Index(np.asarray([1]), name="dim_0"),
index=pd.Index(np.asarray([1], dtype=np.int_), name="dim_0"),
),
)

Expand Down
72 changes: 67 additions & 5 deletions recursive_diff/tests/test_recursive_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,10 @@ def __repr__(self):
return f"Square({self.side})"


def check(lhs, rhs, *expect, **kwargs):
expect = sorted(expect)
actual = sorted(recursive_diff(lhs, rhs, **kwargs))
def check(lhs, rhs, *expect, order=False, **kwargs):
    """Assert that ``recursive_diff(lhs, rhs, **kwargs)`` yields exactly the
    differences listed in *expect*.

    :param order:
        When True, differences must appear in the exact order given;
        when False (default), both sides are compared after sorting.
    """
    normalize = list if order else sorted
    got = normalize(recursive_diff(lhs, rhs, **kwargs))
    assert got == normalize(expect)


Expand Down Expand Up @@ -1165,6 +1166,67 @@ def test_dask_dataarray(chunk_lhs, chunk_rhs):
check(lhs, rhs, "[data][x=2]: c != d")


@requires_dask
@pytest.mark.parametrize(
    "chunk_lhs,chunk_rhs",
    [
        (None, None),
        (None, -1),
        (None, 2),
        ({"x": 3, "y": 1}, {"x": 2, "y": 2}),
    ],
)
def test_dask_dataarray_2d(chunk_lhs, chunk_rhs):
    """Diff two 2D DataArrays under various (mis)matched chunking schemes.

    Exactly one element differs; it must be reported with both of its
    coordinates regardless of how either side is chunked.
    """
    lhs = xarray.DataArray([[0, 1, 2], [3, 4, 5]], dims=["x", "y"])
    rhs = xarray.DataArray([[0, 1, 2], [3, 4, 6]], dims=["x", "y"])
    if chunk_lhs:
        lhs = lhs.chunk(chunk_lhs)
    if chunk_rhs:
        rhs = rhs.chunk(chunk_rhs)

    check(lhs, rhs, "[data][x=1, y=2]: 5 != 6 (abs: 1.0e+00, rel: 2.0e-01)")


def test_dask_dataarray_ordered(chunk):
    """Test that difference order goes in C order and is not influenced
    by Dask chunks.

    ``chunk`` is a boolean fixture — presumably defined elsewhere in the
    test suite; verify against conftest.
    """
    lhs = xarray.DataArray(np.arange(2 * 3 * 4).reshape(2, 3, 4), dims=["x", "y", "z"])
    rhs = lhs + 1
    if chunk:
        lhs = lhs.chunk({"x": 2, "y": 2, "z": 3})
        rhs = rhs.chunk({"x": 2, "y": 2, "z": 3})
    check(
        lhs,
        rhs,
        "[data][x=0, y=0, z=0]: 0 != 1 (abs: 1.0e+00, rel: nan)",
        "[data][x=0, y=0, z=1]: 1 != 2 (abs: 1.0e+00, rel: 1.0e+00)",
        "[data][x=0, y=0, z=2]: 2 != 3 (abs: 1.0e+00, rel: 5.0e-01)",
        "[data][x=0, y=0, z=3]: 3 != 4 (abs: 1.0e+00, rel: 3.3e-01)",
        "[data][x=0, y=1, z=0]: 4 != 5 (abs: 1.0e+00, rel: 2.5e-01)",
        "[data][x=0, y=1, z=1]: 5 != 6 (abs: 1.0e+00, rel: 2.0e-01)",
        "[data][x=0, y=1, z=2]: 6 != 7 (abs: 1.0e+00, rel: 1.7e-01)",
        "[data][x=0, y=1, z=3]: 7 != 8 (abs: 1.0e+00, rel: 1.4e-01)",
        "[data][x=0, y=2, z=0]: 8 != 9 (abs: 1.0e+00, rel: 1.2e-01)",
        "[data][x=0, y=2, z=1]: 9 != 10 (abs: 1.0e+00, rel: 1.1e-01)",
        "[data][x=0, y=2, z=2]: 10 != 11 (abs: 1.0e+00, rel: 1.0e-01)",
        "[data][x=0, y=2, z=3]: 11 != 12 (abs: 1.0e+00, rel: 9.1e-02)",
        "[data][x=1, y=0, z=0]: 12 != 13 (abs: 1.0e+00, rel: 8.3e-02)",
        "[data][x=1, y=0, z=1]: 13 != 14 (abs: 1.0e+00, rel: 7.7e-02)",
        "[data][x=1, y=0, z=2]: 14 != 15 (abs: 1.0e+00, rel: 7.1e-02)",
        "[data][x=1, y=0, z=3]: 15 != 16 (abs: 1.0e+00, rel: 6.7e-02)",
        "[data][x=1, y=1, z=0]: 16 != 17 (abs: 1.0e+00, rel: 6.2e-02)",
        "[data][x=1, y=1, z=1]: 17 != 18 (abs: 1.0e+00, rel: 5.9e-02)",
        "[data][x=1, y=1, z=2]: 18 != 19 (abs: 1.0e+00, rel: 5.6e-02)",
        "[data][x=1, y=1, z=3]: 19 != 20 (abs: 1.0e+00, rel: 5.3e-02)",
        "[data][x=1, y=2, z=0]: 20 != 21 (abs: 1.0e+00, rel: 5.0e-02)",
        "[data][x=1, y=2, z=1]: 21 != 22 (abs: 1.0e+00, rel: 4.8e-02)",
        "[data][x=1, y=2, z=2]: 22 != 23 (abs: 1.0e+00, rel: 4.5e-02)",
        "[data][x=1, y=2, z=3]: 23 != 24 (abs: 1.0e+00, rel: 4.3e-02)",
        # order=True is the point of this test: with order=False, check()
        # sorts both sides before comparing, so an ordering regression could
        # never be detected and the docstring's claim would go unverified.
        order=True,
    )


@requires_dask
def test_dask_dataarray_discards_data():
"""Test that chunked Dask datasets are loaded into memory and then
Expand Down Expand Up @@ -1305,9 +1367,9 @@ def test_lazy_datasets_without_dask(tmp_path):
[
# Different OSes and dependency versions have different peak RAM usages.
# These are the worst case scenarios across all combinations.
("netcdf", None, 100), # Takes more on MacOS
("netcdf", None, 100), # Uses more RAM on MacOS
("netcdf", {}, 50),
("zarr", None, 90),
("zarr", None, 100),
("zarr", {}, 25), # ~5 MiB on Linux, up to 25 MiB on Windows
],
)
Expand Down