From 1b8d4e5065124cf72f1b96743c2632151c13546e Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 30 Oct 2024 12:35:57 -0400 Subject: [PATCH 01/17] =?UTF-8?q?feat:=20=E2=9C=A8=20Allow=20writing=20to?= =?UTF-8?q?=20Zarr=20via=20xarray=5Ftensorstore.=5FTensorStoreAdapter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds a __setitem__ method to xarray_tensorstore._TensorStoreAdapter, allowing users to write data to a Zarr opened via this adapter. Should close Issue #5 --- xarray_tensorstore.py | 15 +- xarray_tensorstore_test.py | 429 ++++++++++++++++++++----------------- 2 files changed, 247 insertions(+), 197 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index d1d0803..6980e1e 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -98,7 +98,20 @@ def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: # like NumPy, not absolute like TensorStore translated = indexed[tensorstore.d[:].translate_to[0]] return type(self)(translated) - + + def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + # TODO(shoyer): fix this for newer versions of Xarray. + # We get the error message: + # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' + self.array.oindex[index_tuple] = value + elif isinstance(key, indexing.VectorizedIndexer): + self.array.vindex[index_tuple] = value + else: + assert isinstance(key, indexing.BasicIndexer) + self.array[index_tuple] = value + # xarray>2024.02.0 uses oindex and vindex properties, which are expected to # return objects whose __getitem__ method supports the appropriate form of # indexing. diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index d6f33dc..04b0020 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -18,209 +18,246 @@ import tensorstore import xarray import xarray_tensorstore +from xarray.core import indexing class XarrayTensorstoreTest(parameterized.TestCase): - @parameterized.named_parameters( - # TODO(shoyer): consider using hypothesis to convert these into - # property-based tests - { - 'testcase_name': 'base', - 'transform': lambda ds: ds, - }, - { - 'testcase_name': 'transposed', - 'transform': lambda ds: ds.transpose('z', 'x', 'y'), - }, - { - 'testcase_name': 'basic_int', - 'transform': lambda ds: ds.isel(y=1), - }, - { - 'testcase_name': 'negative_int', - 'transform': lambda ds: ds.isel(y=-1), - }, - { - 'testcase_name': 'basic_slice', - 'transform': lambda ds: ds.isel(z=slice(2)), - }, - { - 'testcase_name': 'full_slice', - 'transform': lambda ds: ds.isel(z=slice(0, 4)), - }, - { - 'testcase_name': 'out_of_bounds_slice', - 'transform': lambda ds: ds.isel(z=slice(0, 10)), - }, - { - 'testcase_name': 'strided_slice', - 'transform': lambda ds: ds.isel(z=slice(0, None, 2)), - }, - { - 'testcase_name': 'negative_stride_slice', - 'transform': lambda ds: ds.isel(z=slice(None, None, -1)), - }, - { - 'testcase_name': 'repeated_indexing', - 'transform': lambda ds: ds.isel(z=slice(1, None)).isel(z=0), - }, - { - 'testcase_name': 'oindex', - # includes repeated, negative and out of order indices - 'transform': lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), - }, - { - 'testcase_name': 'vindex', - 'transform': lambda ds: ds.isel(x=('w', [0, 1]), y=('w', [1, 2])), - }, - { - 'testcase_name': 'mixed_indexing_types', - 'transform': lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), - }, - { - 'testcase_name': 'select_a_variable', - 'transform': lambda ds: ds['foo'], - }, - ) - def test_open_zarr(self, transform): - source = xarray.Dataset( - { - 'foo': (('x',), np.arange(2), {'local': 'local metadata'}), - 'bar': (('x', 'y'), np.arange(6).reshape(2, 3)), - 'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4)), - }, - coords={ - 'x': [1, 2], - 'y': pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03']), - 'z': ['a', 'b', 'c', 'd'], - }, - attrs={'global': 'global metadata'}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) - - expected = transform(source) - actual = transform(xarray_tensorstore.open_zarr(path)).compute() - xarray.testing.assert_identical(actual, expected) - - @parameterized.parameters( - {'deep': True}, - {'deep': False}, - ) - def test_copy(self, deep): - source = xarray.Dataset({'foo': (('x',), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - copied = opened.copy(deep=deep) - xarray.testing.assert_identical(copied, source) - - def test_sortby(self): - # regression test for https://github.com/google/xarray-tensorstore/issues/1 - x = np.arange(10) - source = xarray.Dataset({'foo': (('x',), x)}, {'x': x[::-1]}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - opened.sortby('x') # should not crash - - def test_compute(self): - # verify that get_duck_array() is working properly - source = xarray.Dataset({'foo': (('x',), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - computed = opened.compute() - computed_data = computed['foo'].variable._data - self.assertNotIsInstance(computed_data, tensorstore.TensorStore) - - def test_open_zarr_from_uri(self): - source = xarray.Dataset( - {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))} + @parameterized.named_parameters( + # TODO(shoyer): consider using hypothesis to convert these into + # property-based tests + { + "testcase_name": "base", + "transform": lambda ds: ds, + }, + { + "testcase_name": "transposed", + "transform": lambda ds: ds.transpose("z", "x", "y"), + }, + { + "testcase_name": "basic_int", + "transform": lambda ds: ds.isel(y=1), + }, + { + "testcase_name": "negative_int", + "transform": lambda ds: ds.isel(y=-1), + }, + { + "testcase_name": "basic_slice", + "transform": lambda ds: ds.isel(z=slice(2)), + }, + { + "testcase_name": "full_slice", + "transform": lambda ds: ds.isel(z=slice(0, 4)), + }, + { + "testcase_name": "out_of_bounds_slice", + "transform": lambda ds: ds.isel(z=slice(0, 10)), + }, + { + "testcase_name": "strided_slice", + "transform": lambda ds: ds.isel(z=slice(0, None, 2)), + }, + { + "testcase_name": "negative_stride_slice", + "transform": lambda ds: ds.isel(z=slice(None, None, -1)), + }, + { + "testcase_name": "repeated_indexing", + "transform": lambda ds: ds.isel(z=slice(1, None)).isel(z=0), + }, + { + "testcase_name": "oindex", + # includes repeated, negative and out of order indices + "transform": lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), + }, + { + "testcase_name": "vindex", + "transform": lambda ds: ds.isel(x=("w", [0, 1]), y=("w", [1, 2])), + }, + { + "testcase_name": "mixed_indexing_types", + "transform": lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), + }, + { + "testcase_name": "select_a_variable", + "transform": lambda ds: ds["foo"], + }, ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_open_zarr(self, transform): + source = xarray.Dataset( + { + "foo": (("x",), np.arange(2), {"local": "local metadata"}), + "bar": (("x", "y"), np.arange(6).reshape(2, 3)), + "baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4)), + }, + coords={ + "x": [1, 2], + "y": pd.to_datetime(["2000-01-01", "2000-01-02", "2000-01-03"]), + "z": ["a", "b", "c", "d"], + }, + attrs={"global": "global metadata"}, + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr('file://' + path) - xarray.testing.assert_identical(source, opened) + expected = transform(source) + actual = transform(xarray_tensorstore.open_zarr(path)).compute() + xarray.testing.assert_identical(actual, expected) - def test_read_dataset(self): - source = xarray.Dataset( - {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}, - coords={'x': np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) - - opened = xarray_tensorstore.open_zarr(path) - read = xarray_tensorstore.read(opened) - - self.assertIsNone(opened.variables['baz']._data.future) - self.assertIsNotNone(read.variables['baz']._data.future) - xarray.testing.assert_identical(read, source) - - def test_read_dataarray(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=('x', 'y', 'z'), - name='baz', - coords={'x': np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - - opened = xarray_tensorstore.open_zarr(path)['baz'] - read = xarray_tensorstore.read(opened) - - self.assertIsNone(opened.variable._data.future) - self.assertIsNotNone(read.variable._data.future) - xarray.testing.assert_identical(read, source) - - def test_mask_and_scale(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=('x', 'y', 'z'), - name='baz', - coords={'x': np.arange(2)}, + @parameterized.parameters( + {"deep": True}, + {"deep": False}, ) + def test_copy(self, deep): + source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + copied = opened.copy(deep=deep) + xarray.testing.assert_identical(copied, source) + + def test_sortby(self): + # regression test for https://github.com/google/xarray-tensorstore/issues/1 + x = np.arange(10) + source = xarray.Dataset({"foo": (("x",), x)}, {"x": x[::-1]}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + opened.sortby("x") # should not crash + + def test_compute(self): + # verify that get_duck_array() is working properly + source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + computed = opened.compute() + computed_data = computed["foo"].variable._data + self.assertNotIsInstance(computed_data, tensorstore.TensorStore) + + def test_open_zarr_from_uri(self): + source = xarray.Dataset( + {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))} + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) + + opened = xarray_tensorstore.open_zarr("file://" + path) + xarray.testing.assert_identical(source, opened) - # invalid fill-value - source.encoding = {'_FillValue': -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = ( - 'variable baz has non-NaN fill value, which is not supported by' - ' xarray-tensorstore: -1. Consider re-opening with' - ' xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling' - ' back to use xarray.open_zarr().' + def test_read_dataset(self): + source = xarray.Dataset( + {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}, + coords={"x": np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) + + opened = xarray_tensorstore.open_zarr(path) + read = xarray_tensorstore.read(opened) + + self.assertIsNone(opened.variables["baz"]._data.future) + self.assertIsNotNone(read.variables["baz"]._data.future) + xarray.testing.assert_identical(read, source) + + def test_read_dataarray(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=("x", "y", "z"), + name="baz", + coords={"x": np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + + opened = xarray_tensorstore.open_zarr(path)["baz"] + read = xarray_tensorstore.read(opened) + + self.assertIsNone(opened.variable._data.future) + self.assertIsNotNone(read.variable._data.future) + xarray.testing.assert_identical(read, source) + + @parameterized.named_parameters( + { + "testcase_name": "basic_indexing", + "key": indexing.BasicIndexer((slice(1, None), slice(None), slice(None))), + "numpy_key": (slice(1, None), slice(None), slice(None)), + "value": np.full((1, 2, 3), -1), + }, + { + "testcase_name": "outer_indexing", + "key": indexing.OuterIndexer((np.array([0]), np.array([1]), slice(None))), + "numpy_key": (np.array([0]), np.array([1]), slice(None)), + "value": np.full((1, 1, 3), -2), + }, + { + "testcase_name": "vectorized_indexing", + "key": indexing.VectorizedIndexer( + (np.array([0, 1]), np.array([0, 1]), slice(None)) + ), + "numpy_key": (np.array([0, 1]), np.array([0, 1]), slice(None)), + "value": np.full((2, 3), -3), + }, ) - with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) - - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] - xarray.testing.assert_equal(actual, source) # no values are masked - - # invalid scaling - source.encoding = {'scale_factor': 10.0} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = 'variable baz uses scale/offset encoding' - with self.assertRaisesRegex(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) - - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] - self.assertFalse(actual.equals(source)) # not scaled properly - - # valid offset (coordinate only) - source.encoding = {} - source.coords['x'].encoding = {'add_offset': -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)['baz'] - xarray.testing.assert_identical(actual, source) - self.assertEqual(actual.coords['x'].encoding['add_offset'], -1) - - -if __name__ == '__main__': - absltest.main() + def test_setitem(self, key, numpy_key, value): + source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + source = tensorstore.array( + source_data, dtype=source_data.dtype, context=tensorstore.Context() + ) + adapter = xarray_tensorstore._TensorStoreAdapter(source) + + adapter[key] = value + result = adapter.array.read().result() + expected = source_data.copy() + expected[numpy_key] = value + print(result) + np.testing.assert_equal(result, expected) + + def test_mask_and_scale(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=("x", "y", "z"), + name="baz", + coords={"x": np.arange(2)}, + ) + + # invalid fill-value + source.encoding = {"_FillValue": -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = ( + "variable baz has non-NaN fill value, which is not supported by" + " xarray-tensorstore: -1. Consider re-opening with" + " xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling" + " back to use xarray.open_zarr()." + ) + with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) + + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + xarray.testing.assert_equal(actual, source) # no values are masked + + # invalid scaling + source.encoding = {"scale_factor": 10.0} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = "variable baz uses scale/offset encoding" + with self.assertRaisesRegex(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) + + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + self.assertFalse(actual.equals(source)) # not scaled properly + + # valid offset (coordinate only) + source.encoding = {} + source.coords["x"].encoding = {"add_offset": -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)["baz"] + xarray.testing.assert_identical(actual, source) + self.assertEqual(actual.coords["x"].encoding["add_offset"], -1) + + +if __name__ == "__main__": + absltest.main() From b9d5b734e2ec83d04669765bc9fbd9070b5716eb Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Tue, 5 Nov 2024 02:09:41 -0500 Subject: [PATCH 02/17] Update xarray_tensorstore_test.py Attempted to match original formatting. --- xarray_tensorstore_test.py | 425 ++++++++++++++++++------------------- 1 file changed, 212 insertions(+), 213 deletions(-) diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 04b0020..a890120 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -18,246 +18,245 @@ import tensorstore import xarray import xarray_tensorstore -from xarray.core import indexing class XarrayTensorstoreTest(parameterized.TestCase): - @parameterized.named_parameters( - # TODO(shoyer): consider using hypothesis to convert these into - # property-based tests + @parameterized.named_parameters( + # TODO(shoyer): consider using hypothesis to convert these into + # property-based tests + { + 'testcase_name': 'base', + 'transform': lambda ds: ds, + }, + { + 'testcase_name': 'transposed', + 'transform': lambda ds: ds.transpose('z', 'x', 'y'), + }, + { + 'testcase_name': 'basic_int', + 'transform': lambda ds: ds.isel(y=1), + }, + { + 'testcase_name': 'negative_int', + 'transform': lambda ds: ds.isel(y=-1), + }, + { + 'testcase_name': 'basic_slice', + 'transform': lambda ds: ds.isel(z=slice(2)), + }, + { + 'testcase_name': 'full_slice', + 'transform': lambda ds: ds.isel(z=slice(0, 4)), + }, + { + 'testcase_name': 'out_of_bounds_slice', + 'transform': lambda ds: ds.isel(z=slice(0, 10)), + }, + { + 'testcase_name': 'strided_slice', + 'transform': lambda ds: ds.isel(z=slice(0, None, 2)), + }, + { + 'testcase_name': 'negative_stride_slice', + 'transform': lambda ds: ds.isel(z=slice(None, None, -1)), + }, + { + 'testcase_name': 'repeated_indexing', + 'transform': lambda ds: ds.isel(z=slice(1, None)).isel(z=0), + }, + { + 'testcase_name': 'oindex', + # includes repeated, negative and out of order indices + 'transform': lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), + }, + { + 'testcase_name': 'vindex', + 'transform': lambda ds: ds.isel(x=('w', [0, 1]), y=('w', [1, 2])), + }, + { + 'testcase_name': 'mixed_indexing_types', + 'transform': lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), + }, + { + 'testcase_name': 'select_a_variable', + 'transform': lambda ds: ds['foo'], + }, + ) + def test_open_zarr(self, transform): + source = xarray.Dataset( { - "testcase_name": "base", - "transform": lambda ds: ds, + 'foo': (('x',), np.arange(2), {'local': 'local metadata'}), + 'bar': (('x', 'y'), np.arange(6).reshape(2, 3)), + 'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4)), }, - { - "testcase_name": "transposed", - "transform": lambda ds: ds.transpose("z", "x", "y"), - }, - { - "testcase_name": "basic_int", - "transform": lambda ds: ds.isel(y=1), - }, - { - "testcase_name": "negative_int", - "transform": lambda ds: ds.isel(y=-1), - }, - { - "testcase_name": "basic_slice", - "transform": lambda ds: ds.isel(z=slice(2)), - }, - { - "testcase_name": "full_slice", - "transform": lambda ds: ds.isel(z=slice(0, 4)), - }, - { - "testcase_name": "out_of_bounds_slice", - "transform": lambda ds: ds.isel(z=slice(0, 10)), - }, - { - "testcase_name": "strided_slice", - "transform": lambda ds: ds.isel(z=slice(0, None, 2)), - }, - { - "testcase_name": "negative_stride_slice", - "transform": lambda ds: ds.isel(z=slice(None, None, -1)), - }, - { - "testcase_name": "repeated_indexing", - "transform": lambda ds: ds.isel(z=slice(1, None)).isel(z=0), - }, - { - "testcase_name": "oindex", - # includes repeated, negative and out of order indices - "transform": lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), - }, - { - "testcase_name": "vindex", - "transform": lambda ds: ds.isel(x=("w", [0, 1]), y=("w", [1, 2])), - }, - { - "testcase_name": "mixed_indexing_types", - "transform": lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), - }, - { - "testcase_name": "select_a_variable", - "transform": lambda ds: ds["foo"], + coords={ + 'x': [1, 2], + 'y': pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03']), + 'z': ['a', 'b', 'c', 'd'], }, + attrs={'global': 'global metadata'}, ) - def test_open_zarr(self, transform): - source = xarray.Dataset( - { - "foo": (("x",), np.arange(2), {"local": "local metadata"}), - "bar": (("x", "y"), np.arange(6).reshape(2, 3)), - "baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4)), - }, - coords={ - "x": [1, 2], - "y": pd.to_datetime(["2000-01-01", "2000-01-02", "2000-01-03"]), - "z": ["a", "b", "c", "d"], - }, - attrs={"global": "global metadata"}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - expected = transform(source) - actual = transform(xarray_tensorstore.open_zarr(path)).compute() - xarray.testing.assert_identical(actual, expected) + expected = transform(source) + actual = transform(xarray_tensorstore.open_zarr(path)).compute() + xarray.testing.assert_identical(actual, expected) - @parameterized.parameters( - {"deep": True}, - {"deep": False}, - ) - def test_copy(self, deep): - source = xarray.Dataset({"foo": (("x",), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - copied = opened.copy(deep=deep) - xarray.testing.assert_identical(copied, source) + @parameterized.parameters( + {'deep': True}, + {'deep': False}, + ) + def test_copy(self, deep): + source = xarray.Dataset({'foo': (('x',), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + copied = opened.copy(deep=deep) + xarray.testing.assert_identical(copied, source) - def test_sortby(self): - # regression test for https://github.com/google/xarray-tensorstore/issues/1 - x = np.arange(10) - source = xarray.Dataset({"foo": (("x",), x)}, {"x": x[::-1]}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - opened.sortby("x") # should not crash + def test_sortby(self): + # regression test for https://github.com/google/xarray-tensorstore/issues/1 + x = np.arange(10) + source = xarray.Dataset({'foo': (('x',), x)}, {'x': x[::-1]}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + opened.sortby('x') # should not crash - def test_compute(self): - # verify that get_duck_array() is working properly - source = xarray.Dataset({"foo": (("x",), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - computed = opened.compute() - computed_data = computed["foo"].variable._data - self.assertNotIsInstance(computed_data, tensorstore.TensorStore) + def test_compute(self): + # verify that get_duck_array() is working properly + source = xarray.Dataset({'foo': (('x',), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + computed = opened.compute() + computed_data = computed['foo'].variable._data + self.assertNotIsInstance(computed_data, tensorstore.TensorStore) - def test_open_zarr_from_uri(self): - source = xarray.Dataset( - {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))} - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_open_zarr_from_uri(self): + source = xarray.Dataset( + {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))} + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr("file://" + path) - xarray.testing.assert_identical(source, opened) + opened = xarray_tensorstore.open_zarr('file://' + path) + xarray.testing.assert_identical(source, opened) - def test_read_dataset(self): - source = xarray.Dataset( - {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}, - coords={"x": np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_read_dataset(self): + source = xarray.Dataset( + {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}, + coords={'x': np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - read = xarray_tensorstore.read(opened) + opened = xarray_tensorstore.open_zarr(path) + read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variables["baz"]._data.future) - self.assertIsNotNone(read.variables["baz"]._data.future) - xarray.testing.assert_identical(read, source) + self.assertIsNone(opened.variables['baz']._data.future) + self.assertIsNotNone(read.variables['baz']._data.future) + xarray.testing.assert_identical(read, source) - def test_read_dataarray(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=("x", "y", "z"), - name="baz", - coords={"x": np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) + def test_read_dataarray(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=('x', 'y', 'z'), + name='baz', + coords={'x': np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path)["baz"] - read = xarray_tensorstore.read(opened) + opened = xarray_tensorstore.open_zarr(path)['baz'] + read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variable._data.future) - self.assertIsNotNone(read.variable._data.future) - xarray.testing.assert_identical(read, source) + self.assertIsNone(opened.variable._data.future) + self.assertIsNotNone(read.variable._data.future) + xarray.testing.assert_identical(read, source) - @parameterized.named_parameters( - { - "testcase_name": "basic_indexing", - "key": indexing.BasicIndexer((slice(1, None), slice(None), slice(None))), - "numpy_key": (slice(1, None), slice(None), slice(None)), - "value": np.full((1, 2, 3), -1), - }, - { - "testcase_name": "outer_indexing", - "key": indexing.OuterIndexer((np.array([0]), np.array([1]), slice(None))), - "numpy_key": (np.array([0]), np.array([1]), slice(None)), - "value": np.full((1, 1, 3), -2), - }, - { - "testcase_name": "vectorized_indexing", - "key": indexing.VectorizedIndexer( - (np.array([0, 1]), np.array([0, 1]), slice(None)) - ), - "numpy_key": (np.array([0, 1]), np.array([0, 1]), slice(None)), - "value": np.full((2, 3), -3), - }, + def test_mask_and_scale(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=('x', 'y', 'z'), + name='baz', + coords={'x': np.arange(2)}, ) - def test_setitem(self, key, numpy_key, value): - source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) - source = tensorstore.array( - source_data, dtype=source_data.dtype, context=tensorstore.Context() - ) - adapter = xarray_tensorstore._TensorStoreAdapter(source) - adapter[key] = value - result = adapter.array.read().result() - expected = source_data.copy() - expected[numpy_key] = value - print(result) - np.testing.assert_equal(result, expected) - - def test_mask_and_scale(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=("x", "y", "z"), - name="baz", - coords={"x": np.arange(2)}, - ) + # invalid fill-value + source.encoding = {'_FillValue': -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = ( + 'variable baz has non-NaN fill value, which is not supported by' + ' xarray-tensorstore: -1. Consider re-opening with' + ' xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling' + ' back to use xarray.open_zarr().' + ) + with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) - # invalid fill-value - source.encoding = {"_FillValue": -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = ( - "variable baz has non-NaN fill value, which is not supported by" - " xarray-tensorstore: -1. Consider re-opening with" - " xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling" - " back to use xarray.open_zarr()." - ) - with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] + xarray.testing.assert_equal(actual, source) # no values are masked - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] - xarray.testing.assert_equal(actual, source) # no values are masked + # invalid scaling + source.encoding = {'scale_factor': 10.0} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = 'variable baz uses scale/offset encoding' + with self.assertRaisesRegex(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) - # invalid scaling - source.encoding = {"scale_factor": 10.0} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = "variable baz uses scale/offset encoding" - with self.assertRaisesRegex(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] + self.assertFalse(actual.equals(source)) # not scaled properly - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] - self.assertFalse(actual.equals(source)) # not scaled properly + # valid offset (coordinate only) + source.encoding = {} + source.coords['x'].encoding = {'add_offset': -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)['baz'] + xarray.testing.assert_identical(actual, source) + self.assertEqual(actual.coords['x'].encoding['add_offset'], -1) - # valid offset (coordinate only) - source.encoding = {} - source.coords["x"].encoding = {"add_offset": -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)["baz"] - xarray.testing.assert_identical(actual, source) - self.assertEqual(actual.coords["x"].encoding["add_offset"], -1) + @parameterized.named_parameters( + { + "testcase_name": "basic_indexing", + "key": indexing.BasicIndexer((slice(1, None), slice(None), slice(None))), + "numpy_key": (slice(1, None), slice(None), slice(None)), + "value": np.full((1, 2, 3), -1), + }, + { + "testcase_name": "outer_indexing", + "key": indexing.OuterIndexer((np.array([0]), np.array([1]), slice(None))), + "numpy_key": (np.array([0]), np.array([1]), slice(None)), + "value": np.full((1, 1, 3), -2), + }, + { + "testcase_name": "vectorized_indexing", + "key": indexing.VectorizedIndexer( + (np.array([0, 1]), np.array([0, 1]), slice(None)) + ), + "numpy_key": (np.array([0, 1]), np.array([0, 1]), slice(None)), + "value": np.full((2, 3), -3), + }, + ) + def test_setitem(self, key, numpy_key, value): + source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + source = tensorstore.array( + source_data, dtype=source_data.dtype, context=tensorstore.Context() + ) + adapter = xarray_tensorstore._TensorStoreAdapter(source) + + adapter[key] = value + result = adapter.array.read().result() + expected = source_data.copy() + expected[numpy_key] = value + print(result) + np.testing.assert_equal(result, expected) -if __name__ == "__main__": - absltest.main() +if __name__ == '__main__': + absltest.main() From 93d34e987c71ac9ddc381ca34b2fd083247a0eeb Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Tue, 5 Nov 2024 02:14:17 -0500 Subject: [PATCH 03/17] Update xarray_tensorstore_test.py Re-add import --- xarray_tensorstore_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index a890120..8a1d0c5 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -17,6 +17,7 @@ import pandas as pd import tensorstore import xarray +from xarray.core import indexing import xarray_tensorstore From 2f626bc264199071b705776ff729dff3db0cc896 Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Tue, 5 Nov 2024 11:51:49 -0500 Subject: [PATCH 04/17] Update xarray_tensorstore.py --- xarray_tensorstore.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 6980e1e..54c1497 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -102,9 +102,6 @@ def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) if isinstance(key, indexing.OuterIndexer): - # TODO(shoyer): fix this for newer versions of Xarray. - # We get the error message: - # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' self.array.oindex[index_tuple] = value elif isinstance(key, indexing.VectorizedIndexer): self.array.vindex[index_tuple] = value From f056ad7c27d94de1a5d34533539a211be9467efa Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 5 Nov 2024 11:59:45 -0500 Subject: [PATCH 05/17] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Format=20with=20t?= =?UTF-8?q?wo=20space=20indentation.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore.py | 4 +-- xarray_tensorstore_test.py | 68 ++++++++++++++++++-------------------- 2 files changed, 35 insertions(+), 37 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 54c1497..3f682bc 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -98,7 +98,7 @@ def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: # like NumPy, not absolute like TensorStore translated = indexed[tensorstore.d[:].translate_to[0]] return type(self)(translated) - + def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) if isinstance(key, indexing.OuterIndexer): @@ -108,7 +108,7 @@ def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: else: assert isinstance(key, indexing.BasicIndexer) self.array[index_tuple] = value - + # xarray>2024.02.0 uses oindex and vindex properties, which are expected to # return objects whose __getitem__ method supports the appropriate form of # indexing. diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 8a1d0c5..c019e7d 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -137,9 +137,7 @@ def test_compute(self): self.assertNotIsInstance(computed_data, tensorstore.TensorStore) def test_open_zarr_from_uri(self): - source = xarray.Dataset( - {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))} - ) + source = xarray.Dataset({'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}) path = self.create_tempdir().full_path source.chunk().to_zarr(path) @@ -223,40 +221,40 @@ def test_mask_and_scale(self): self.assertEqual(actual.coords['x'].encoding['add_offset'], -1) @parameterized.named_parameters( - { - "testcase_name": "basic_indexing", - "key": indexing.BasicIndexer((slice(1, None), slice(None), slice(None))), - "numpy_key": (slice(1, None), slice(None), slice(None)), - "value": np.full((1, 2, 3), -1), - }, - { - "testcase_name": "outer_indexing", - "key": indexing.OuterIndexer((np.array([0]), np.array([1]), slice(None))), - "numpy_key": (np.array([0]), np.array([1]), slice(None)), - "value": np.full((1, 1, 3), -2), - }, - { - "testcase_name": "vectorized_indexing", - "key": indexing.VectorizedIndexer( - (np.array([0, 1]), np.array([0, 1]), slice(None)) - ), - "numpy_key": (np.array([0, 1]), np.array([0, 1]), slice(None)), - "value": np.full((2, 3), -3), - }, + { + 'testcase_name': 'basic_indexing', + 'key': indexing.BasicIndexer((slice(1, None), slice(None), slice(None))), + 'numpy_key': (slice(1, None), slice(None), slice(None)), + 'value': np.full((1, 2, 3), -1), + }, + { + 'testcase_name': 'outer_indexing', + 'key': indexing.OuterIndexer((np.array([0]), np.array([1]), slice(None))), + 'numpy_key': (np.array([0]), np.array([1]), slice(None)), + 'value': np.full((1, 1, 3), -2), + }, + { + 'testcase_name': 'vectorized_indexing', + 'key': indexing.VectorizedIndexer( + (np.array([0, 1]), np.array([0, 1]), slice(None)) + ), + 'numpy_key': (np.array([0, 1]), np.array([0, 1]), slice(None)), + 'value': np.full((2, 3), -3), + }, ) def test_setitem(self, key, numpy_key, value): - source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) - source = tensorstore.array( - source_data, dtype=source_data.dtype, context=tensorstore.Context() - ) - adapter = xarray_tensorstore._TensorStoreAdapter(source) - - adapter[key] = value - result = adapter.array.read().result() - expected = source_data.copy() - expected[numpy_key] = value - print(result) - np.testing.assert_equal(result, expected) + source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + source = tensorstore.array( + source_data, dtype=source_data.dtype, context=tensorstore.Context() + ) + adapter = xarray_tensorstore._TensorStoreAdapter(source) + + adapter[key] = value + result = adapter.array.read().result() + expected = source_data.copy() + expected[numpy_key] = value + print(result) + np.testing.assert_equal(result, expected) if __name__ == '__main__': From 19ac4e2d74235dcc1e65fd7e812decc8be9d3324 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 5 Nov 2024 14:28:48 -0500 Subject: [PATCH 06/17] =?UTF-8?q?feat:=20=E2=9C=A8=20Add=20open=20modes=20?= =?UTF-8?q?"r"=20and=20"r+"=20for=20open=5Fzarr()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore.py | 438 ++++++++++++++++++------------------ xarray_tensorstore_test.py | 440 +++++++++++++++++++------------------ 2 files changed, 452 insertions(+), 426 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 3f682bc..23aef8e 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -26,183 +26,183 @@ from xarray.core import indexing -__version__ = '0.1.4' # keep in sync with setup.py +__version__ = "0.1.4" # keep in sync with setup.py -Index = TypeVar('Index', int, slice, np.ndarray, None) -XarrayData = TypeVar('XarrayData', xarray.Dataset, xarray.DataArray) +Index = TypeVar("Index", int, slice, np.ndarray, None) +XarrayData = TypeVar("XarrayData", xarray.Dataset, xarray.DataArray) def _numpy_to_tensorstore_index(index: Index, size: int) -> Index: - """Switch from NumPy to TensorStore indexing conventions.""" - # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing - if index is None: - return None - elif isinstance(index, int): - # Negative integers do not count from the end in TensorStore - return index + size if index < 0 else index - elif isinstance(index, slice): - start = _numpy_to_tensorstore_index(index.start, size) - stop = _numpy_to_tensorstore_index(index.stop, size) - if stop is not None: - # TensorStore does not allow out of bounds slicing - stop = min(stop, size) - return slice(start, stop, index.step) - else: - assert isinstance(index, np.ndarray) - return np.where(index < 0, index + size, index) + """Switch from NumPy to TensorStore indexing conventions.""" + # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing + if index is None: + return None + elif isinstance(index, int): + # Negative integers do not count from the end in TensorStore + return index + size if index < 0 else index + elif isinstance(index, slice): + start = _numpy_to_tensorstore_index(index.start, size) + stop = _numpy_to_tensorstore_index(index.stop, size) + if stop is not None: + # TensorStore does not allow out of bounds slicing + stop = min(stop, size) + return slice(start, stop, index.step) + else: + assert isinstance(index, np.ndarray) + return np.where(index < 0, index + size, index) @dataclasses.dataclass(frozen=True) class _TensorStoreAdapter(indexing.ExplicitlyIndexed): - """TensorStore array that can be wrapped by xarray.Variable. - - We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not - attempt to load our array into memory as a NumPy array. In the future, this - should be supported by public Xarray APIs, as part of the refactor discussed - in: https://github.com/pydata/xarray/issues/3981 - """ - - array: tensorstore.TensorStore - future: Optional[tensorstore.Future] = None - - @property - def shape(self) -> tuple[int, ...]: - return self.array.shape - - @property - def dtype(self) -> np.dtype: - return self.array.dtype.numpy_dtype - - @property - def ndim(self) -> int: - return len(self.shape) - - @property - def size(self) -> int: - return math.prod(self.shape) - - def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - # TODO(shoyer): fix this for newer versions of Xarray. - # We get the error message: - # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' - indexed = self.array.oindex[index_tuple] - elif isinstance(key, indexing.VectorizedIndexer): - indexed = self.array.vindex[index_tuple] - else: - assert isinstance(key, indexing.BasicIndexer) - indexed = self.array[index_tuple] - # Translate to the origin so repeated indexing is relative to the new bounds - # like NumPy, not absolute like TensorStore - translated = indexed[tensorstore.d[:].translate_to[0]] - return type(self)(translated) - - def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - self.array.oindex[index_tuple] = value - elif isinstance(key, indexing.VectorizedIndexer): - self.array.vindex[index_tuple] = value - else: - assert isinstance(key, indexing.BasicIndexer) - self.array[index_tuple] = value - - # xarray>2024.02.0 uses oindex and vindex properties, which are expected to - # return objects whose __getitem__ method supports the appropriate form of - # indexing. - @property - def oindex(self) -> _TensorStoreAdapter: - return self - - @property - def vindex(self) -> _TensorStoreAdapter: - return self - - def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: - transposed = self.array[tensorstore.d[order].transpose[:]] - return type(self)(transposed) - - def read(self) -> _TensorStoreAdapter: - future = self.array.read() - return type(self)(self.array, future) - - def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore - future = self.array.read() if self.future is None else self.future - return np.asarray(future.result(), dtype=dtype) - - def get_duck_array(self): - # special method for xarray to return an in-memory (computed) representation - return np.asarray(self) - - # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, - # which are needed for Xarray: - # https://github.com/google/tensorstore/issues/109 - # TensorStore objects are immutable, so there's no need to actually copy them. - - def __copy__(self) -> _TensorStoreAdapter: - return type(self)(self.array, self.future) - - def __deepcopy__(self, memo) -> _TensorStoreAdapter: - return self.__copy__() + """TensorStore array that can be wrapped by xarray.Variable. + + We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not + attempt to load our array into memory as a NumPy array. In the future, this + should be supported by public Xarray APIs, as part of the refactor discussed + in: https://github.com/pydata/xarray/issues/3981 + """ + + array: tensorstore.TensorStore + future: Optional[tensorstore.Future] = None + + @property + def shape(self) -> tuple[int, ...]: + return self.array.shape + + @property + def dtype(self) -> np.dtype: + return self.array.dtype.numpy_dtype + + @property + def ndim(self) -> int: + return len(self.shape) + + @property + def size(self) -> int: + return math.prod(self.shape) + + def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + # TODO(shoyer): fix this for newer versions of Xarray. + # We get the error message: + # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' + indexed = self.array.oindex[index_tuple] + elif isinstance(key, indexing.VectorizedIndexer): + indexed = self.array.vindex[index_tuple] + else: + assert isinstance(key, indexing.BasicIndexer) + indexed = self.array[index_tuple] + # Translate to the origin so repeated indexing is relative to the new bounds + # like NumPy, not absolute like TensorStore + translated = indexed[tensorstore.d[:].translate_to[0]] + return type(self)(translated) + + def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + self.array.oindex[index_tuple] = value + elif isinstance(key, indexing.VectorizedIndexer): + self.array.vindex[index_tuple] = value + else: + assert isinstance(key, indexing.BasicIndexer) + self.array[index_tuple] = value + + # xarray>2024.02.0 uses oindex and vindex properties, which are expected to + # return objects whose __getitem__ method supports the appropriate form of + # indexing. + @property + def oindex(self) -> _TensorStoreAdapter: + return self + + @property + def vindex(self) -> _TensorStoreAdapter: + return self + + def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: + transposed = self.array[tensorstore.d[order].transpose[:]] + return type(self)(transposed) + + def read(self) -> _TensorStoreAdapter: + future = self.array.read() + return type(self)(self.array, future) + + def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore + future = self.array.read() if self.future is None else self.future + return np.asarray(future.result(), dtype=dtype) + + def get_duck_array(self): + # special method for xarray to return an in-memory (computed) representation + return np.asarray(self) + + # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, + # which are needed for Xarray: + # https://github.com/google/tensorstore/issues/109 + # TensorStore objects are immutable, so there's no need to actually copy them. + + def __copy__(self) -> _TensorStoreAdapter: + return type(self)(self.array, self.future) + + def __deepcopy__(self, memo) -> _TensorStoreAdapter: + return self.__copy__() def _read_tensorstore( array: indexing.ExplicitlyIndexed, ) -> indexing.ExplicitlyIndexed: - """Starts async reading on a TensorStore array.""" - return array.read() if isinstance(array, _TensorStoreAdapter) else array + """Starts async reading on a TensorStore array.""" + return array.read() if isinstance(array, _TensorStoreAdapter) else array def read(xarraydata: XarrayData, /) -> XarrayData: - """Starts async reads on all TensorStore arrays.""" - # pylint: disable=protected-access - if isinstance(xarraydata, xarray.Dataset): - data = { - name: _read_tensorstore(var.variable._data) - for name, var in xarraydata.data_vars.items() - } - elif isinstance(xarraydata, xarray.DataArray): - data = _read_tensorstore(xarraydata.variable._data) - else: - raise TypeError(f'argument is not a DataArray or Dataset: {xarraydata}') - # pylint: enable=protected-access - return xarraydata.copy(data=data) + """Starts async reads on all TensorStore arrays.""" + # pylint: disable=protected-access + if isinstance(xarraydata, xarray.Dataset): + data = { + name: _read_tensorstore(var.variable._data) + for name, var in xarraydata.data_vars.items() + } + elif isinstance(xarraydata, xarray.DataArray): + data = _read_tensorstore(xarraydata.variable._data) + else: + raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") + # pylint: enable=protected-access + return xarraydata.copy(data=data) -_DEFAULT_STORAGE_DRIVER = 'file' +_DEFAULT_STORAGE_DRIVER = "file" def _zarr_spec_from_path(path: str) -> ...: - if re.match(r'\w+\://', path): # path is a URI - kv_store = path - else: - kv_store = {'driver': _DEFAULT_STORAGE_DRIVER, 'path': path} - return {'driver': 'zarr', 'kvstore': kv_store} + if re.match(r"\w+\://", path): # path is a URI + kv_store = path + else: + kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} + return {"driver": "zarr", "kvstore": kv_store} def _raise_if_mask_and_scale_used_for_data_vars(ds: xarray.Dataset): - """Check a dataset for data variables that would need masking or scaling.""" - advice = ( - 'Consider re-opening with xarray_tensorstore.open_zarr(..., ' - 'mask_and_scale=False), or falling back to use xarray.open_zarr().' - ) - for k in ds: - encoding = ds[k].encoding - for attr in ['_FillValue', 'missing_value']: - fill_value = encoding.get(attr, np.nan) - if fill_value == fill_value: # pylint: disable=comparison-with-itself - raise ValueError( - f'variable {k} has non-NaN fill value, which is not supported by' - f' xarray-tensorstore: {fill_value}. {advice}' - ) - for attr in ['scale_factor', 'add_offset']: - if attr in encoding: - raise ValueError( - f'variable {k} uses scale/offset encoding, which is not supported' - f' by xarray-tensorstore: {encoding}. {advice}' - ) + """Check a dataset for data variables that would need masking or scaling.""" + advice = ( + "Consider re-opening with xarray_tensorstore.open_zarr(..., " + "mask_and_scale=False), or falling back to use xarray.open_zarr()." + ) + for k in ds: + encoding = ds[k].encoding + for attr in ["_FillValue", "missing_value"]: + fill_value = encoding.get(attr, np.nan) + if fill_value == fill_value: # pylint: disable=comparison-with-itself + raise ValueError( + f"variable {k} has non-NaN fill value, which is not supported by" + f" xarray-tensorstore: {fill_value}. {advice}" + ) + for attr in ["scale_factor", "add_offset"]: + if attr in encoding: + raise ValueError( + f"variable {k} uses scale/offset encoding, which is not supported" + f" by xarray-tensorstore: {encoding}. {advice}" + ) def open_zarr( @@ -210,69 +210,73 @@ def open_zarr( *, context: tensorstore.Context | None = None, mask_and_scale: bool = True, + mode: str = "r", ) -> xarray.Dataset: - """Open an xarray.Dataset from Zarr using TensorStore. - - For best performance, explicitly call `read()` to asynchronously load data - in parallel. Otherwise, xarray's `.compute()` method will load each variable's - data in sequence. - - Example usage: - - import xarray_tensorstore - - ds = xarray_tensorstore.open_zarr(path) - - # indexing & transposing is lazy - example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) - - # start reading data asynchronously - read_example = xarray_tensorstore.read(example) - - # blocking conversion of the data into NumPy arrays - numpy_example = read_example.compute() - - Args: - path: path or URI to Zarr group to open. - context: TensorStore configuration options to use when opening arrays. - mask_and_scale: if True (default), attempt to apply masking and scaling like - xarray.open_zarr(). This is only supported for coordinate variables and - otherwise will raise an error. - - Returns: - Dataset with all data variables opened via TensorStore. - """ - # We use xarray.open_zarr (which uses Zarr Python internally) to open the - # initial version of the dataset for a few reasons: - # 1. TensorStore does not support Zarr groups or array attributes, which we - # need to open in the xarray.Dataset. We use Zarr Python instead of - # parsing the raw Zarr metadata files ourselves. - # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. - # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric - # to datetime64 dtypes) that we would otherwise need to invoke explicitly - # via xarray.decode_cf(). - # - # Fortunately (2) and (3) are most commonly encountered on small coordinate - # arrays, for which the performance advantages of TensorStore are irrelevant. - - if context is None: - context = tensorstore.Context() - - # chunks=None means avoid using dask - ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) - - if mask_and_scale: - # Data variables get replaced below with _TensorStoreAdapter arrays, which - # don't get masked or scaled. Raising an error avoids surprising users with - # incorrect data values. - _raise_if_mask_and_scale_used_for_data_vars(ds) - - specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} - array_futures = { - k: tensorstore.open(spec, read=True, write=False, context=context) - for k, spec in specs.items() - } - arrays = {k: v.result() for k, v in array_futures.items()} - new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} - - return ds.copy(data=new_data) + """Open an xarray.Dataset from Zarr using TensorStore. + + For best performance, explicitly call `read()` to asynchronously load data + in parallel. Otherwise, xarray's `.compute()` method will load each variable's + data in sequence. + + Example usage: + + import xarray_tensorstore + + ds = xarray_tensorstore.open_zarr(path) + + # indexing & transposing is lazy + example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) + + # start reading data asynchronously + read_example = xarray_tensorstore.read(example) + + # blocking conversion of the data into NumPy arrays + numpy_example = read_example.compute() + + Args: + path: path or URI to Zarr group to open. + context: TensorStore configuration options to use when opening arrays. + mask_and_scale: if True (default), attempt to apply masking and scaling like + xarray.open_zarr(). This is only supported for coordinate variables and + otherwise will raise an error. + mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). + + Returns: + Dataset with all data variables opened via TensorStore. + """ + # We use xarray.open_zarr (which uses Zarr Python internally) to open the + # initial version of the dataset for a few reasons: + # 1. TensorStore does not support Zarr groups or array attributes, which we + # need to open in the xarray.Dataset. We use Zarr Python instead of + # parsing the raw Zarr metadata files ourselves. + # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. + # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric + # to datetime64 dtypes) that we would otherwise need to invoke explicitly + # via xarray.decode_cf(). + # + # Fortunately (2) and (3) are most commonly encountered on small coordinate + # arrays, for which the performance advantages of TensorStore are irrelevant. + + if context is None: + context = tensorstore.Context() + + # chunks=None means avoid using dask + ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) + + if mask_and_scale: + # Data variables get replaced below with _TensorStoreAdapter arrays, which + # don't get masked or scaled. Raising an error avoids surprising users with + # incorrect data values. + _raise_if_mask_and_scale_used_for_data_vars(ds) + + specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} + array_futures = { + k: tensorstore.open( + spec, read=True, write=mode.lower() == "r+", context=context + ) + for k, spec in specs.items() + } + arrays = {k: v.result() for k, v in array_futures.items()} + new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} + + return ds.copy(data=new_data) diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index c019e7d..0a8af7d 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -15,6 +15,7 @@ from absl.testing import parameterized import numpy as np import pandas as pd +import pytest import tensorstore import xarray from xarray.core import indexing @@ -23,239 +24,260 @@ class XarrayTensorstoreTest(parameterized.TestCase): - @parameterized.named_parameters( - # TODO(shoyer): consider using hypothesis to convert these into - # property-based tests - { - 'testcase_name': 'base', - 'transform': lambda ds: ds, - }, - { - 'testcase_name': 'transposed', - 'transform': lambda ds: ds.transpose('z', 'x', 'y'), - }, - { - 'testcase_name': 'basic_int', - 'transform': lambda ds: ds.isel(y=1), - }, - { - 'testcase_name': 'negative_int', - 'transform': lambda ds: ds.isel(y=-1), - }, - { - 'testcase_name': 'basic_slice', - 'transform': lambda ds: ds.isel(z=slice(2)), - }, - { - 'testcase_name': 'full_slice', - 'transform': lambda ds: ds.isel(z=slice(0, 4)), - }, - { - 'testcase_name': 'out_of_bounds_slice', - 'transform': lambda ds: ds.isel(z=slice(0, 10)), - }, - { - 'testcase_name': 'strided_slice', - 'transform': lambda ds: ds.isel(z=slice(0, None, 2)), - }, - { - 'testcase_name': 'negative_stride_slice', - 'transform': lambda ds: ds.isel(z=slice(None, None, -1)), - }, - { - 'testcase_name': 'repeated_indexing', - 'transform': lambda ds: ds.isel(z=slice(1, None)).isel(z=0), - }, - { - 'testcase_name': 'oindex', - # includes repeated, negative and out of order indices - 'transform': lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), - }, - { - 'testcase_name': 'vindex', - 'transform': lambda ds: ds.isel(x=('w', [0, 1]), y=('w', [1, 2])), - }, - { - 'testcase_name': 'mixed_indexing_types', - 'transform': lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), - }, - { - 'testcase_name': 'select_a_variable', - 'transform': lambda ds: ds['foo'], - }, - ) - def test_open_zarr(self, transform): - source = xarray.Dataset( + @parameterized.named_parameters( + # TODO(shoyer): consider using hypothesis to convert these into + # property-based tests { - 'foo': (('x',), np.arange(2), {'local': 'local metadata'}), - 'bar': (('x', 'y'), np.arange(6).reshape(2, 3)), - 'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4)), + "testcase_name": "base", + "transform": lambda ds: ds, }, - coords={ - 'x': [1, 2], - 'y': pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03']), - 'z': ['a', 'b', 'c', 'd'], + { + "testcase_name": "transposed", + "transform": lambda ds: ds.transpose("z", "x", "y"), + }, + { + "testcase_name": "basic_int", + "transform": lambda ds: ds.isel(y=1), + }, + { + "testcase_name": "negative_int", + "transform": lambda ds: ds.isel(y=-1), + }, + { + "testcase_name": "basic_slice", + "transform": lambda ds: ds.isel(z=slice(2)), + }, + { + "testcase_name": "full_slice", + "transform": lambda ds: ds.isel(z=slice(0, 4)), + }, + { + "testcase_name": "out_of_bounds_slice", + "transform": lambda ds: ds.isel(z=slice(0, 10)), + }, + { + "testcase_name": "strided_slice", + "transform": lambda ds: ds.isel(z=slice(0, None, 2)), + }, + { + "testcase_name": "negative_stride_slice", + "transform": lambda ds: ds.isel(z=slice(None, None, -1)), + }, + { + "testcase_name": "repeated_indexing", + "transform": lambda ds: ds.isel(z=slice(1, None)).isel(z=0), + }, + { + "testcase_name": "oindex", + # includes repeated, negative and out of order indices + "transform": lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), + }, + { + "testcase_name": "vindex", + "transform": lambda ds: ds.isel(x=("w", [0, 1]), y=("w", [1, 2])), + }, + { + "testcase_name": "mixed_indexing_types", + "transform": lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), + }, + { + "testcase_name": "select_a_variable", + "transform": lambda ds: ds["foo"], }, - attrs={'global': 'global metadata'}, ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_open_zarr(self, transform): + source = xarray.Dataset( + { + "foo": (("x",), np.arange(2), {"local": "local metadata"}), + "bar": (("x", "y"), np.arange(6).reshape(2, 3)), + "baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4)), + }, + coords={ + "x": [1, 2], + "y": pd.to_datetime(["2000-01-01", "2000-01-02", "2000-01-03"]), + "z": ["a", "b", "c", "d"], + }, + attrs={"global": "global metadata"}, + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - expected = transform(source) - actual = transform(xarray_tensorstore.open_zarr(path)).compute() - xarray.testing.assert_identical(actual, expected) + expected = transform(source) + actual = transform(xarray_tensorstore.open_zarr(path)).compute() + xarray.testing.assert_identical(actual, expected) - @parameterized.parameters( - {'deep': True}, - {'deep': False}, - ) - def test_copy(self, deep): - source = xarray.Dataset({'foo': (('x',), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - copied = opened.copy(deep=deep) - xarray.testing.assert_identical(copied, source) + @parameterized.parameters( + {"deep": True}, + {"deep": False}, + ) + def test_copy(self, deep): + source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + copied = opened.copy(deep=deep) + xarray.testing.assert_identical(copied, source) - def test_sortby(self): - # regression test for https://github.com/google/xarray-tensorstore/issues/1 - x = np.arange(10) - source = xarray.Dataset({'foo': (('x',), x)}, {'x': x[::-1]}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - opened.sortby('x') # should not crash + def test_sortby(self): + # regression test for https://github.com/google/xarray-tensorstore/issues/1 + x = np.arange(10) + source = xarray.Dataset({"foo": (("x",), x)}, {"x": x[::-1]}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + opened.sortby("x") # should not crash - def test_compute(self): - # verify that get_duck_array() is working properly - source = xarray.Dataset({'foo': (('x',), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - computed = opened.compute() - computed_data = computed['foo'].variable._data - self.assertNotIsInstance(computed_data, tensorstore.TensorStore) + def test_compute(self): + # verify that get_duck_array() is working properly + source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + computed = opened.compute() + computed_data = computed["foo"].variable._data + self.assertNotIsInstance(computed_data, tensorstore.TensorStore) - def test_open_zarr_from_uri(self): - source = xarray.Dataset({'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_open_zarr_from_uri(self): + source = xarray.Dataset( + {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))} + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr('file://' + path) - xarray.testing.assert_identical(source, opened) + opened = xarray_tensorstore.open_zarr("file://" + path) + xarray.testing.assert_identical(source, opened) - def test_read_dataset(self): - source = xarray.Dataset( - {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}, - coords={'x': np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_read_dataset(self): + source = xarray.Dataset( + {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}, + coords={"x": np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - read = xarray_tensorstore.read(opened) + opened = xarray_tensorstore.open_zarr(path) + read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variables['baz']._data.future) - self.assertIsNotNone(read.variables['baz']._data.future) - xarray.testing.assert_identical(read, source) + self.assertIsNone(opened.variables["baz"]._data.future) + self.assertIsNotNone(read.variables["baz"]._data.future) + xarray.testing.assert_identical(read, source) - def test_read_dataarray(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=('x', 'y', 'z'), - name='baz', - coords={'x': np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) + def test_read_dataarray(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=("x", "y", "z"), + name="baz", + coords={"x": np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path)['baz'] - read = xarray_tensorstore.read(opened) + opened = xarray_tensorstore.open_zarr(path)["baz"] + read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variable._data.future) - self.assertIsNotNone(read.variable._data.future) - xarray.testing.assert_identical(read, source) + self.assertIsNone(opened.variable._data.future) + self.assertIsNotNone(read.variable._data.future) + xarray.testing.assert_identical(read, source) - def test_mask_and_scale(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=('x', 'y', 'z'), - name='baz', - coords={'x': np.arange(2)}, - ) + def test_mask_and_scale(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=("x", "y", "z"), + name="baz", + coords={"x": np.arange(2)}, + ) - # invalid fill-value - source.encoding = {'_FillValue': -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = ( - 'variable baz has non-NaN fill value, which is not supported by' - ' xarray-tensorstore: -1. Consider re-opening with' - ' xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling' - ' back to use xarray.open_zarr().' - ) - with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) + # invalid fill-value + source.encoding = {"_FillValue": -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = ( + "variable baz has non-NaN fill value, which is not supported by" + " xarray-tensorstore: -1. Consider re-opening with" + " xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling" + " back to use xarray.open_zarr()." + ) + with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] - xarray.testing.assert_equal(actual, source) # no values are masked + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + xarray.testing.assert_equal(actual, source) # no values are masked - # invalid scaling - source.encoding = {'scale_factor': 10.0} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = 'variable baz uses scale/offset encoding' - with self.assertRaisesRegex(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) + # invalid scaling + source.encoding = {"scale_factor": 10.0} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = "variable baz uses scale/offset encoding" + with self.assertRaisesRegex(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] - self.assertFalse(actual.equals(source)) # not scaled properly + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + self.assertFalse(actual.equals(source)) # not scaled properly - # valid offset (coordinate only) - source.encoding = {} - source.coords['x'].encoding = {'add_offset': -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)['baz'] - xarray.testing.assert_identical(actual, source) - self.assertEqual(actual.coords['x'].encoding['add_offset'], -1) + # valid offset (coordinate only) + source.encoding = {} + source.coords["x"].encoding = {"add_offset": -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)["baz"] + xarray.testing.assert_identical(actual, source) + self.assertEqual(actual.coords["x"].encoding["add_offset"], -1) - @parameterized.named_parameters( - { - 'testcase_name': 'basic_indexing', - 'key': indexing.BasicIndexer((slice(1, None), slice(None), slice(None))), - 'numpy_key': (slice(1, None), slice(None), slice(None)), - 'value': np.full((1, 2, 3), -1), - }, - { - 'testcase_name': 'outer_indexing', - 'key': indexing.OuterIndexer((np.array([0]), np.array([1]), slice(None))), - 'numpy_key': (np.array([0]), np.array([1]), slice(None)), - 'value': np.full((1, 1, 3), -2), - }, - { - 'testcase_name': 'vectorized_indexing', - 'key': indexing.VectorizedIndexer( - (np.array([0, 1]), np.array([0, 1]), slice(None)) - ), - 'numpy_key': (np.array([0, 1]), np.array([0, 1]), slice(None)), - 'value': np.full((2, 3), -3), - }, - ) - def test_setitem(self, key, numpy_key, value): - source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) - source = tensorstore.array( - source_data, dtype=source_data.dtype, context=tensorstore.Context() + @parameterized.named_parameters( + { + "testcase_name": "basic_indexing", + "key": (slice(1, None), slice(None), slice(None)), + "value": np.full((1, 2, 3), -1), + "mode": "r+", + }, + { + "testcase_name": "outer_indexing", + "key": (np.array([0]), np.array([1]), slice(None)), + "value": np.full((1, 1, 3), -2), + "mode": "r+", + }, + { + "testcase_name": "vectorized_indexing", + "key": (np.array([0]), np.array([0, 1]), slice(None)), + "value": np.full((2, 3), -3), + "mode": "r+", + }, + { + "testcase_name": "read_only", + "key": (slice(1, None), slice(None), slice(None)), + "value": np.full((1, 2, 3), -1), + "mode": "r", + }, ) - adapter = xarray_tensorstore._TensorStoreAdapter(source) + def test_setitem(self, key, value, mode): + source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + source = xarray.DataArray( + source_data, + dims=("x", "y", "z"), + name="baz", + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + + opened = xarray_tensorstore.open_zarr(path, mode=mode)["baz"] + if mode == "r": + with pytest.raises(ValueError): + opened[key] = value + return + + opened[key] = value + read = xarray_tensorstore.read(opened) + + expected_data = source_data.copy() + expected_data[key] = value + expected = xarray.DataArray( + expected_data, + dims=("x", "y", "z"), + name="baz", + ) - adapter[key] = value - result = adapter.array.read().result() - expected = source_data.copy() - expected[numpy_key] = value - print(result) - np.testing.assert_equal(result, expected) + xarray.testing.assert_equal(read, expected) -if __name__ == '__main__': - absltest.main() +if __name__ == "__main__": + absltest.main() From 3ee6a778cb294c2c04a778bf667c0d7b81f324ae Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 5 Nov 2024 14:30:03 -0500 Subject: [PATCH 07/17] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Format=20with=20p?= =?UTF-8?q?yink?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore.py | 430 ++++++++++++++++++----------------- xarray_tensorstore_test.py | 450 ++++++++++++++++++------------------- 2 files changed, 438 insertions(+), 442 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 23aef8e..6dbffe8 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -34,175 +34,175 @@ def _numpy_to_tensorstore_index(index: Index, size: int) -> Index: - """Switch from NumPy to TensorStore indexing conventions.""" - # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing - if index is None: - return None - elif isinstance(index, int): - # Negative integers do not count from the end in TensorStore - return index + size if index < 0 else index - elif isinstance(index, slice): - start = _numpy_to_tensorstore_index(index.start, size) - stop = _numpy_to_tensorstore_index(index.stop, size) - if stop is not None: - # TensorStore does not allow out of bounds slicing - stop = min(stop, size) - return slice(start, stop, index.step) - else: - assert isinstance(index, np.ndarray) - return np.where(index < 0, index + size, index) + """Switch from NumPy to TensorStore indexing conventions.""" + # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing + if index is None: + return None + elif isinstance(index, int): + # Negative integers do not count from the end in TensorStore + return index + size if index < 0 else index + elif isinstance(index, slice): + start = _numpy_to_tensorstore_index(index.start, size) + stop = _numpy_to_tensorstore_index(index.stop, size) + if stop is not None: + # TensorStore does not allow out of bounds slicing + stop = min(stop, size) + return slice(start, stop, index.step) + else: + assert isinstance(index, np.ndarray) + return np.where(index < 0, index + size, index) @dataclasses.dataclass(frozen=True) class _TensorStoreAdapter(indexing.ExplicitlyIndexed): - """TensorStore array that can be wrapped by xarray.Variable. - - We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not - attempt to load our array into memory as a NumPy array. In the future, this - should be supported by public Xarray APIs, as part of the refactor discussed - in: https://github.com/pydata/xarray/issues/3981 - """ - - array: tensorstore.TensorStore - future: Optional[tensorstore.Future] = None - - @property - def shape(self) -> tuple[int, ...]: - return self.array.shape - - @property - def dtype(self) -> np.dtype: - return self.array.dtype.numpy_dtype - - @property - def ndim(self) -> int: - return len(self.shape) - - @property - def size(self) -> int: - return math.prod(self.shape) - - def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - # TODO(shoyer): fix this for newer versions of Xarray. - # We get the error message: - # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' - indexed = self.array.oindex[index_tuple] - elif isinstance(key, indexing.VectorizedIndexer): - indexed = self.array.vindex[index_tuple] - else: - assert isinstance(key, indexing.BasicIndexer) - indexed = self.array[index_tuple] - # Translate to the origin so repeated indexing is relative to the new bounds - # like NumPy, not absolute like TensorStore - translated = indexed[tensorstore.d[:].translate_to[0]] - return type(self)(translated) - - def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - self.array.oindex[index_tuple] = value - elif isinstance(key, indexing.VectorizedIndexer): - self.array.vindex[index_tuple] = value - else: - assert isinstance(key, indexing.BasicIndexer) - self.array[index_tuple] = value - - # xarray>2024.02.0 uses oindex and vindex properties, which are expected to - # return objects whose __getitem__ method supports the appropriate form of - # indexing. - @property - def oindex(self) -> _TensorStoreAdapter: - return self - - @property - def vindex(self) -> _TensorStoreAdapter: - return self - - def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: - transposed = self.array[tensorstore.d[order].transpose[:]] - return type(self)(transposed) - - def read(self) -> _TensorStoreAdapter: - future = self.array.read() - return type(self)(self.array, future) - - def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore - future = self.array.read() if self.future is None else self.future - return np.asarray(future.result(), dtype=dtype) - - def get_duck_array(self): - # special method for xarray to return an in-memory (computed) representation - return np.asarray(self) - - # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, - # which are needed for Xarray: - # https://github.com/google/tensorstore/issues/109 - # TensorStore objects are immutable, so there's no need to actually copy them. - - def __copy__(self) -> _TensorStoreAdapter: - return type(self)(self.array, self.future) - - def __deepcopy__(self, memo) -> _TensorStoreAdapter: - return self.__copy__() + """TensorStore array that can be wrapped by xarray.Variable. + + We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not + attempt to load our array into memory as a NumPy array. In the future, this + should be supported by public Xarray APIs, as part of the refactor discussed + in: https://github.com/pydata/xarray/issues/3981 + """ + + array: tensorstore.TensorStore + future: Optional[tensorstore.Future] = None + + @property + def shape(self) -> tuple[int, ...]: + return self.array.shape + + @property + def dtype(self) -> np.dtype: + return self.array.dtype.numpy_dtype + + @property + def ndim(self) -> int: + return len(self.shape) + + @property + def size(self) -> int: + return math.prod(self.shape) + + def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + # TODO(shoyer): fix this for newer versions of Xarray. + # We get the error message: + # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' + indexed = self.array.oindex[index_tuple] + elif isinstance(key, indexing.VectorizedIndexer): + indexed = self.array.vindex[index_tuple] + else: + assert isinstance(key, indexing.BasicIndexer) + indexed = self.array[index_tuple] + # Translate to the origin so repeated indexing is relative to the new bounds + # like NumPy, not absolute like TensorStore + translated = indexed[tensorstore.d[:].translate_to[0]] + return type(self)(translated) + + def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + self.array.oindex[index_tuple] = value + elif isinstance(key, indexing.VectorizedIndexer): + self.array.vindex[index_tuple] = value + else: + assert isinstance(key, indexing.BasicIndexer) + self.array[index_tuple] = value + + # xarray>2024.02.0 uses oindex and vindex properties, which are expected to + # return objects whose __getitem__ method supports the appropriate form of + # indexing. + @property + def oindex(self) -> _TensorStoreAdapter: + return self + + @property + def vindex(self) -> _TensorStoreAdapter: + return self + + def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: + transposed = self.array[tensorstore.d[order].transpose[:]] + return type(self)(transposed) + + def read(self) -> _TensorStoreAdapter: + future = self.array.read() + return type(self)(self.array, future) + + def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore + future = self.array.read() if self.future is None else self.future + return np.asarray(future.result(), dtype=dtype) + + def get_duck_array(self): + # special method for xarray to return an in-memory (computed) representation + return np.asarray(self) + + # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, + # which are needed for Xarray: + # https://github.com/google/tensorstore/issues/109 + # TensorStore objects are immutable, so there's no need to actually copy them. + + def __copy__(self) -> _TensorStoreAdapter: + return type(self)(self.array, self.future) + + def __deepcopy__(self, memo) -> _TensorStoreAdapter: + return self.__copy__() def _read_tensorstore( array: indexing.ExplicitlyIndexed, ) -> indexing.ExplicitlyIndexed: - """Starts async reading on a TensorStore array.""" - return array.read() if isinstance(array, _TensorStoreAdapter) else array + """Starts async reading on a TensorStore array.""" + return array.read() if isinstance(array, _TensorStoreAdapter) else array def read(xarraydata: XarrayData, /) -> XarrayData: - """Starts async reads on all TensorStore arrays.""" - # pylint: disable=protected-access - if isinstance(xarraydata, xarray.Dataset): - data = { - name: _read_tensorstore(var.variable._data) - for name, var in xarraydata.data_vars.items() - } - elif isinstance(xarraydata, xarray.DataArray): - data = _read_tensorstore(xarraydata.variable._data) - else: - raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") - # pylint: enable=protected-access - return xarraydata.copy(data=data) + """Starts async reads on all TensorStore arrays.""" + # pylint: disable=protected-access + if isinstance(xarraydata, xarray.Dataset): + data = { + name: _read_tensorstore(var.variable._data) + for name, var in xarraydata.data_vars.items() + } + elif isinstance(xarraydata, xarray.DataArray): + data = _read_tensorstore(xarraydata.variable._data) + else: + raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") + # pylint: enable=protected-access + return xarraydata.copy(data=data) _DEFAULT_STORAGE_DRIVER = "file" def _zarr_spec_from_path(path: str) -> ...: - if re.match(r"\w+\://", path): # path is a URI - kv_store = path - else: - kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} - return {"driver": "zarr", "kvstore": kv_store} + if re.match(r"\w+\://", path): # path is a URI + kv_store = path + else: + kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} + return {"driver": "zarr", "kvstore": kv_store} def _raise_if_mask_and_scale_used_for_data_vars(ds: xarray.Dataset): - """Check a dataset for data variables that would need masking or scaling.""" - advice = ( - "Consider re-opening with xarray_tensorstore.open_zarr(..., " - "mask_and_scale=False), or falling back to use xarray.open_zarr()." - ) - for k in ds: - encoding = ds[k].encoding - for attr in ["_FillValue", "missing_value"]: - fill_value = encoding.get(attr, np.nan) - if fill_value == fill_value: # pylint: disable=comparison-with-itself - raise ValueError( - f"variable {k} has non-NaN fill value, which is not supported by" - f" xarray-tensorstore: {fill_value}. {advice}" - ) - for attr in ["scale_factor", "add_offset"]: - if attr in encoding: - raise ValueError( - f"variable {k} uses scale/offset encoding, which is not supported" - f" by xarray-tensorstore: {encoding}. {advice}" - ) + """Check a dataset for data variables that would need masking or scaling.""" + advice = ( + "Consider re-opening with xarray_tensorstore.open_zarr(..., " + "mask_and_scale=False), or falling back to use xarray.open_zarr()." + ) + for k in ds: + encoding = ds[k].encoding + for attr in ["_FillValue", "missing_value"]: + fill_value = encoding.get(attr, np.nan) + if fill_value == fill_value: # pylint: disable=comparison-with-itself + raise ValueError( + f"variable {k} has non-NaN fill value, which is not supported by" + f" xarray-tensorstore: {fill_value}. {advice}" + ) + for attr in ["scale_factor", "add_offset"]: + if attr in encoding: + raise ValueError( + f"variable {k} uses scale/offset encoding, which is not supported" + f" by xarray-tensorstore: {encoding}. {advice}" + ) def open_zarr( @@ -212,71 +212,69 @@ def open_zarr( mask_and_scale: bool = True, mode: str = "r", ) -> xarray.Dataset: - """Open an xarray.Dataset from Zarr using TensorStore. - - For best performance, explicitly call `read()` to asynchronously load data - in parallel. Otherwise, xarray's `.compute()` method will load each variable's - data in sequence. - - Example usage: - - import xarray_tensorstore - - ds = xarray_tensorstore.open_zarr(path) - - # indexing & transposing is lazy - example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) - - # start reading data asynchronously - read_example = xarray_tensorstore.read(example) - - # blocking conversion of the data into NumPy arrays - numpy_example = read_example.compute() - - Args: - path: path or URI to Zarr group to open. - context: TensorStore configuration options to use when opening arrays. - mask_and_scale: if True (default), attempt to apply masking and scaling like - xarray.open_zarr(). This is only supported for coordinate variables and - otherwise will raise an error. - mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). - - Returns: - Dataset with all data variables opened via TensorStore. - """ - # We use xarray.open_zarr (which uses Zarr Python internally) to open the - # initial version of the dataset for a few reasons: - # 1. TensorStore does not support Zarr groups or array attributes, which we - # need to open in the xarray.Dataset. We use Zarr Python instead of - # parsing the raw Zarr metadata files ourselves. - # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. - # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric - # to datetime64 dtypes) that we would otherwise need to invoke explicitly - # via xarray.decode_cf(). - # - # Fortunately (2) and (3) are most commonly encountered on small coordinate - # arrays, for which the performance advantages of TensorStore are irrelevant. - - if context is None: - context = tensorstore.Context() - - # chunks=None means avoid using dask - ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) - - if mask_and_scale: - # Data variables get replaced below with _TensorStoreAdapter arrays, which - # don't get masked or scaled. Raising an error avoids surprising users with - # incorrect data values. - _raise_if_mask_and_scale_used_for_data_vars(ds) - - specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} - array_futures = { - k: tensorstore.open( - spec, read=True, write=mode.lower() == "r+", context=context - ) - for k, spec in specs.items() - } - arrays = {k: v.result() for k, v in array_futures.items()} - new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} - - return ds.copy(data=new_data) + """Open an xarray.Dataset from Zarr using TensorStore. + + For best performance, explicitly call `read()` to asynchronously load data + in parallel. Otherwise, xarray's `.compute()` method will load each variable's + data in sequence. + + Example usage: + + import xarray_tensorstore + + ds = xarray_tensorstore.open_zarr(path) + + # indexing & transposing is lazy + example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) + + # start reading data asynchronously + read_example = xarray_tensorstore.read(example) + + # blocking conversion of the data into NumPy arrays + numpy_example = read_example.compute() + + Args: + path: path or URI to Zarr group to open. + context: TensorStore configuration options to use when opening arrays. + mask_and_scale: if True (default), attempt to apply masking and scaling like + xarray.open_zarr(). This is only supported for coordinate variables and + otherwise will raise an error. + mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). + + Returns: + Dataset with all data variables opened via TensorStore. + """ + # We use xarray.open_zarr (which uses Zarr Python internally) to open the + # initial version of the dataset for a few reasons: + # 1. TensorStore does not support Zarr groups or array attributes, which we + # need to open in the xarray.Dataset. We use Zarr Python instead of + # parsing the raw Zarr metadata files ourselves. + # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. + # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric + # to datetime64 dtypes) that we would otherwise need to invoke explicitly + # via xarray.decode_cf(). + # + # Fortunately (2) and (3) are most commonly encountered on small coordinate + # arrays, for which the performance advantages of TensorStore are irrelevant. + + if context is None: + context = tensorstore.Context() + + # chunks=None means avoid using dask + ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) + + if mask_and_scale: + # Data variables get replaced below with _TensorStoreAdapter arrays, which + # don't get masked or scaled. Raising an error avoids surprising users with + # incorrect data values. + _raise_if_mask_and_scale_used_for_data_vars(ds) + + specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} + array_futures = { + k: tensorstore.open(spec, read=True, write=mode.lower() == "r+", context=context) + for k, spec in specs.items() + } + arrays = {k: v.result() for k, v in array_futures.items()} + new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} + + return ds.copy(data=new_data) diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 0a8af7d..879db19 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -24,260 +24,258 @@ class XarrayTensorstoreTest(parameterized.TestCase): - @parameterized.named_parameters( - # TODO(shoyer): consider using hypothesis to convert these into - # property-based tests + @parameterized.named_parameters( + # TODO(shoyer): consider using hypothesis to convert these into + # property-based tests + { + "testcase_name": "base", + "transform": lambda ds: ds, + }, + { + "testcase_name": "transposed", + "transform": lambda ds: ds.transpose("z", "x", "y"), + }, + { + "testcase_name": "basic_int", + "transform": lambda ds: ds.isel(y=1), + }, + { + "testcase_name": "negative_int", + "transform": lambda ds: ds.isel(y=-1), + }, + { + "testcase_name": "basic_slice", + "transform": lambda ds: ds.isel(z=slice(2)), + }, + { + "testcase_name": "full_slice", + "transform": lambda ds: ds.isel(z=slice(0, 4)), + }, + { + "testcase_name": "out_of_bounds_slice", + "transform": lambda ds: ds.isel(z=slice(0, 10)), + }, + { + "testcase_name": "strided_slice", + "transform": lambda ds: ds.isel(z=slice(0, None, 2)), + }, + { + "testcase_name": "negative_stride_slice", + "transform": lambda ds: ds.isel(z=slice(None, None, -1)), + }, + { + "testcase_name": "repeated_indexing", + "transform": lambda ds: ds.isel(z=slice(1, None)).isel(z=0), + }, + { + "testcase_name": "oindex", + # includes repeated, negative and out of order indices + "transform": lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), + }, + { + "testcase_name": "vindex", + "transform": lambda ds: ds.isel(x=("w", [0, 1]), y=("w", [1, 2])), + }, + { + "testcase_name": "mixed_indexing_types", + "transform": lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), + }, + { + "testcase_name": "select_a_variable", + "transform": lambda ds: ds["foo"], + }, + ) + def test_open_zarr(self, transform): + source = xarray.Dataset( { - "testcase_name": "base", - "transform": lambda ds: ds, + "foo": (("x",), np.arange(2), {"local": "local metadata"}), + "bar": (("x", "y"), np.arange(6).reshape(2, 3)), + "baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4)), }, - { - "testcase_name": "transposed", - "transform": lambda ds: ds.transpose("z", "x", "y"), - }, - { - "testcase_name": "basic_int", - "transform": lambda ds: ds.isel(y=1), - }, - { - "testcase_name": "negative_int", - "transform": lambda ds: ds.isel(y=-1), - }, - { - "testcase_name": "basic_slice", - "transform": lambda ds: ds.isel(z=slice(2)), - }, - { - "testcase_name": "full_slice", - "transform": lambda ds: ds.isel(z=slice(0, 4)), - }, - { - "testcase_name": "out_of_bounds_slice", - "transform": lambda ds: ds.isel(z=slice(0, 10)), - }, - { - "testcase_name": "strided_slice", - "transform": lambda ds: ds.isel(z=slice(0, None, 2)), - }, - { - "testcase_name": "negative_stride_slice", - "transform": lambda ds: ds.isel(z=slice(None, None, -1)), - }, - { - "testcase_name": "repeated_indexing", - "transform": lambda ds: ds.isel(z=slice(1, None)).isel(z=0), - }, - { - "testcase_name": "oindex", - # includes repeated, negative and out of order indices - "transform": lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), - }, - { - "testcase_name": "vindex", - "transform": lambda ds: ds.isel(x=("w", [0, 1]), y=("w", [1, 2])), - }, - { - "testcase_name": "mixed_indexing_types", - "transform": lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), - }, - { - "testcase_name": "select_a_variable", - "transform": lambda ds: ds["foo"], + coords={ + "x": [1, 2], + "y": pd.to_datetime(["2000-01-01", "2000-01-02", "2000-01-03"]), + "z": ["a", "b", "c", "d"], }, + attrs={"global": "global metadata"}, ) - def test_open_zarr(self, transform): - source = xarray.Dataset( - { - "foo": (("x",), np.arange(2), {"local": "local metadata"}), - "bar": (("x", "y"), np.arange(6).reshape(2, 3)), - "baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4)), - }, - coords={ - "x": [1, 2], - "y": pd.to_datetime(["2000-01-01", "2000-01-02", "2000-01-03"]), - "z": ["a", "b", "c", "d"], - }, - attrs={"global": "global metadata"}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - expected = transform(source) - actual = transform(xarray_tensorstore.open_zarr(path)).compute() - xarray.testing.assert_identical(actual, expected) + expected = transform(source) + actual = transform(xarray_tensorstore.open_zarr(path)).compute() + xarray.testing.assert_identical(actual, expected) - @parameterized.parameters( - {"deep": True}, - {"deep": False}, - ) - def test_copy(self, deep): - source = xarray.Dataset({"foo": (("x",), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - copied = opened.copy(deep=deep) - xarray.testing.assert_identical(copied, source) + @parameterized.parameters( + {"deep": True}, + {"deep": False}, + ) + def test_copy(self, deep): + source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + copied = opened.copy(deep=deep) + xarray.testing.assert_identical(copied, source) - def test_sortby(self): - # regression test for https://github.com/google/xarray-tensorstore/issues/1 - x = np.arange(10) - source = xarray.Dataset({"foo": (("x",), x)}, {"x": x[::-1]}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - opened.sortby("x") # should not crash + def test_sortby(self): + # regression test for https://github.com/google/xarray-tensorstore/issues/1 + x = np.arange(10) + source = xarray.Dataset({"foo": (("x",), x)}, {"x": x[::-1]}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + opened.sortby("x") # should not crash - def test_compute(self): - # verify that get_duck_array() is working properly - source = xarray.Dataset({"foo": (("x",), np.arange(10))}) - path = self.create_tempdir().full_path - source.to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - computed = opened.compute() - computed_data = computed["foo"].variable._data - self.assertNotIsInstance(computed_data, tensorstore.TensorStore) + def test_compute(self): + # verify that get_duck_array() is working properly + source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + path = self.create_tempdir().full_path + source.to_zarr(path) + opened = xarray_tensorstore.open_zarr(path) + computed = opened.compute() + computed_data = computed["foo"].variable._data + self.assertNotIsInstance(computed_data, tensorstore.TensorStore) - def test_open_zarr_from_uri(self): - source = xarray.Dataset( - {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))} - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_open_zarr_from_uri(self): + source = xarray.Dataset({"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr("file://" + path) - xarray.testing.assert_identical(source, opened) + opened = xarray_tensorstore.open_zarr("file://" + path) + xarray.testing.assert_identical(source, opened) - def test_read_dataset(self): - source = xarray.Dataset( - {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}, - coords={"x": np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.chunk().to_zarr(path) + def test_read_dataset(self): + source = xarray.Dataset( + {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}, + coords={"x": np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path) - read = xarray_tensorstore.read(opened) + opened = xarray_tensorstore.open_zarr(path) + read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variables["baz"]._data.future) - self.assertIsNotNone(read.variables["baz"]._data.future) - xarray.testing.assert_identical(read, source) + self.assertIsNone(opened.variables["baz"]._data.future) + self.assertIsNotNone(read.variables["baz"]._data.future) + xarray.testing.assert_identical(read, source) - def test_read_dataarray(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=("x", "y", "z"), - name="baz", - coords={"x": np.arange(2)}, - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) + def test_read_dataarray(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=("x", "y", "z"), + name="baz", + coords={"x": np.arange(2)}, + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path)["baz"] - read = xarray_tensorstore.read(opened) + opened = xarray_tensorstore.open_zarr(path)["baz"] + read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variable._data.future) - self.assertIsNotNone(read.variable._data.future) - xarray.testing.assert_identical(read, source) + self.assertIsNone(opened.variable._data.future) + self.assertIsNotNone(read.variable._data.future) + xarray.testing.assert_identical(read, source) - def test_mask_and_scale(self): - source = xarray.DataArray( - np.arange(24).reshape(2, 3, 4), - dims=("x", "y", "z"), - name="baz", - coords={"x": np.arange(2)}, - ) + def test_mask_and_scale(self): + source = xarray.DataArray( + np.arange(24).reshape(2, 3, 4), + dims=("x", "y", "z"), + name="baz", + coords={"x": np.arange(2)}, + ) - # invalid fill-value - source.encoding = {"_FillValue": -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = ( - "variable baz has non-NaN fill value, which is not supported by" - " xarray-tensorstore: -1. Consider re-opening with" - " xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling" - " back to use xarray.open_zarr()." - ) - with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) + # invalid fill-value + source.encoding = {"_FillValue": -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = ( + "variable baz has non-NaN fill value, which is not supported by" + " xarray-tensorstore: -1. Consider re-opening with" + " xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling" + " back to use xarray.open_zarr()." + ) + with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] - xarray.testing.assert_equal(actual, source) # no values are masked + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + xarray.testing.assert_equal(actual, source) # no values are masked - # invalid scaling - source.encoding = {"scale_factor": 10.0} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - expected_msg = "variable baz uses scale/offset encoding" - with self.assertRaisesRegex(ValueError, expected_msg): - xarray_tensorstore.open_zarr(path) + # invalid scaling + source.encoding = {"scale_factor": 10.0} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + expected_msg = "variable baz uses scale/offset encoding" + with self.assertRaisesRegex(ValueError, expected_msg): + xarray_tensorstore.open_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] - self.assertFalse(actual.equals(source)) # not scaled properly + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + self.assertFalse(actual.equals(source)) # not scaled properly - # valid offset (coordinate only) - source.encoding = {} - source.coords["x"].encoding = {"add_offset": -1} - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)["baz"] - xarray.testing.assert_identical(actual, source) - self.assertEqual(actual.coords["x"].encoding["add_offset"], -1) + # valid offset (coordinate only) + source.encoding = {} + source.coords["x"].encoding = {"add_offset": -1} + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)["baz"] + xarray.testing.assert_identical(actual, source) + self.assertEqual(actual.coords["x"].encoding["add_offset"], -1) - @parameterized.named_parameters( - { - "testcase_name": "basic_indexing", - "key": (slice(1, None), slice(None), slice(None)), - "value": np.full((1, 2, 3), -1), - "mode": "r+", - }, - { - "testcase_name": "outer_indexing", - "key": (np.array([0]), np.array([1]), slice(None)), - "value": np.full((1, 1, 3), -2), - "mode": "r+", - }, - { - "testcase_name": "vectorized_indexing", - "key": (np.array([0]), np.array([0, 1]), slice(None)), - "value": np.full((2, 3), -3), - "mode": "r+", - }, - { - "testcase_name": "read_only", - "key": (slice(1, None), slice(None), slice(None)), - "value": np.full((1, 2, 3), -1), - "mode": "r", - }, + @parameterized.named_parameters( + { + "testcase_name": "basic_indexing", + "key": (slice(1, None), slice(None), slice(None)), + "value": np.full((1, 2, 3), -1), + "mode": "r+", + }, + { + "testcase_name": "outer_indexing", + "key": (np.array([0]), np.array([1]), slice(None)), + "value": np.full((1, 1, 3), -2), + "mode": "r+", + }, + { + "testcase_name": "vectorized_indexing", + "key": (np.array([0]), np.array([0, 1]), slice(None)), + "value": np.full((2, 3), -3), + "mode": "r+", + }, + { + "testcase_name": "read_only", + "key": (slice(1, None), slice(None), slice(None)), + "value": np.full((1, 2, 3), -1), + "mode": "r", + }, + ) + def test_setitem(self, key, value, mode): + source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) + source = xarray.DataArray( + source_data, + dims=("x", "y", "z"), + name="baz", ) - def test_setitem(self, key, value, mode): - source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) - source = xarray.DataArray( - source_data, - dims=("x", "y", "z"), - name="baz", - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) - - opened = xarray_tensorstore.open_zarr(path, mode=mode)["baz"] - if mode == "r": - with pytest.raises(ValueError): - opened[key] = value - return + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + opened = xarray_tensorstore.open_zarr(path, mode=mode)["baz"] + if mode == "r": + with pytest.raises(ValueError): opened[key] = value - read = xarray_tensorstore.read(opened) + return - expected_data = source_data.copy() - expected_data[key] = value - expected = xarray.DataArray( - expected_data, - dims=("x", "y", "z"), - name="baz", - ) + opened[key] = value + read = xarray_tensorstore.read(opened) + + expected_data = source_data.copy() + expected_data[key] = value + expected = xarray.DataArray( + expected_data, + dims=("x", "y", "z"), + name="baz", + ) - xarray.testing.assert_equal(read, expected) + xarray.testing.assert_equal(read, expected) if __name__ == "__main__": - absltest.main() + absltest.main() From c651d35342d61b12dbb65c9bee783abab5827c69 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 5 Nov 2024 17:09:54 -0500 Subject: [PATCH 08/17] =?UTF-8?q?fix:=20=F0=9F=90=9B=20Fix=20returning=20o?= =?UTF-8?q?ld=20data=20after=20set=20operation.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore.py | 431 +++++++++++++++++++++--------------------- 1 file changed, 217 insertions(+), 214 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 6dbffe8..9d0f337 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -34,175 +34,176 @@ def _numpy_to_tensorstore_index(index: Index, size: int) -> Index: - """Switch from NumPy to TensorStore indexing conventions.""" - # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing - if index is None: - return None - elif isinstance(index, int): - # Negative integers do not count from the end in TensorStore - return index + size if index < 0 else index - elif isinstance(index, slice): - start = _numpy_to_tensorstore_index(index.start, size) - stop = _numpy_to_tensorstore_index(index.stop, size) - if stop is not None: - # TensorStore does not allow out of bounds slicing - stop = min(stop, size) - return slice(start, stop, index.step) - else: - assert isinstance(index, np.ndarray) - return np.where(index < 0, index + size, index) + """Switch from NumPy to TensorStore indexing conventions.""" + # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing + if index is None: + return None + elif isinstance(index, int): + # Negative integers do not count from the end in TensorStore + return index + size if index < 0 else index + elif isinstance(index, slice): + start = _numpy_to_tensorstore_index(index.start, size) + stop = _numpy_to_tensorstore_index(index.stop, size) + if stop is not None: + # TensorStore does not allow out of bounds slicing + stop = min(stop, size) + return slice(start, stop, index.step) + else: + assert isinstance(index, np.ndarray) + return np.where(index < 0, index + size, index) @dataclasses.dataclass(frozen=True) class _TensorStoreAdapter(indexing.ExplicitlyIndexed): - """TensorStore array that can be wrapped by xarray.Variable. - - We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not - attempt to load our array into memory as a NumPy array. In the future, this - should be supported by public Xarray APIs, as part of the refactor discussed - in: https://github.com/pydata/xarray/issues/3981 - """ - - array: tensorstore.TensorStore - future: Optional[tensorstore.Future] = None - - @property - def shape(self) -> tuple[int, ...]: - return self.array.shape - - @property - def dtype(self) -> np.dtype: - return self.array.dtype.numpy_dtype - - @property - def ndim(self) -> int: - return len(self.shape) - - @property - def size(self) -> int: - return math.prod(self.shape) - - def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - # TODO(shoyer): fix this for newer versions of Xarray. - # We get the error message: - # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' - indexed = self.array.oindex[index_tuple] - elif isinstance(key, indexing.VectorizedIndexer): - indexed = self.array.vindex[index_tuple] - else: - assert isinstance(key, indexing.BasicIndexer) - indexed = self.array[index_tuple] - # Translate to the origin so repeated indexing is relative to the new bounds - # like NumPy, not absolute like TensorStore - translated = indexed[tensorstore.d[:].translate_to[0]] - return type(self)(translated) - - def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - self.array.oindex[index_tuple] = value - elif isinstance(key, indexing.VectorizedIndexer): - self.array.vindex[index_tuple] = value - else: - assert isinstance(key, indexing.BasicIndexer) - self.array[index_tuple] = value - - # xarray>2024.02.0 uses oindex and vindex properties, which are expected to - # return objects whose __getitem__ method supports the appropriate form of - # indexing. - @property - def oindex(self) -> _TensorStoreAdapter: - return self - - @property - def vindex(self) -> _TensorStoreAdapter: - return self - - def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: - transposed = self.array[tensorstore.d[order].transpose[:]] - return type(self)(transposed) - - def read(self) -> _TensorStoreAdapter: - future = self.array.read() - return type(self)(self.array, future) - - def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore - future = self.array.read() if self.future is None else self.future - return np.asarray(future.result(), dtype=dtype) - - def get_duck_array(self): - # special method for xarray to return an in-memory (computed) representation - return np.asarray(self) - - # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, - # which are needed for Xarray: - # https://github.com/google/tensorstore/issues/109 - # TensorStore objects are immutable, so there's no need to actually copy them. - - def __copy__(self) -> _TensorStoreAdapter: - return type(self)(self.array, self.future) - - def __deepcopy__(self, memo) -> _TensorStoreAdapter: - return self.__copy__() + """TensorStore array that can be wrapped by xarray.Variable. + + We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not + attempt to load our array into memory as a NumPy array. In the future, this + should be supported by public Xarray APIs, as part of the refactor discussed + in: https://github.com/pydata/xarray/issues/3981 + """ + + array: tensorstore.TensorStore + future: Optional[tensorstore.Future] = None + + @property + def shape(self) -> tuple[int, ...]: + return self.array.shape + + @property + def dtype(self) -> np.dtype: + return self.array.dtype.numpy_dtype + + @property + def ndim(self) -> int: + return len(self.shape) + + @property + def size(self) -> int: + return math.prod(self.shape) + + def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + # TODO(shoyer): fix this for newer versions of Xarray. + # We get the error message: + # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' + indexed = self.array.oindex[index_tuple] + elif isinstance(key, indexing.VectorizedIndexer): + indexed = self.array.vindex[index_tuple] + else: + assert isinstance(key, indexing.BasicIndexer) + indexed = self.array[index_tuple] + # Translate to the origin so repeated indexing is relative to the new bounds + # like NumPy, not absolute like TensorStore + translated = indexed[tensorstore.d[:].translate_to[0]] + return type(self)(translated) + + def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + self.array.oindex[index_tuple] = value + elif isinstance(key, indexing.VectorizedIndexer): + self.array.vindex[index_tuple] = value + else: + assert isinstance(key, indexing.BasicIndexer) + self.array[index_tuple] = value + self.future = None + + # xarray>2024.02.0 uses oindex and vindex properties, which are expected to + # return objects whose __getitem__ method supports the appropriate form of + # indexing. + @property + def oindex(self) -> _TensorStoreAdapter: + return self + + @property + def vindex(self) -> _TensorStoreAdapter: + return self + + def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: + transposed = self.array[tensorstore.d[order].transpose[:]] + return type(self)(transposed) + + def read(self) -> _TensorStoreAdapter: + future = self.array.read() + return type(self)(self.array, future) + + def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore + future = self.array.read() if self.future is None else self.future + return np.asarray(future.result(), dtype=dtype) + + def get_duck_array(self): + # special method for xarray to return an in-memory (computed) representation + return np.asarray(self) + + # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, + # which are needed for Xarray: + # https://github.com/google/tensorstore/issues/109 + # TensorStore objects are immutable, so there's no need to actually copy them. + + def __copy__(self) -> _TensorStoreAdapter: + return type(self)(self.array, self.future) + + def __deepcopy__(self, memo) -> _TensorStoreAdapter: + return self.__copy__() def _read_tensorstore( array: indexing.ExplicitlyIndexed, ) -> indexing.ExplicitlyIndexed: - """Starts async reading on a TensorStore array.""" - return array.read() if isinstance(array, _TensorStoreAdapter) else array + """Starts async reading on a TensorStore array.""" + return array.read() if isinstance(array, _TensorStoreAdapter) else array def read(xarraydata: XarrayData, /) -> XarrayData: - """Starts async reads on all TensorStore arrays.""" - # pylint: disable=protected-access - if isinstance(xarraydata, xarray.Dataset): - data = { - name: _read_tensorstore(var.variable._data) - for name, var in xarraydata.data_vars.items() - } - elif isinstance(xarraydata, xarray.DataArray): - data = _read_tensorstore(xarraydata.variable._data) - else: - raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") - # pylint: enable=protected-access - return xarraydata.copy(data=data) + """Starts async reads on all TensorStore arrays.""" + # pylint: disable=protected-access + if isinstance(xarraydata, xarray.Dataset): + data = { + name: _read_tensorstore(var.variable._data) + for name, var in xarraydata.data_vars.items() + } + elif isinstance(xarraydata, xarray.DataArray): + data = _read_tensorstore(xarraydata.variable._data) + else: + raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") + # pylint: enable=protected-access + return xarraydata.copy(data=data) _DEFAULT_STORAGE_DRIVER = "file" def _zarr_spec_from_path(path: str) -> ...: - if re.match(r"\w+\://", path): # path is a URI - kv_store = path - else: - kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} - return {"driver": "zarr", "kvstore": kv_store} + if re.match(r"\w+\://", path): # path is a URI + kv_store = path + else: + kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} + return {"driver": "zarr", "kvstore": kv_store} def _raise_if_mask_and_scale_used_for_data_vars(ds: xarray.Dataset): - """Check a dataset for data variables that would need masking or scaling.""" - advice = ( - "Consider re-opening with xarray_tensorstore.open_zarr(..., " - "mask_and_scale=False), or falling back to use xarray.open_zarr()." - ) - for k in ds: - encoding = ds[k].encoding - for attr in ["_FillValue", "missing_value"]: - fill_value = encoding.get(attr, np.nan) - if fill_value == fill_value: # pylint: disable=comparison-with-itself - raise ValueError( - f"variable {k} has non-NaN fill value, which is not supported by" - f" xarray-tensorstore: {fill_value}. {advice}" - ) - for attr in ["scale_factor", "add_offset"]: - if attr in encoding: - raise ValueError( - f"variable {k} uses scale/offset encoding, which is not supported" - f" by xarray-tensorstore: {encoding}. {advice}" - ) + """Check a dataset for data variables that would need masking or scaling.""" + advice = ( + "Consider re-opening with xarray_tensorstore.open_zarr(..., " + "mask_and_scale=False), or falling back to use xarray.open_zarr()." + ) + for k in ds: + encoding = ds[k].encoding + for attr in ["_FillValue", "missing_value"]: + fill_value = encoding.get(attr, np.nan) + if fill_value == fill_value: # pylint: disable=comparison-with-itself + raise ValueError( + f"variable {k} has non-NaN fill value, which is not supported by" + f" xarray-tensorstore: {fill_value}. {advice}" + ) + for attr in ["scale_factor", "add_offset"]: + if attr in encoding: + raise ValueError( + f"variable {k} uses scale/offset encoding, which is not supported" + f" by xarray-tensorstore: {encoding}. {advice}" + ) def open_zarr( @@ -212,69 +213,71 @@ def open_zarr( mask_and_scale: bool = True, mode: str = "r", ) -> xarray.Dataset: - """Open an xarray.Dataset from Zarr using TensorStore. - - For best performance, explicitly call `read()` to asynchronously load data - in parallel. Otherwise, xarray's `.compute()` method will load each variable's - data in sequence. - - Example usage: - - import xarray_tensorstore - - ds = xarray_tensorstore.open_zarr(path) - - # indexing & transposing is lazy - example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) - - # start reading data asynchronously - read_example = xarray_tensorstore.read(example) - - # blocking conversion of the data into NumPy arrays - numpy_example = read_example.compute() - - Args: - path: path or URI to Zarr group to open. - context: TensorStore configuration options to use when opening arrays. - mask_and_scale: if True (default), attempt to apply masking and scaling like - xarray.open_zarr(). This is only supported for coordinate variables and - otherwise will raise an error. - mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). - - Returns: - Dataset with all data variables opened via TensorStore. - """ - # We use xarray.open_zarr (which uses Zarr Python internally) to open the - # initial version of the dataset for a few reasons: - # 1. TensorStore does not support Zarr groups or array attributes, which we - # need to open in the xarray.Dataset. We use Zarr Python instead of - # parsing the raw Zarr metadata files ourselves. - # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. - # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric - # to datetime64 dtypes) that we would otherwise need to invoke explicitly - # via xarray.decode_cf(). - # - # Fortunately (2) and (3) are most commonly encountered on small coordinate - # arrays, for which the performance advantages of TensorStore are irrelevant. - - if context is None: - context = tensorstore.Context() - - # chunks=None means avoid using dask - ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) - - if mask_and_scale: - # Data variables get replaced below with _TensorStoreAdapter arrays, which - # don't get masked or scaled. Raising an error avoids surprising users with - # incorrect data values. - _raise_if_mask_and_scale_used_for_data_vars(ds) - - specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} - array_futures = { - k: tensorstore.open(spec, read=True, write=mode.lower() == "r+", context=context) - for k, spec in specs.items() - } - arrays = {k: v.result() for k, v in array_futures.items()} - new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} - - return ds.copy(data=new_data) + """Open an xarray.Dataset from Zarr using TensorStore. + + For best performance, explicitly call `read()` to asynchronously load data + in parallel. Otherwise, xarray's `.compute()` method will load each variable's + data in sequence. + + Example usage: + + import xarray_tensorstore + + ds = xarray_tensorstore.open_zarr(path) + + # indexing & transposing is lazy + example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) + + # start reading data asynchronously + read_example = xarray_tensorstore.read(example) + + # blocking conversion of the data into NumPy arrays + numpy_example = read_example.compute() + + Args: + path: path or URI to Zarr group to open. + context: TensorStore configuration options to use when opening arrays. + mask_and_scale: if True (default), attempt to apply masking and scaling like + xarray.open_zarr(). This is only supported for coordinate variables and + otherwise will raise an error. + mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). + + Returns: + Dataset with all data variables opened via TensorStore. + """ + # We use xarray.open_zarr (which uses Zarr Python internally) to open the + # initial version of the dataset for a few reasons: + # 1. TensorStore does not support Zarr groups or array attributes, which we + # need to open in the xarray.Dataset. We use Zarr Python instead of + # parsing the raw Zarr metadata files ourselves. + # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. + # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric + # to datetime64 dtypes) that we would otherwise need to invoke explicitly + # via xarray.decode_cf(). + # + # Fortunately (2) and (3) are most commonly encountered on small coordinate + # arrays, for which the performance advantages of TensorStore are irrelevant. + + if context is None: + context = tensorstore.Context() + + # chunks=None means avoid using dask + ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) + + if mask_and_scale: + # Data variables get replaced below with _TensorStoreAdapter arrays, which + # don't get masked or scaled. Raising an error avoids surprising users with + # incorrect data values. + _raise_if_mask_and_scale_used_for_data_vars(ds) + + specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} + array_futures = { + k: tensorstore.open( + spec, read=True, write=mode.lower() == "r+", context=context + ) + for k, spec in specs.items() + } + arrays = {k: v.result() for k, v in array_futures.items()} + new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} + + return ds.copy(data=new_data) From ead69d1612431b4aac90c0c1db74c9695dcc88da Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Tue, 5 Nov 2024 18:02:42 -0500 Subject: [PATCH 09/17] =?UTF-8?q?Revert=20"fix:=20=F0=9F=90=9B=20Fix=20ret?= =?UTF-8?q?urning=20old=20data=20after=20set=20operation."?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit c651d35342d61b12dbb65c9bee783abab5827c69. --- xarray_tensorstore.py | 431 +++++++++++++++++++++--------------------- 1 file changed, 214 insertions(+), 217 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 9d0f337..6dbffe8 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -34,176 +34,175 @@ def _numpy_to_tensorstore_index(index: Index, size: int) -> Index: - """Switch from NumPy to TensorStore indexing conventions.""" - # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing - if index is None: - return None - elif isinstance(index, int): - # Negative integers do not count from the end in TensorStore - return index + size if index < 0 else index - elif isinstance(index, slice): - start = _numpy_to_tensorstore_index(index.start, size) - stop = _numpy_to_tensorstore_index(index.stop, size) - if stop is not None: - # TensorStore does not allow out of bounds slicing - stop = min(stop, size) - return slice(start, stop, index.step) - else: - assert isinstance(index, np.ndarray) - return np.where(index < 0, index + size, index) + """Switch from NumPy to TensorStore indexing conventions.""" + # https://google.github.io/tensorstore/python/indexing.html#differences-compared-to-numpy-indexing + if index is None: + return None + elif isinstance(index, int): + # Negative integers do not count from the end in TensorStore + return index + size if index < 0 else index + elif isinstance(index, slice): + start = _numpy_to_tensorstore_index(index.start, size) + stop = _numpy_to_tensorstore_index(index.stop, size) + if stop is not None: + # TensorStore does not allow out of bounds slicing + stop = min(stop, size) + return slice(start, stop, index.step) + else: + assert isinstance(index, np.ndarray) + return np.where(index < 0, index + size, index) @dataclasses.dataclass(frozen=True) class _TensorStoreAdapter(indexing.ExplicitlyIndexed): - """TensorStore array that can be wrapped by xarray.Variable. - - We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not - attempt to load our array into memory as a NumPy array. In the future, this - should be supported by public Xarray APIs, as part of the refactor discussed - in: https://github.com/pydata/xarray/issues/3981 - """ - - array: tensorstore.TensorStore - future: Optional[tensorstore.Future] = None - - @property - def shape(self) -> tuple[int, ...]: - return self.array.shape - - @property - def dtype(self) -> np.dtype: - return self.array.dtype.numpy_dtype - - @property - def ndim(self) -> int: - return len(self.shape) - - @property - def size(self) -> int: - return math.prod(self.shape) - - def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - # TODO(shoyer): fix this for newer versions of Xarray. - # We get the error message: - # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' - indexed = self.array.oindex[index_tuple] - elif isinstance(key, indexing.VectorizedIndexer): - indexed = self.array.vindex[index_tuple] - else: - assert isinstance(key, indexing.BasicIndexer) - indexed = self.array[index_tuple] - # Translate to the origin so repeated indexing is relative to the new bounds - # like NumPy, not absolute like TensorStore - translated = indexed[tensorstore.d[:].translate_to[0]] - return type(self)(translated) - - def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: - index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) - if isinstance(key, indexing.OuterIndexer): - self.array.oindex[index_tuple] = value - elif isinstance(key, indexing.VectorizedIndexer): - self.array.vindex[index_tuple] = value - else: - assert isinstance(key, indexing.BasicIndexer) - self.array[index_tuple] = value - self.future = None - - # xarray>2024.02.0 uses oindex and vindex properties, which are expected to - # return objects whose __getitem__ method supports the appropriate form of - # indexing. - @property - def oindex(self) -> _TensorStoreAdapter: - return self - - @property - def vindex(self) -> _TensorStoreAdapter: - return self - - def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: - transposed = self.array[tensorstore.d[order].transpose[:]] - return type(self)(transposed) - - def read(self) -> _TensorStoreAdapter: - future = self.array.read() - return type(self)(self.array, future) - - def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore - future = self.array.read() if self.future is None else self.future - return np.asarray(future.result(), dtype=dtype) - - def get_duck_array(self): - # special method for xarray to return an in-memory (computed) representation - return np.asarray(self) - - # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, - # which are needed for Xarray: - # https://github.com/google/tensorstore/issues/109 - # TensorStore objects are immutable, so there's no need to actually copy them. - - def __copy__(self) -> _TensorStoreAdapter: - return type(self)(self.array, self.future) - - def __deepcopy__(self, memo) -> _TensorStoreAdapter: - return self.__copy__() + """TensorStore array that can be wrapped by xarray.Variable. + + We use Xarray's semi-internal ExplicitlyIndexed API so that Xarray will not + attempt to load our array into memory as a NumPy array. In the future, this + should be supported by public Xarray APIs, as part of the refactor discussed + in: https://github.com/pydata/xarray/issues/3981 + """ + + array: tensorstore.TensorStore + future: Optional[tensorstore.Future] = None + + @property + def shape(self) -> tuple[int, ...]: + return self.array.shape + + @property + def dtype(self) -> np.dtype: + return self.array.dtype.numpy_dtype + + @property + def ndim(self) -> int: + return len(self.shape) + + @property + def size(self) -> int: + return math.prod(self.shape) + + def __getitem__(self, key: indexing.ExplicitIndexer) -> _TensorStoreAdapter: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + # TODO(shoyer): fix this for newer versions of Xarray. + # We get the error message: + # AttributeError: '_TensorStoreAdapter' object has no attribute 'oindex' + indexed = self.array.oindex[index_tuple] + elif isinstance(key, indexing.VectorizedIndexer): + indexed = self.array.vindex[index_tuple] + else: + assert isinstance(key, indexing.BasicIndexer) + indexed = self.array[index_tuple] + # Translate to the origin so repeated indexing is relative to the new bounds + # like NumPy, not absolute like TensorStore + translated = indexed[tensorstore.d[:].translate_to[0]] + return type(self)(translated) + + def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: + index_tuple = tuple(map(_numpy_to_tensorstore_index, key.tuple, self.shape)) + if isinstance(key, indexing.OuterIndexer): + self.array.oindex[index_tuple] = value + elif isinstance(key, indexing.VectorizedIndexer): + self.array.vindex[index_tuple] = value + else: + assert isinstance(key, indexing.BasicIndexer) + self.array[index_tuple] = value + + # xarray>2024.02.0 uses oindex and vindex properties, which are expected to + # return objects whose __getitem__ method supports the appropriate form of + # indexing. + @property + def oindex(self) -> _TensorStoreAdapter: + return self + + @property + def vindex(self) -> _TensorStoreAdapter: + return self + + def transpose(self, order: tuple[int, ...]) -> _TensorStoreAdapter: + transposed = self.array[tensorstore.d[order].transpose[:]] + return type(self)(transposed) + + def read(self) -> _TensorStoreAdapter: + future = self.array.read() + return type(self)(self.array, future) + + def __array__(self, dtype: Optional[np.dtype] = None) -> np.ndarray: # type: ignore + future = self.array.read() if self.future is None else self.future + return np.asarray(future.result(), dtype=dtype) + + def get_duck_array(self): + # special method for xarray to return an in-memory (computed) representation + return np.asarray(self) + + # Work around the missing __copy__ and __deepcopy__ methods from TensorStore, + # which are needed for Xarray: + # https://github.com/google/tensorstore/issues/109 + # TensorStore objects are immutable, so there's no need to actually copy them. + + def __copy__(self) -> _TensorStoreAdapter: + return type(self)(self.array, self.future) + + def __deepcopy__(self, memo) -> _TensorStoreAdapter: + return self.__copy__() def _read_tensorstore( array: indexing.ExplicitlyIndexed, ) -> indexing.ExplicitlyIndexed: - """Starts async reading on a TensorStore array.""" - return array.read() if isinstance(array, _TensorStoreAdapter) else array + """Starts async reading on a TensorStore array.""" + return array.read() if isinstance(array, _TensorStoreAdapter) else array def read(xarraydata: XarrayData, /) -> XarrayData: - """Starts async reads on all TensorStore arrays.""" - # pylint: disable=protected-access - if isinstance(xarraydata, xarray.Dataset): - data = { - name: _read_tensorstore(var.variable._data) - for name, var in xarraydata.data_vars.items() - } - elif isinstance(xarraydata, xarray.DataArray): - data = _read_tensorstore(xarraydata.variable._data) - else: - raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") - # pylint: enable=protected-access - return xarraydata.copy(data=data) + """Starts async reads on all TensorStore arrays.""" + # pylint: disable=protected-access + if isinstance(xarraydata, xarray.Dataset): + data = { + name: _read_tensorstore(var.variable._data) + for name, var in xarraydata.data_vars.items() + } + elif isinstance(xarraydata, xarray.DataArray): + data = _read_tensorstore(xarraydata.variable._data) + else: + raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") + # pylint: enable=protected-access + return xarraydata.copy(data=data) _DEFAULT_STORAGE_DRIVER = "file" def _zarr_spec_from_path(path: str) -> ...: - if re.match(r"\w+\://", path): # path is a URI - kv_store = path - else: - kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} - return {"driver": "zarr", "kvstore": kv_store} + if re.match(r"\w+\://", path): # path is a URI + kv_store = path + else: + kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} + return {"driver": "zarr", "kvstore": kv_store} def _raise_if_mask_and_scale_used_for_data_vars(ds: xarray.Dataset): - """Check a dataset for data variables that would need masking or scaling.""" - advice = ( - "Consider re-opening with xarray_tensorstore.open_zarr(..., " - "mask_and_scale=False), or falling back to use xarray.open_zarr()." - ) - for k in ds: - encoding = ds[k].encoding - for attr in ["_FillValue", "missing_value"]: - fill_value = encoding.get(attr, np.nan) - if fill_value == fill_value: # pylint: disable=comparison-with-itself - raise ValueError( - f"variable {k} has non-NaN fill value, which is not supported by" - f" xarray-tensorstore: {fill_value}. {advice}" - ) - for attr in ["scale_factor", "add_offset"]: - if attr in encoding: - raise ValueError( - f"variable {k} uses scale/offset encoding, which is not supported" - f" by xarray-tensorstore: {encoding}. {advice}" - ) + """Check a dataset for data variables that would need masking or scaling.""" + advice = ( + "Consider re-opening with xarray_tensorstore.open_zarr(..., " + "mask_and_scale=False), or falling back to use xarray.open_zarr()." + ) + for k in ds: + encoding = ds[k].encoding + for attr in ["_FillValue", "missing_value"]: + fill_value = encoding.get(attr, np.nan) + if fill_value == fill_value: # pylint: disable=comparison-with-itself + raise ValueError( + f"variable {k} has non-NaN fill value, which is not supported by" + f" xarray-tensorstore: {fill_value}. {advice}" + ) + for attr in ["scale_factor", "add_offset"]: + if attr in encoding: + raise ValueError( + f"variable {k} uses scale/offset encoding, which is not supported" + f" by xarray-tensorstore: {encoding}. {advice}" + ) def open_zarr( @@ -213,71 +212,69 @@ def open_zarr( mask_and_scale: bool = True, mode: str = "r", ) -> xarray.Dataset: - """Open an xarray.Dataset from Zarr using TensorStore. - - For best performance, explicitly call `read()` to asynchronously load data - in parallel. Otherwise, xarray's `.compute()` method will load each variable's - data in sequence. - - Example usage: - - import xarray_tensorstore - - ds = xarray_tensorstore.open_zarr(path) - - # indexing & transposing is lazy - example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) - - # start reading data asynchronously - read_example = xarray_tensorstore.read(example) - - # blocking conversion of the data into NumPy arrays - numpy_example = read_example.compute() - - Args: - path: path or URI to Zarr group to open. - context: TensorStore configuration options to use when opening arrays. - mask_and_scale: if True (default), attempt to apply masking and scaling like - xarray.open_zarr(). This is only supported for coordinate variables and - otherwise will raise an error. - mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). - - Returns: - Dataset with all data variables opened via TensorStore. - """ - # We use xarray.open_zarr (which uses Zarr Python internally) to open the - # initial version of the dataset for a few reasons: - # 1. TensorStore does not support Zarr groups or array attributes, which we - # need to open in the xarray.Dataset. We use Zarr Python instead of - # parsing the raw Zarr metadata files ourselves. - # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. - # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric - # to datetime64 dtypes) that we would otherwise need to invoke explicitly - # via xarray.decode_cf(). - # - # Fortunately (2) and (3) are most commonly encountered on small coordinate - # arrays, for which the performance advantages of TensorStore are irrelevant. - - if context is None: - context = tensorstore.Context() - - # chunks=None means avoid using dask - ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) - - if mask_and_scale: - # Data variables get replaced below with _TensorStoreAdapter arrays, which - # don't get masked or scaled. Raising an error avoids surprising users with - # incorrect data values. - _raise_if_mask_and_scale_used_for_data_vars(ds) - - specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} - array_futures = { - k: tensorstore.open( - spec, read=True, write=mode.lower() == "r+", context=context - ) - for k, spec in specs.items() - } - arrays = {k: v.result() for k, v in array_futures.items()} - new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} - - return ds.copy(data=new_data) + """Open an xarray.Dataset from Zarr using TensorStore. + + For best performance, explicitly call `read()` to asynchronously load data + in parallel. Otherwise, xarray's `.compute()` method will load each variable's + data in sequence. + + Example usage: + + import xarray_tensorstore + + ds = xarray_tensorstore.open_zarr(path) + + # indexing & transposing is lazy + example = ds.sel(time='2020-01-01').transpose('longitude', 'latitude', ...) + + # start reading data asynchronously + read_example = xarray_tensorstore.read(example) + + # blocking conversion of the data into NumPy arrays + numpy_example = read_example.compute() + + Args: + path: path or URI to Zarr group to open. + context: TensorStore configuration options to use when opening arrays. + mask_and_scale: if True (default), attempt to apply masking and scaling like + xarray.open_zarr(). This is only supported for coordinate variables and + otherwise will raise an error. + mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). + + Returns: + Dataset with all data variables opened via TensorStore. + """ + # We use xarray.open_zarr (which uses Zarr Python internally) to open the + # initial version of the dataset for a few reasons: + # 1. TensorStore does not support Zarr groups or array attributes, which we + # need to open in the xarray.Dataset. We use Zarr Python instead of + # parsing the raw Zarr metadata files ourselves. + # 2. TensorStore doesn't support non-standard Zarr dtypes like UTF-8 strings. + # 3. Xarray's open_zarr machinery does some pre-processing (e.g., from numeric + # to datetime64 dtypes) that we would otherwise need to invoke explicitly + # via xarray.decode_cf(). + # + # Fortunately (2) and (3) are most commonly encountered on small coordinate + # arrays, for which the performance advantages of TensorStore are irrelevant. + + if context is None: + context = tensorstore.Context() + + # chunks=None means avoid using dask + ds = xarray.open_zarr(path, chunks=None, mask_and_scale=mask_and_scale) + + if mask_and_scale: + # Data variables get replaced below with _TensorStoreAdapter arrays, which + # don't get masked or scaled. Raising an error avoids surprising users with + # incorrect data values. + _raise_if_mask_and_scale_used_for_data_vars(ds) + + specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} + array_futures = { + k: tensorstore.open(spec, read=True, write=mode.lower() == "r+", context=context) + for k, spec in specs.items() + } + arrays = {k: v.result() for k, v in array_futures.items()} + new_data = {k: _TensorStoreAdapter(v) for k, v in arrays.items()} + + return ds.copy(data=new_data) From eaa72ec24949c16116a05df11318c8276dddd986 Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:05:09 -0500 Subject: [PATCH 10/17] Update xarray_tensorstore.py Redo change without auto-Black formatting --- xarray_tensorstore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 6dbffe8..fa8f0e4 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -108,6 +108,7 @@ def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: else: assert isinstance(key, indexing.BasicIndexer) self.array[index_tuple] = value + self.future = None # xarray>2024.02.0 uses oindex and vindex properties, which are expected to # return objects whose __getitem__ method supports the appropriate form of From e1d336c3c3552e38b420e893a63ca990fab885ac Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 6 Nov 2024 10:50:57 -0500 Subject: [PATCH 11/17] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Return=20original?= =?UTF-8?q?=20single=20quotes=20style=20formatting?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- conftest.py | 2 +- xarray_tensorstore.py | 36 ++++---- xarray_tensorstore_test.py | 184 ++++++++++++++++++------------------- 3 files changed, 111 insertions(+), 111 deletions(-) diff --git a/conftest.py b/conftest.py index c8ed6d3..af7de76 100644 --- a/conftest.py +++ b/conftest.py @@ -17,4 +17,4 @@ try: app.run(lambda argv: None) except SystemExit: - pass \ No newline at end of file + pass diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index fa8f0e4..321bb61 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -26,11 +26,11 @@ from xarray.core import indexing -__version__ = "0.1.4" # keep in sync with setup.py +__version__ = '0.1.4' # keep in sync with setup.py -Index = TypeVar("Index", int, slice, np.ndarray, None) -XarrayData = TypeVar("XarrayData", xarray.Dataset, xarray.DataArray) +Index = TypeVar('Index', int, slice, np.ndarray, None) +XarrayData = TypeVar('XarrayData', xarray.Dataset, xarray.DataArray) def _numpy_to_tensorstore_index(index: Index, size: int) -> Index: @@ -167,42 +167,42 @@ def read(xarraydata: XarrayData, /) -> XarrayData: elif isinstance(xarraydata, xarray.DataArray): data = _read_tensorstore(xarraydata.variable._data) else: - raise TypeError(f"argument is not a DataArray or Dataset: {xarraydata}") + raise TypeError(f'argument is not a DataArray or Dataset: {xarraydata}') # pylint: enable=protected-access return xarraydata.copy(data=data) -_DEFAULT_STORAGE_DRIVER = "file" +_DEFAULT_STORAGE_DRIVER = 'file' def _zarr_spec_from_path(path: str) -> ...: - if re.match(r"\w+\://", path): # path is a URI + if re.match(r'\w+\://', path): # path is a URI kv_store = path else: - kv_store = {"driver": _DEFAULT_STORAGE_DRIVER, "path": path} - return {"driver": "zarr", "kvstore": kv_store} + kv_store = {'driver': _DEFAULT_STORAGE_DRIVER, 'path': path} + return {'driver': 'zarr', 'kvstore': kv_store} def _raise_if_mask_and_scale_used_for_data_vars(ds: xarray.Dataset): """Check a dataset for data variables that would need masking or scaling.""" advice = ( - "Consider re-opening with xarray_tensorstore.open_zarr(..., " - "mask_and_scale=False), or falling back to use xarray.open_zarr()." + 'Consider re-opening with xarray_tensorstore.open_zarr(..., ' + 'mask_and_scale=False), or falling back to use xarray.open_zarr().' ) for k in ds: encoding = ds[k].encoding - for attr in ["_FillValue", "missing_value"]: + for attr in ['_FillValue', 'missing_value']: fill_value = encoding.get(attr, np.nan) if fill_value == fill_value: # pylint: disable=comparison-with-itself raise ValueError( - f"variable {k} has non-NaN fill value, which is not supported by" - f" xarray-tensorstore: {fill_value}. {advice}" + f'variable {k} has non-NaN fill value, which is not supported by' + f' xarray-tensorstore: {fill_value}. {advice}' ) - for attr in ["scale_factor", "add_offset"]: + for attr in ['scale_factor', 'add_offset']: if attr in encoding: raise ValueError( - f"variable {k} uses scale/offset encoding, which is not supported" - f" by xarray-tensorstore: {encoding}. {advice}" + f'variable {k} uses scale/offset encoding, which is not supported' + f' by xarray-tensorstore: {encoding}. {advice}' ) @@ -211,7 +211,7 @@ def open_zarr( *, context: tensorstore.Context | None = None, mask_and_scale: bool = True, - mode: str = "r", + mode: str = 'r', ) -> xarray.Dataset: """Open an xarray.Dataset from Zarr using TensorStore. @@ -272,7 +272,7 @@ def open_zarr( specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} array_futures = { - k: tensorstore.open(spec, read=True, write=mode.lower() == "r+", context=context) + k: tensorstore.open(spec, read=True, write=mode.lower() == 'r+', context=context) for k, spec in specs.items() } arrays = {k: v.result() for k, v in array_futures.items()} diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 879db19..8817650 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -1,13 +1,13 @@ # Copyright 2023 Google LLC # -# Licensed under the Apache License, Version 2.0 (the "License"); +# Licensed under the Apache License, Version 2.0 (the 'License'); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, +# distributed under the License is distributed on an 'AS IS' BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. @@ -28,76 +28,76 @@ class XarrayTensorstoreTest(parameterized.TestCase): # TODO(shoyer): consider using hypothesis to convert these into # property-based tests { - "testcase_name": "base", - "transform": lambda ds: ds, + 'testcase_name': 'base', + 'transform': lambda ds: ds, }, { - "testcase_name": "transposed", - "transform": lambda ds: ds.transpose("z", "x", "y"), + 'testcase_name': 'transposed', + 'transform': lambda ds: ds.transpose('z', 'x', 'y'), }, { - "testcase_name": "basic_int", - "transform": lambda ds: ds.isel(y=1), + 'testcase_name': 'basic_int', + 'transform': lambda ds: ds.isel(y=1), }, { - "testcase_name": "negative_int", - "transform": lambda ds: ds.isel(y=-1), + 'testcase_name': 'negative_int', + 'transform': lambda ds: ds.isel(y=-1), }, { - "testcase_name": "basic_slice", - "transform": lambda ds: ds.isel(z=slice(2)), + 'testcase_name': 'basic_slice', + 'transform': lambda ds: ds.isel(z=slice(2)), }, { - "testcase_name": "full_slice", - "transform": lambda ds: ds.isel(z=slice(0, 4)), + 'testcase_name': 'full_slice', + 'transform': lambda ds: ds.isel(z=slice(0, 4)), }, { - "testcase_name": "out_of_bounds_slice", - "transform": lambda ds: ds.isel(z=slice(0, 10)), + 'testcase_name': 'out_of_bounds_slice', + 'transform': lambda ds: ds.isel(z=slice(0, 10)), }, { - "testcase_name": "strided_slice", - "transform": lambda ds: ds.isel(z=slice(0, None, 2)), + 'testcase_name': 'strided_slice', + 'transform': lambda ds: ds.isel(z=slice(0, None, 2)), }, { - "testcase_name": "negative_stride_slice", - "transform": lambda ds: ds.isel(z=slice(None, None, -1)), + 'testcase_name': 'negative_stride_slice', + 'transform': lambda ds: ds.isel(z=slice(None, None, -1)), }, { - "testcase_name": "repeated_indexing", - "transform": lambda ds: ds.isel(z=slice(1, None)).isel(z=0), + 'testcase_name': 'repeated_indexing', + 'transform': lambda ds: ds.isel(z=slice(1, None)).isel(z=0), }, { - "testcase_name": "oindex", + 'testcase_name': 'oindex', # includes repeated, negative and out of order indices - "transform": lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), + 'transform': lambda ds: ds.isel(x=[0], y=[1, 1], z=[1, -1, 0]), }, { - "testcase_name": "vindex", - "transform": lambda ds: ds.isel(x=("w", [0, 1]), y=("w", [1, 2])), + 'testcase_name': 'vindex', + 'transform': lambda ds: ds.isel(x=('w', [0, 1]), y=('w', [1, 2])), }, { - "testcase_name": "mixed_indexing_types", - "transform": lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), + 'testcase_name': 'mixed_indexing_types', + 'transform': lambda ds: ds.isel(x=0, y=slice(2), z=[-1]), }, { - "testcase_name": "select_a_variable", - "transform": lambda ds: ds["foo"], + 'testcase_name': 'select_a_variable', + 'transform': lambda ds: ds['foo'], }, ) def test_open_zarr(self, transform): source = xarray.Dataset( { - "foo": (("x",), np.arange(2), {"local": "local metadata"}), - "bar": (("x", "y"), np.arange(6).reshape(2, 3)), - "baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4)), + 'foo': (('x',), np.arange(2), {'local': 'local metadata'}), + 'bar': (('x', 'y'), np.arange(6).reshape(2, 3)), + 'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4)), }, coords={ - "x": [1, 2], - "y": pd.to_datetime(["2000-01-01", "2000-01-02", "2000-01-03"]), - "z": ["a", "b", "c", "d"], + 'x': [1, 2], + 'y': pd.to_datetime(['2000-01-01', '2000-01-02', '2000-01-03']), + 'z': ['a', 'b', 'c', 'd'], }, - attrs={"global": "global metadata"}, + attrs={'global': 'global metadata'}, ) path = self.create_tempdir().full_path source.chunk().to_zarr(path) @@ -107,11 +107,11 @@ def test_open_zarr(self, transform): xarray.testing.assert_identical(actual, expected) @parameterized.parameters( - {"deep": True}, - {"deep": False}, + {'deep': True}, + {'deep': False}, ) def test_copy(self, deep): - source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + source = xarray.Dataset({'foo': (('x',), np.arange(10))}) path = self.create_tempdir().full_path source.to_zarr(path) opened = xarray_tensorstore.open_zarr(path) @@ -121,34 +121,34 @@ def test_copy(self, deep): def test_sortby(self): # regression test for https://github.com/google/xarray-tensorstore/issues/1 x = np.arange(10) - source = xarray.Dataset({"foo": (("x",), x)}, {"x": x[::-1]}) + source = xarray.Dataset({'foo': (('x',), x)}, {'x': x[::-1]}) path = self.create_tempdir().full_path source.to_zarr(path) opened = xarray_tensorstore.open_zarr(path) - opened.sortby("x") # should not crash + opened.sortby('x') # should not crash def test_compute(self): # verify that get_duck_array() is working properly - source = xarray.Dataset({"foo": (("x",), np.arange(10))}) + source = xarray.Dataset({'foo': (('x',), np.arange(10))}) path = self.create_tempdir().full_path source.to_zarr(path) opened = xarray_tensorstore.open_zarr(path) computed = opened.compute() - computed_data = computed["foo"].variable._data + computed_data = computed['foo'].variable._data self.assertNotIsInstance(computed_data, tensorstore.TensorStore) def test_open_zarr_from_uri(self): - source = xarray.Dataset({"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}) + source = xarray.Dataset({'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}) path = self.create_tempdir().full_path source.chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr("file://" + path) + opened = xarray_tensorstore.open_zarr('file://' + path) xarray.testing.assert_identical(source, opened) def test_read_dataset(self): source = xarray.Dataset( - {"baz": (("x", "y", "z"), np.arange(24).reshape(2, 3, 4))}, - coords={"x": np.arange(2)}, + {'baz': (('x', 'y', 'z'), np.arange(24).reshape(2, 3, 4))}, + coords={'x': np.arange(2)}, ) path = self.create_tempdir().full_path source.chunk().to_zarr(path) @@ -156,21 +156,21 @@ def test_read_dataset(self): opened = xarray_tensorstore.open_zarr(path) read = xarray_tensorstore.read(opened) - self.assertIsNone(opened.variables["baz"]._data.future) - self.assertIsNotNone(read.variables["baz"]._data.future) + self.assertIsNone(opened.variables['baz']._data.future) + self.assertIsNotNone(read.variables['baz']._data.future) xarray.testing.assert_identical(read, source) def test_read_dataarray(self): source = xarray.DataArray( np.arange(24).reshape(2, 3, 4), - dims=("x", "y", "z"), - name="baz", - coords={"x": np.arange(2)}, + dims=('x', 'y', 'z'), + name='baz', + coords={'x': np.arange(2)}, ) path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path)["baz"] + opened = xarray_tensorstore.open_zarr(path)['baz'] read = xarray_tensorstore.read(opened) self.assertIsNone(opened.variable._data.future) @@ -180,85 +180,85 @@ def test_read_dataarray(self): def test_mask_and_scale(self): source = xarray.DataArray( np.arange(24).reshape(2, 3, 4), - dims=("x", "y", "z"), - name="baz", - coords={"x": np.arange(2)}, + dims=('x', 'y', 'z'), + name='baz', + coords={'x': np.arange(2)}, ) # invalid fill-value - source.encoding = {"_FillValue": -1} + source.encoding = {'_FillValue': -1} path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) expected_msg = ( - "variable baz has non-NaN fill value, which is not supported by" - " xarray-tensorstore: -1. Consider re-opening with" - " xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling" - " back to use xarray.open_zarr()." + 'variable baz has non-NaN fill value, which is not supported by' + ' xarray-tensorstore: -1. Consider re-opening with' + ' xarray_tensorstore.open_zarr(..., mask_and_scale=False), or falling' + ' back to use xarray.open_zarr().' ) with self.assertRaisesWithLiteralMatch(ValueError, expected_msg): xarray_tensorstore.open_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] xarray.testing.assert_equal(actual, source) # no values are masked # invalid scaling - source.encoding = {"scale_factor": 10.0} + source.encoding = {'scale_factor': 10.0} path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - expected_msg = "variable baz uses scale/offset encoding" + expected_msg = 'variable baz uses scale/offset encoding' with self.assertRaisesRegex(ValueError, expected_msg): xarray_tensorstore.open_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)["baz"] + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=False)['baz'] self.assertFalse(actual.equals(source)) # not scaled properly # valid offset (coordinate only) source.encoding = {} - source.coords["x"].encoding = {"add_offset": -1} + source.coords['x'].encoding = {'add_offset': -1} path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)["baz"] + actual = xarray_tensorstore.open_zarr(path, mask_and_scale=True)['baz'] xarray.testing.assert_identical(actual, source) - self.assertEqual(actual.coords["x"].encoding["add_offset"], -1) + self.assertEqual(actual.coords['x'].encoding['add_offset'], -1) @parameterized.named_parameters( { - "testcase_name": "basic_indexing", - "key": (slice(1, None), slice(None), slice(None)), - "value": np.full((1, 2, 3), -1), - "mode": "r+", + 'testcase_name': 'basic_indexing', + 'key': (slice(1, None), slice(None), slice(None)), + 'value': np.full((1, 2, 3), -1), + 'mode': 'r+', }, { - "testcase_name": "outer_indexing", - "key": (np.array([0]), np.array([1]), slice(None)), - "value": np.full((1, 1, 3), -2), - "mode": "r+", + 'testcase_name': 'outer_indexing', + 'key': (np.array([0]), np.array([1]), slice(None)), + 'value': np.full((1, 1, 3), -2), + 'mode': 'r+', }, { - "testcase_name": "vectorized_indexing", - "key": (np.array([0]), np.array([0, 1]), slice(None)), - "value": np.full((2, 3), -3), - "mode": "r+", + 'testcase_name': 'vectorized_indexing', + 'key': (np.array([0]), np.array([0, 1]), slice(None)), + 'value': np.full((2, 3), -3), + 'mode': 'r+', }, { - "testcase_name": "read_only", - "key": (slice(1, None), slice(None), slice(None)), - "value": np.full((1, 2, 3), -1), - "mode": "r", + 'testcase_name': 'read_only', + 'key': (slice(1, None), slice(None), slice(None)), + 'value': np.full((1, 2, 3), -1), + 'mode': 'r', }, ) def test_setitem(self, key, value, mode): source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) source = xarray.DataArray( source_data, - dims=("x", "y", "z"), - name="baz", + dims=('x', 'y', 'z'), + name='baz', ) path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path, mode=mode)["baz"] - if mode == "r": + opened = xarray_tensorstore.open_zarr(path, mode=mode)['baz'] + if mode == 'r': with pytest.raises(ValueError): opened[key] = value return @@ -270,12 +270,12 @@ def test_setitem(self, key, value, mode): expected_data[key] = value expected = xarray.DataArray( expected_data, - dims=("x", "y", "z"), - name="baz", + dims=('x', 'y', 'z'), + name='baz', ) xarray.testing.assert_equal(read, expected) -if __name__ == "__main__": +if __name__ == '__main__': absltest.main() From 752741bb917286115bfc58e2e51ab2cd9dec9325 Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:51:48 -0500 Subject: [PATCH 12/17] Revert conftest.py From 5b02b7e021c1dc0f9c24cea59a71b5dbdd477949 Mon Sep 17 00:00:00 2001 From: Jeff Rhoades <37990507+rhoadesScholar@users.noreply.github.com> Date: Wed, 6 Nov 2024 10:56:10 -0500 Subject: [PATCH 13/17] Revert conftest.py From 324f6839cd68816e73112f82bc171abf34781055 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 6 Nov 2024 11:34:49 -0500 Subject: [PATCH 14/17] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Clean=20up=20docs?= =?UTF-8?q?trings=20and=20tests.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore.py | 6 ++++-- xarray_tensorstore_test.py | 29 ++++++++++++++--------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 321bb61..4d10e08 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -108,7 +108,8 @@ def __setitem__(self, key: indexing.ExplicitIndexer, value) -> None: else: assert isinstance(key, indexing.BasicIndexer) self.array[index_tuple] = value - self.future = None + # Invalidate the future so that the next read will pick up the new value + object.__setattr__(self, 'future', None) # xarray>2024.02.0 uses oindex and vindex properties, which are expected to # return objects whose __getitem__ method supports the appropriate form of @@ -240,7 +241,8 @@ def open_zarr( mask_and_scale: if True (default), attempt to apply masking and scaling like xarray.open_zarr(). This is only supported for coordinate variables and otherwise will raise an error. - mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). + mode: file mode to use when opening the Zarr group, either ‘r’ for read + only; or ‘r+’ for read/write (Zarr must exist). Returns: Dataset with all data variables opened via TensorStore. diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 8817650..db74256 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -226,28 +226,19 @@ def test_mask_and_scale(self): 'testcase_name': 'basic_indexing', 'key': (slice(1, None), slice(None), slice(None)), 'value': np.full((1, 2, 3), -1), - 'mode': 'r+', }, { 'testcase_name': 'outer_indexing', 'key': (np.array([0]), np.array([1]), slice(None)), 'value': np.full((1, 1, 3), -2), - 'mode': 'r+', }, { 'testcase_name': 'vectorized_indexing', 'key': (np.array([0]), np.array([0, 1]), slice(None)), 'value': np.full((2, 3), -3), - 'mode': 'r+', - }, - { - 'testcase_name': 'read_only', - 'key': (slice(1, None), slice(None), slice(None)), - 'value': np.full((1, 2, 3), -1), - 'mode': 'r', }, ) - def test_setitem(self, key, value, mode): + def test_setitem(self, key, value): source_data = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]) source = xarray.DataArray( source_data, @@ -257,11 +248,7 @@ def test_setitem(self, key, value, mode): path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path, mode=mode)['baz'] - if mode == 'r': - with pytest.raises(ValueError): - opened[key] = value - return + opened = xarray_tensorstore.open_zarr(path, mode="r+")['baz'] opened[key] = value read = xarray_tensorstore.read(opened) @@ -276,6 +263,18 @@ def test_setitem(self, key, value, mode): xarray.testing.assert_equal(read, expected) + def test_setitem_readonly(self): + source = xarray.DataArray( + np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), + dims=('x', 'y', 'z'), + name='baz', + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + + opened = xarray_tensorstore.open_zarr(path)['baz'] + with pytest.raises(ValueError): + opened[1:,...] = np.full((1, 2, 3), -1) if __name__ == '__main__': absltest.main() From f28364e3165085d7107e5c6637306a67611d2b67 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 6 Nov 2024 11:43:45 -0500 Subject: [PATCH 15/17] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Pyink=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore.py | 2 +- xarray_tensorstore_test.py | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 4d10e08..8e1ee2c 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -241,7 +241,7 @@ def open_zarr( mask_and_scale: if True (default), attempt to apply masking and scaling like xarray.open_zarr(). This is only supported for coordinate variables and otherwise will raise an error. - mode: file mode to use when opening the Zarr group, either ‘r’ for read + mode: file mode to use when opening the Zarr group, either ‘r’ for read only; or ‘r+’ for read/write (Zarr must exist). Returns: diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index db74256..9367287 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -248,7 +248,7 @@ def test_setitem(self, key, value): path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path, mode="r+")['baz'] + opened = xarray_tensorstore.open_zarr(path, mode='r+')['baz'] opened[key] = value read = xarray_tensorstore.read(opened) @@ -264,17 +264,18 @@ def test_setitem(self, key, value): xarray.testing.assert_equal(read, expected) def test_setitem_readonly(self): - source = xarray.DataArray( - np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), - dims=('x', 'y', 'z'), - name='baz', - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) + source = xarray.DataArray( + np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), + dims=('x', 'y', 'z'), + name='baz', + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) + + opened = xarray_tensorstore.open_zarr(path)['baz'] + with pytest.raises(ValueError): + opened[1:, ...] = np.full((1, 2, 3), -1) - opened = xarray_tensorstore.open_zarr(path)['baz'] - with pytest.raises(ValueError): - opened[1:,...] = np.full((1, 2, 3), -1) if __name__ == '__main__': absltest.main() From 8134fac6a0c124df20f52a9177baf3701d70ef54 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 6 Nov 2024 13:25:09 -0500 Subject: [PATCH 16/17] =?UTF-8?q?style:=20=F0=9F=8E=A8=20Switch=20mode=3D"?= =?UTF-8?q?r+"=20to=20write=3DTrue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also fix indentation. --- xarray_tensorstore.py | 7 +++---- xarray_tensorstore_test.py | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/xarray_tensorstore.py b/xarray_tensorstore.py index 8e1ee2c..9743d38 100644 --- a/xarray_tensorstore.py +++ b/xarray_tensorstore.py @@ -212,7 +212,7 @@ def open_zarr( *, context: tensorstore.Context | None = None, mask_and_scale: bool = True, - mode: str = 'r', + write: bool = False, ) -> xarray.Dataset: """Open an xarray.Dataset from Zarr using TensorStore. @@ -241,8 +241,7 @@ def open_zarr( mask_and_scale: if True (default), attempt to apply masking and scaling like xarray.open_zarr(). This is only supported for coordinate variables and otherwise will raise an error. - mode: file mode to use when opening the Zarr group, either ‘r’ for read - only; or ‘r+’ for read/write (Zarr must exist). + write: Allow write access. Defaults to False. Returns: Dataset with all data variables opened via TensorStore. @@ -274,7 +273,7 @@ def open_zarr( specs = {k: _zarr_spec_from_path(os.path.join(path, k)) for k in ds} array_futures = { - k: tensorstore.open(spec, read=True, write=mode.lower() == 'r+', context=context) + k: tensorstore.open(spec, read=True, write=write, context=context) for k, spec in specs.items() } arrays = {k: v.result() for k, v in array_futures.items()} diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 9367287..8aa99af 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -263,18 +263,18 @@ def test_setitem(self, key, value): xarray.testing.assert_equal(read, expected) - def test_setitem_readonly(self): - source = xarray.DataArray( - np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), - dims=('x', 'y', 'z'), - name='baz', - ) - path = self.create_tempdir().full_path - source.to_dataset().chunk().to_zarr(path) + def test_setitem_readonly(self): + source = xarray.DataArray( + np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]), + dims=('x', 'y', 'z'), + name='baz', + ) + path = self.create_tempdir().full_path + source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path)['baz'] - with pytest.raises(ValueError): - opened[1:, ...] = np.full((1, 2, 3), -1) + opened = xarray_tensorstore.open_zarr(path)['baz'] + with pytest.raises(ValueError): + opened[1:, ...] = np.full((1, 2, 3), -1) if __name__ == '__main__': From fa7ea05512b474ced8903355b92244b83a72cf68 Mon Sep 17 00:00:00 2001 From: rhoadesScholar Date: Wed, 6 Nov 2024 13:37:30 -0500 Subject: [PATCH 17/17] =?UTF-8?q?fix:=20=F0=9F=90=9B=20Update=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- xarray_tensorstore_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray_tensorstore_test.py b/xarray_tensorstore_test.py index 8aa99af..c40ab9b 100644 --- a/xarray_tensorstore_test.py +++ b/xarray_tensorstore_test.py @@ -248,7 +248,7 @@ def test_setitem(self, key, value): path = self.create_tempdir().full_path source.to_dataset().chunk().to_zarr(path) - opened = xarray_tensorstore.open_zarr(path, mode='r+')['baz'] + opened = xarray_tensorstore.open_zarr(path, write=True)['baz'] opened[key] = value read = xarray_tensorstore.read(opened)