Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix bug when chunking large arrays with a subset defined #2302

Merged
merged 9 commits into from
Jun 24, 2022
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ v1.4.0 (unreleased)

* Add support for using degrees in full-sphere projections. [#2279]

* Fixed a bug when using `compute_statistic` on an array large enough to
need chunking when a subset is defined. [#2302]
rosteen marked this conversation as resolved.
Show resolved Hide resolved

v1.3.0 (2022-04-22)
-------------------

Expand Down
14 changes: 12 additions & 2 deletions glue/core/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -1694,6 +1694,7 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
# later we will need to pad out the result of compute_statistic.
subarray_slices = None

chunk_view = None
if subset_state:
if isinstance(subset_state, SliceSubsetState) and view is None:
mask = None
Expand Down Expand Up @@ -1766,6 +1767,8 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
mask_idim += 1
else:
new_view.append(view[idim])
# This is the chunk view, which we'll need later
chunk_view = view
view = tuple(new_view)
else: # pragma: nocover
# This should probably never happen, but just in case!
Expand Down Expand Up @@ -1828,9 +1831,16 @@ def compute_statistic(self, statistic, cid, subset_state=None, axis=None,
# of the if statement above.
dhomeier marked this conversation as resolved.
Show resolved Hide resolved
if not isinstance(axis, tuple):
axis = (axis,)
full_shape = [self.shape[idim] for idim in range(self.ndim) if idim not in axis]
full_result = np.zeros(full_shape) * np.nan
result_slices = tuple([subarray_slices[idim] for idim in range(self.ndim) if idim not in axis])

if chunk_view is None:
full_shape = [self.shape[idim] for idim in range(self.ndim) if idim not in axis]
else:
chunk_shape = subset_state.to_mask(self, chunk_view).shape
full_shape = [chunk_shape[idim] for idim in range(self.ndim) if idim not in axis]
view_start = [chunk_view[idim].start for idim in range(self.ndim) if idim not in axis][0]
rosteen marked this conversation as resolved.
Show resolved Hide resolved

full_result = np.zeros(full_shape) * np.nan
full_result[result_slices] = result
return full_result

Expand Down
86 changes: 86 additions & 0 deletions glue/core/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,9 +855,54 @@ def test_compute_statistic_chunks(shape):
data = Data(x=np.random.random(shape))

axis = tuple(range(data.ndim - 1))

assert_allclose(data.compute_statistic('mean', data.id['x'], axis=axis),
data.compute_statistic('mean', data.id['x'], axis=axis, n_chunk_max=10))

subset_state = SliceSubsetState(data, [slice(5)])
stats = data.compute_statistic('mean', data.id['x'], axis=axis, subset_state=subset_state)
chunked = data.compute_statistic('mean', data.id['x'], axis=axis, subset_state=subset_state,
n_chunk_max=10)
assert_allclose(stats, chunked)

subset_state = data.id['x'] > 0.25
stats = data.compute_statistic('mean', data.id['x'], axis=axis, subset_state=subset_state)
chunked = data.compute_statistic('mean', data.id['x'], axis=axis, subset_state=subset_state,
n_chunk_max=10)
assert_allclose(stats, chunked)

roi = RangeROI('x', min=0.1, max=0.95)
subset_state = roi_to_subset_state(roi, x_att='x')
stats = data.compute_statistic('mean', data.id['x'], axis=axis, subset_state=subset_state)
chunked = data.compute_statistic('mean', data.id['x'], axis=axis, subset_state=subset_state,
n_chunk_max=10)
assert_allclose(stats, chunked)

if data.ndim < 3:
return

assert_allclose(data.compute_statistic('mean', data.id['x'], axis=2),
data.compute_statistic('mean', data.id['x'], axis=2, n_chunk_max=10))

subset_state = SliceSubsetState(data, [slice(5)])
stats = data.compute_statistic('mean', data.id['x'], axis=2, subset_state=subset_state)
chunked = data.compute_statistic('mean', data.id['x'], axis=2, subset_state=subset_state,
n_chunk_max=10)
assert_allclose(stats, chunked)

subset_state = data.id['x'] > 0.25
stats = data.compute_statistic('mean', data.id['x'], axis=2, subset_state=subset_state)
chunked = data.compute_statistic('mean', data.id['x'], axis=2, subset_state=subset_state,
n_chunk_max=10)
assert_allclose(stats, chunked)

roi = RangeROI('x', min=0.1, max=0.95)
subset_state = roi_to_subset_state(roi, x_att='x')
stats = data.compute_statistic('mean', data.id['x'], axis=2, subset_state=subset_state)
chunked = data.compute_statistic('mean', data.id['x'], axis=2, subset_state=subset_state,
n_chunk_max=10)
assert_allclose(stats, chunked)


def test_compute_statistic_random_subset():

Expand Down Expand Up @@ -991,6 +1036,47 @@ def test_compute_statistic_shape():
assert result.shape == (20,)


def test_compute_statistic_shape_view():

    # Test the compute_statistic method with the same optimizations, but setting
    # the `view` parameter for sub-slicing the dataset.

    array = np.ones(10 * 20 * 30).reshape((10, 20, 30))
    array[3:5, 6:14, 10:21] += 1

    data = Data(x=array, y=array)

    # The view selects a (5, 8, 10) sub-cube, so the expected result shapes
    # below are those of the view with the collapsed axes removed, not those
    # of the full (10, 20, 30) array.
    view = (slice(0, 5), slice(4, 12), slice(0, 10))

    # Run identical shape checks for a component-inequality subset state and
    # for a subset state built from a range ROI.
    roi = RangeROI('x', min=1.5, max=2.0)
    subset_states = (data.id['y'] > 1.5,
                     roi_to_subset_state(roi, x_att='x'))

    for subset_state in subset_states:

        # No axis given: the statistic collapses to a single scalar.
        result = data.compute_statistic('sum', data.id['x'], subset_state=subset_state, view=view)
        assert np.isscalar(result)

        # Collapsing axis 1 leaves axes 0 and 2 of the view.
        result = data.compute_statistic('sum', data.id['x'], subset_state=subset_state,
                                        axis=1, view=view)
        assert result.shape == (5, 10)

        # Collapsing axes 0 and 2 leaves only axis 1 of the view.
        result = data.compute_statistic('sum', data.id['x'], subset_state=subset_state,
                                        axis=(0, 2), view=view)
        assert result.shape == (8,)
dhomeier marked this conversation as resolved.
Show resolved Hide resolved


def test_compute_histogram_log():

# Make sure that the returned histogram is NaN everywhere if either of the
Expand Down