Skip to content

Commit

Permalink
Find objects bounding boxes (#240)
Browse files Browse the repository at this point in the history
* Alternative find_objects implementation

* Dask docs suggest bag.fold is more efficient than bag.reduction

* Merge dataframes instead of using combine

* Make find_objects output obviously a dask dataframe

* A clearer way to use dask delayed

* Try to clarify dataframe column naming convention

* Additional clarifying comment

* Default step size for slices is None

* Use Marvin's suggestion so we don't have to call compute twice on result

* Revert delayed changes

* Be consistent with slice step (use default value)

* Remove redundant import statement

* Avoid user having to call compute twice on result

* Fix delayed so we know output is a dask dataframe

* Use functools partial to pass in array dimension information to _find_objects

* Improve docstrings in _find_objects.py

* Add check for integer array dtype in find_objects

* Re-trigger CI

* Improve find_objects docstring re return value

* find_objects _merge_bounding_boxes, clarify comment

* Fix find_objects bug where chunk has no non-zero labels

* Add test for find_objects with empty chunk in array
  • Loading branch information
GenevieveBuckley committed Dec 17, 2021
1 parent 9b4ff96 commit f1fc159
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 0 deletions.
47 changes: 47 additions & 0 deletions dask_image/ndmeasure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@
import functools
import operator
import warnings
from dask import compute, delayed

import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import numpy as np

from . import _utils
from ._utils import _label
from ._utils._find_objects import _array_chunk_location, _find_bounding_boxes, _find_objects

__all__ = [
"area",
Expand Down Expand Up @@ -202,6 +206,49 @@ def extrema(image, label_image=None, index=None):
return result


def find_objects(label_image):
    """Return bounding box slices for each object labelled by integers.

    Parameters
    ----------
    label_image : ndarray
        Image features noted by integers.

    Returns
    -------
    Dask dataframe
        Each row represents an individual integer label. Columns contain the
        slice information for the object boundaries in each dimension
        (columns are named: 0, 1, ..., ndim - 1).

    Raises
    ------
    ValueError
        If ``label_image`` does not have an integer dtype.

    Notes
    -----
    You must have the optional dependency ``dask[dataframe]`` installed
    to use the ``find_objects`` function.
    """
    if label_image.dtype.char not in np.typecodes['AllInteger']:
        raise ValueError("find_objects only accepts integer dtype arrays")

    ndim = label_image.ndim
    chunks = label_image.chunks

    # One delayed bounding-box dataframe per chunk. Each chunk is paired with
    # its block index so the chunk's offset in the full array can be added
    # to the chunk-local coordinates.
    block_ids = np.ndindex(*label_image.numblocks)
    block_slices = da.core.slices_from_chunks(chunks)
    per_chunk_boxes = [
        delayed(_find_bounding_boxes)(
            label_image[block_slice],
            _array_chunk_location(block_id, chunks),
        )
        for block_id, block_slice in zip(block_ids, block_slices)
    ]

    # Pairwise-merge the per-chunk dataframes into a single result.
    merge_pair = functools.partial(_find_objects, ndim)
    folded = (
        db.from_sequence(per_chunk_boxes)
        .fold(merge_pair, split_every=2)
        .to_delayed()
    )

    meta = dd.utils.make_meta([(i, object) for i in range(ndim)])
    # avoid the user having to call compute twice on result
    inner = delayed(compute)(folded)[0]
    return dd.from_delayed(
        inner, meta=meta, prefix="find-objects-", verify_meta=False
    )


def histogram(image,
min,
max,
Expand Down
75 changes: 75 additions & 0 deletions dask_image/ndmeasure/_utils/_find_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import numpy as np
import pandas as pd
from dask.delayed import Delayed
import dask.dataframe as dd


def _array_chunk_location(block_id, chunks):
"""Pixel coordinate of top left corner of the array chunk."""
array_location = []
for idx, chunk in zip(block_id, chunks):
array_location.append(sum(chunk[:idx]))
return tuple(array_location)


def _find_bounding_boxes(x, array_location):
"""An alternative to scipy.ndimage.find_objects.
We use this alternative because scipy.ndimage.find_objects
returns a tuple of length N, where N is the largest integer label.
This is not ideal for distributed labels, where there might be only
one or two objects in an image chunk labelled with very large integers.
This alternative function returns a pandas dataframe,
with one row per object found in the image chunk.
"""
unique_vals = np.unique(x)
unique_vals = unique_vals[unique_vals != 0]
result = {}
for val in unique_vals:
positions = np.where(x == val)
slices = tuple(slice(np.min(pos) + array_location[i], np.max(pos) + 1 + array_location[i]) for i, pos in enumerate(positions))
result[val] = slices
column_names = [i for i in range(x.ndim)] # column names are: 0, 1, ... nD
return pd.DataFrame.from_dict(result, orient='index', columns=column_names)


def _combine_slices(slices):
"Return the union of all slices."
if len(slices) == 1:
return slices[0]
else:
start = min([sl.start for sl in slices])
stop = max([sl.stop for sl in slices])
return slice(start, stop)


def _merge_bounding_boxes(x, ndim):
    """Merge the bounding boxes describing objects over multiple image chunks.

    Parameters
    ----------
    x : pandas.Series
        One row of the merged dataframe built in ``_find_objects``. Each
        index label is an array dimension number, followed by an underscore
        and a merge suffix when the row comes from a merge
        (0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ...). Missing entries (for example
        a label present on only one side of the outer merge) are dropped
        before combining.
    ndim : int
        Number of array dimensions.

    Returns
    -------
    pandas.Series
        Indexed by 0, 1, ..., ndim - 1 and named ``x.name``; each entry is
        the union of the input slices for that dimension.
    """
    x = x.dropna()
    data = {}
    # For each dimension in the array,
    # pick out the slice values belonging to that dimension
    # and combine the slices
    # (i.e. find the union; the slice expanded to all input slices).
    for i in range(ndim):
        # Array dimensions are labelled by a number followed by an underscore
        # i.e. column labels are: 0_x, 1_x, 2_x, ... 0_y, 1_y, 2_y, ...
        # (x and y represent the pair of chunks label slices are merged from)
        # Compare the whole dimension number rather than a string prefix,
        # otherwise dimension 1 would also pick up 10_x, 11_x, ... when
        # the array has more than ten dimensions.
        dimension = str(i)
        slices = [
            x[label] for label in x.index
            if str(label).partition("_")[0] == dimension
        ]
        data[i] = _combine_slices(slices)
    return pd.Series(data=data, index=list(range(ndim)), name=x.name)


def _find_objects(ndim, df1, df2):
    """Main utility function for find_objects.

    Outer-merges two bounding-box dataframes on their index and combines,
    row by row, the slices that belong to the same dimension.

    Parameters
    ----------
    ndim : int
        Number of array dimensions (one dataframe column per dimension).
    df1, df2 : dataframe or dask Delayed
        Bounding-box dataframes to merge. ``Delayed`` inputs are wrapped
        into dask dataframes first.

    Returns
    -------
    Dask dataframe
        One row per label, one column per dimension.
    """
    meta = dd.utils.make_meta([(i, object) for i in range(ndim)])
    left, right = (
        dd.from_delayed(frame, meta=meta) if isinstance(frame, Delayed)
        else frame
        for frame in (df1, df2)
    )
    merged = dd.merge(
        left, right, how="outer", left_index=True, right_index=True
    )
    return merged.apply(_merge_bounding_boxes, ndim=ndim, axis=1, meta=meta)
87 changes: 87 additions & 0 deletions tests/test_dask_image/test_ndmeasure/test_find_objects.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from dask_image.ndmeasure._utils import _labeled_comprehension_delayed
import dask.array as da
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pytest

import dask_image.ndmeasure


@pytest.fixture
def label_image():
    """Return small label image for tests.

    A (5, 10) integer dask array split into (5, 5) chunks:

    array([[  0,   0,   0,   0,   0,   0,   0, 333, 333, 333],
           [111, 111,   0,   0,   0,   0,   0, 333, 333, 333],
           [111, 111,   0,   0,   0,   0,   0,   0,   0,   0],
           [  0,   0,   0, 222, 222, 222, 222, 222, 222,   0],
           [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0]])

    Label 222 spans both chunks.
    """
    labels = np.zeros((5, 10), dtype=int)
    labels[1:3, 0:2] = 111
    labels[3, 3:-2] = 222
    labels[0:2, -3:] = 333
    return da.from_array(labels, chunks=(5, 5))


@pytest.fixture
def label_image_with_empty_chunk():
    """Return small label image with an empty chunk for tests.

    A (6, 6) integer dask array split into (3, 3) chunks:

    array([[  0,   0,   0,   0,   0,   0],
           [111, 111,   0,   0,   0,   0],
           [111, 111,   0,   0,   0,   0],
           [  0,   0,   0,   0,   0,   0],
           [  0,   0,   0, 222, 222, 222],
           [  0,   0,   0,   0,   0,   0]])

    The top-right and bottom-left chunks contain no labels.
    """
    labels = np.zeros((6, 6), dtype=int)
    labels[1:3, 0:2] = 111
    labels[4, 3:] = 222
    return da.from_array(labels, chunks=(3, 3))


def test_find_objects(label_image):
    """find_objects returns a dask dataframe of 2D bounding slices."""
    lazy_boxes = dask_image.ndmeasure.find_objects(label_image)
    assert isinstance(lazy_boxes, dd.DataFrame)

    boxes = lazy_boxes.compute()
    assert isinstance(boxes, pd.DataFrame)

    # Outer keys are dimensions (columns), inner keys are labels (rows).
    slices_by_dimension = {
        0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)},
        1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)},
    }
    expected = pd.DataFrame.from_dict(slices_by_dimension)
    assert boxes.equals(expected)


def test_3d_find_objects(label_image):
    """find_objects handles a 3D image made of two stacked 2D copies."""
    stacked = da.stack([label_image, label_image], axis=2)
    lazy_boxes = dask_image.ndmeasure.find_objects(stacked)
    assert isinstance(lazy_boxes, dd.DataFrame)

    boxes = lazy_boxes.compute()
    assert isinstance(boxes, pd.DataFrame)

    # Outer keys are dimensions (columns), inner keys are labels (rows).
    slices_by_dimension = {
        0: {111: slice(1, 3), 222: slice(3, 4), 333: slice(0, 2)},
        1: {111: slice(0, 2), 222: slice(3, 8), 333: slice(7, 10)},
        2: {111: slice(0, 2), 222: slice(0, 2), 333: slice(0, 2)},
    }
    expected = pd.DataFrame.from_dict(slices_by_dimension)
    assert boxes.equals(expected)


def test_find_objects_with_empty_chunks(label_image_with_empty_chunk):
    """find_objects works when some chunks contain no labels."""
    lazy_boxes = dask_image.ndmeasure.find_objects(
        label_image_with_empty_chunk
    )
    assert isinstance(lazy_boxes, dd.DataFrame)

    boxes = lazy_boxes.compute()
    assert isinstance(boxes, pd.DataFrame)

    # Outer keys are dimensions (columns), inner keys are labels (rows).
    slices_by_dimension = {
        0: {111: slice(1, 3, None), 222: slice(4, 5, None)},
        1: {111: slice(0, 2, None), 222: slice(3, 6, None)},
    }
    expected = pd.DataFrame.from_dict(slices_by_dimension)
    assert boxes.equals(expected)

0 comments on commit f1fc159

Please sign in to comment.