diff --git a/continuous_integration/envs/37-latest.yaml b/continuous_integration/envs/37-latest.yaml index 7ffc2597..e1bcf33e 100644 --- a/continuous_integration/envs/37-latest.yaml +++ b/continuous_integration/envs/37-latest.yaml @@ -5,6 +5,7 @@ dependencies: # required dependencies - python=3.7 - dask=2.18 + - numba - distributed - geopandas - pygeos @@ -12,3 +13,4 @@ dependencies: # test dependencies - pytest - pytest-cov + - hilbertcurve diff --git a/continuous_integration/envs/38-latest.yaml b/continuous_integration/envs/38-latest.yaml index d3c43c0f..9b583350 100644 --- a/continuous_integration/envs/38-latest.yaml +++ b/continuous_integration/envs/38-latest.yaml @@ -5,6 +5,7 @@ dependencies: # required dependencies - python=3.8 - dask=2021.05.0 + - numba - distributed - geopandas - pygeos @@ -12,5 +13,6 @@ dependencies: # test dependencies - pytest - pytest-cov + - hilbertcurve # optional dependencies - pyarrow diff --git a/continuous_integration/envs/39-dev.yaml b/continuous_integration/envs/39-dev.yaml index 8d951f08..b4308c50 100644 --- a/continuous_integration/envs/39-dev.yaml +++ b/continuous_integration/envs/39-dev.yaml @@ -4,6 +4,7 @@ channels: dependencies: # required dependencies - python=3.9 + - numba - distributed - pandas - shapely @@ -13,6 +14,7 @@ dependencies: # test dependencies - pytest - pytest-cov + - hilbertcurve # optional dependencies - pyarrow - pip diff --git a/continuous_integration/envs/39-latest.yaml b/continuous_integration/envs/39-latest.yaml index 8c127af1..e93c633a 100644 --- a/continuous_integration/envs/39-latest.yaml +++ b/continuous_integration/envs/39-latest.yaml @@ -5,6 +5,7 @@ dependencies: # required dependencies - python=3.9 - dask + - numba - distributed - geopandas - pygeos @@ -12,5 +13,6 @@ dependencies: # test dependencies - pytest - pytest-cov + - hilbertcurve # optional dependencies - pyarrow diff --git a/dask_geopandas/core.py b/dask_geopandas/core.py index 6b202ba5..1912846d 100644 --- a/dask_geopandas/core.py +++ b/dask_geopandas/core.py @@ -10,6 +10,7 @@ import geopandas from shapely.geometry.base import BaseGeometry from shapely.geometry import box +from .hilbert_distance import _hilbert_distance def _set_crs(df, crs, allow_override): @@ -304,6 +305,37 @@ def explode(self): def cx(self): return _CoordinateIndexer(self) + def hilbert_distance(self, p=15): + + """ + A function that calculates the Hilbert distance between the geometry bounds + and total bounds of a Dask-GeoDataFrame. + The Hilbert distance can be used to spatially partition Dask-GeoPandas objects, + by mapping two dimensional geometries along the Hilbert curve. + + Parameters + ---------- + + p : The number of iterations used in constructing the Hilbert curve. + + Returns + ---------- + Distances for each partition + """ + + # Compute total bounds of all partitions rather than each partition + total_bounds = self.total_bounds + + # Calculate hilbert distances for each partition + distances = self.map_partitions( + _hilbert_distance, + total_bounds=total_bounds, + p=p, + meta=pd.Series([], name="hilbert_distance", dtype="int"), + ) + + return distances + class GeoSeries(_Frame, dd.core.Series): _partition_type = geopandas.GeoSeries diff --git a/dask_geopandas/hilbert_distance.py b/dask_geopandas/hilbert_distance.py new file mode 100644 index 00000000..0443616e --- /dev/null +++ b/dask_geopandas/hilbert_distance.py @@ -0,0 +1,256 @@ +import numpy as np +import pandas as pd +from numba import jit + +ngjit = jit(nopython=True, nogil=True) + +# Based on: https://github.com/holoviz/spatialpandas/blob/ +# 9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/dask.py + + +def _hilbert_distance(gdf, total_bounds, p): + + """ + Calculate hilbert distance for a GeoDataFrame + int coordinates + + Parameters + ---------- + gdf : GeoDataFrame + + total_bounds : Total bounds of geometries - array + + p : The number of iterations used in constructing the Hilbert curve + + Returns + --------- + Pandas Series containing hilbert distances + """ + + # Compute bounds as array + bounds = gdf.bounds.to_numpy() + # Compute hilbert distances + coords = _continuous_to_discrete_coords(total_bounds, bounds, p) + distances = _distances_from_coordinates(p, coords) + + return pd.Series(distances, index=gdf.index, name="hilbert_distance") + + +@ngjit +def _continuous_to_discrete_coords(total_bounds, bounds, p): + + """ + Calculates mid points & ranges of geoms and returns + as discrete coords + + Parameters + ---------- + + total_bounds : Total bounds of geometries - array + + bounds : Bounds of each geometry - array + + p : The number of iterations used in constructing the Hilbert curve + + Returns + --------- + Discrete two-dimensional numpy array + Two-dimensional array Array of hilbert distances for each geom + """ + + # Hilbert Side len + side_length = 2 ** p + + # Calculate x and y range of total bound coords - returns array + xmin, ymin, xmax, ymax = total_bounds + + # Calculate mid points for x and y bound coords - returns array + x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0 + y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0 + + # Transform continuous int to discrete int for each dimension + x_int = _continuous_to_discrete(x_mids, (xmin, xmax), side_length) + y_int = _continuous_to_discrete(y_mids, (ymin, ymax), side_length) + # Stack x and y discrete ints + coords = np.stack((x_int, y_int), axis=1) + + return coords + + +@ngjit +def _continuous_to_discrete(vals, val_range, n): + + """ + Convert a continuous one-dimensional array to discrete int + based on values and their ranges + + Parameters + ---------- + vals : Array of continuous values + + val_range : Tuple containing range of continuous values + + n : Number of discrete values + + Returns + --------- + One-dimensional array of discrete ints + """ + + width = val_range[1] - val_range[0] + res = ((vals - val_range[0]) * (n / width)).astype(np.int64) + + # TO DO: When numba 0.54 releases - used.clip(res, min=0, max=n, out=res) + # clip + res[res < 0] = 0 + res[res > n - 1] = n - 1 + + return res + + +@ngjit +def _distances_from_coordinates(p, coords): + + """ + Calculate hilbert distance for a set of coords + + Parameters + ---------- + p : The number of iterations used in constructing the Hilbert curve. + + coords : Array of coordinates + + Returns + --------- + Array of hilbert distances for each geom + """ + + result = np.zeros(coords.shape[0], dtype=np.int64) + # For each coord calculate hilbert distance + for i in range(coords.shape[0]): + coord = coords[i, :] + result[i] = _distance_from_coordinate(p, coord) + return result + + +@ngjit +def _distance_from_coordinate(p, coord): + + """ + Calculate hilbert distance for a single coord + + Parameters + ---------- + p : The number of iterations used in constructing the Hilbert curve + + coord : Array of coordinates + + Returns + --------- + Array of hilbert distances for a single coord + """ + + n = len(coord) + M = 1 << (p - 1) + Q = M + while Q > 1: + P = Q - 1 + for i in range(n): + if coord[i] & Q: + coord[0] ^= P + else: + t = (coord[0] ^ coord[i]) & P + coord[0] ^= t + coord[i] ^= t + Q >>= 1 + # Gray encode + for i in range(1, n): + coord[i] ^= coord[i - 1] + t = 0 + Q = M + while Q > 1: + if coord[n - 1] & Q: + t ^= Q - 1 + Q >>= 1 + for i in range(n): + coord[i] ^= t + h = _transpose_to_hilbert_integer(p, coord) + return h + + +@ngjit +def _transpose_to_hilbert_integer(p, coord): + + """ + Calculate hilbert distance for a single coord + + Parameters + ---------- + p : The number of iterations used in constructing the Hilbert curve + + coord : Array of coordinates + + Returns + --------- + Array of hilbert distances for a single coord + """ + + n = len(coord) + bins = [_int_2_binary(v, p) for v in coord] + concat = np.zeros(n * p, dtype=np.uint8) + for i in range(p): + for j in range(n): + concat[n * i + j] = bins[j][i] + + h = _binary_2_int(concat) + return h + + +@ngjit +def _int_2_binary(v, width): + + """ + Convert an array of values from discrete int coordinates to binary byte + + Parameters + ---------- + p : The number of iterations used in constructing the Hilbert curve + + coord : Array of coordinates + + Returns + --------- + Binary byte + """ + + res = np.zeros(width, dtype=np.uint8) + for i in range(width): + res[width - i - 1] = v % 2 # zero-passed to width + v = v >> 1 + return res + + +@ngjit +def _binary_2_int(bin_vec): + + """ + Convert binary byte to int + + Parameters + ---------- + p : The number of iterations used in constructing the Hilbert curve + + coord : Array of coordinates + + Returns + --------- + Discrete int + """ + + res = 0 + next_val = 1 + width = len(bin_vec) + for i in range(width): + res += next_val * bin_vec[width - i - 1] + next_val <<= 1 + return res diff --git a/setup.py b/setup.py index 2542f6bc..d53add87 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ "geopandas", "dask>=2.18.0,!=2021.05.1", "distributed>=2.18.0,!=2021.05.1", + "numba", ] diff --git a/tests/test_hilbert_distance.py b/tests/test_hilbert_distance.py new file mode 100644 index 00000000..d648a2d2 --- /dev/null +++ b/tests/test_hilbert_distance.py @@ -0,0 +1,62 @@ +import pytest +import pandas as pd +from pandas.testing import assert_index_equal +from hilbertcurve.hilbertcurve import HilbertCurve +from dask_geopandas.hilbert_distance import _continuous_to_discrete_coords +from dask_geopandas import from_geopandas +import geopandas +from shapely.geometry import Point, LineString, Polygon + + +@pytest.fixture +def geoseries_points(): + p1 = Point(1, 2) + p2 = Point(2, 3) + p3 = Point(3, 4) + p4 = Point(4, 1) + return geopandas.GeoSeries([p1, p2, p3, p4]) + + +@pytest.fixture +def geoseries_lines(): + l1 = LineString([(0, 0), (0, 1), (1, 1)]) + l2 = LineString([(0, 0), (1, 0), (1, 1), (0, 1)]) + return geopandas.GeoSeries([l1, l2] * 2) + + +@pytest.fixture() +def geoseries_polygons(): + t1 = Polygon([(0, 3.5), (7, 2.4), (1, 0.1)]) + t2 = Polygon([(0, 0), (1, 1), (0, 1)]) + sq1 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]) + sq2 = Polygon([(0, 0), (1, 0), (1, 2), (0, 2)]) + return geopandas.GeoSeries([t1, t2, sq1, sq2]) + + +def hilbert_distance_dask(geoseries): + + bounds = geoseries.bounds.to_numpy() + total_bounds = geoseries.total_bounds + coords = _continuous_to_discrete_coords(total_bounds, bounds, p=15) + + hilbert_curve = HilbertCurve(p=15, n=2) + expected = hilbert_curve.distances_from_points(coords) + + ddf = from_geopandas(geoseries, npartitions=1) + result = ddf.hilbert_distance().compute() + + assert list(result) == expected + assert isinstance(result, pd.Series) + assert_index_equal(ddf.index.compute(), result.index) + + +def test_hilbert_distance_points(geoseries_points): + hilbert_distance_dask(geoseries_points) + + +def test_hilbert_distance_lines(geoseries_lines): + hilbert_distance_dask(geoseries_lines) + + +def test_hilbert_distance_polygons(geoseries_polygons): + hilbert_distance_dask(geoseries_polygons)