Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Implement Hilbert distance #70

Merged
merged 38 commits into from
Jul 22, 2021
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
78403c0
calculate hilbert distances for geopandas
tastatham Jul 7, 2021
825e31e
calculate hilbert distances for dask-geopandas
tastatham Jul 7, 2021
cb173d6
add test to check whether calculated hilbert distances match between …
tastatham Jul 7, 2021
f5d5d52
reformatted previous commits using black
tastatham Jul 7, 2021
bc6fd49
updated _hilbert_distance with no for loops
tastatham Jul 7, 2021
d0f8802
drop test_hilbert_distance tmp
tastatham Jul 12, 2021
bc9b632
add numba acceleration
tastatham Jul 12, 2021
ce52794
update hilbert_distance.py docstring
tastatham Jul 12, 2021
92849da
updated black to avoid failing
tastatham Jul 12, 2021
fc898fa
add numba to continuous env yml files
tastatham Jul 12, 2021
0fba9a8
updated hilbert_distance to lazily evaluate total_bounds
tastatham Jul 12, 2021
9cdb098
updated core.py docstring for hilbert_distance
tastatham Jul 12, 2021
251ae60
reformat calculating hilbert distance for numba & cleaner syntax
tastatham Jul 12, 2021
4511b94
remove bounds_to_numpy
tastatham Jul 13, 2021
1266357
change x_width to width
tastatham Jul 13, 2021
8971210
update docstring for _continuous_to_discrete
tastatham Jul 13, 2021
f7eb6d5
split _hilbert_distance for testing/debugging
tastatham Jul 14, 2021
91465f3
output pd.series instead of array
tastatham Jul 14, 2021
8737e55
add test for calc hilbert_distance
tastatham Jul 14, 2021
b86b5d2
add hilbert_curve to CI env & fix test error
tastatham Jul 14, 2021
947bf77
Update continuous_integration/envs/38-latest.yaml
tastatham Jul 14, 2021
c8fbd7f
update hilbert test & ci env
tastatham Jul 14, 2021
3fa1d01
Preserve original gdf index using hilbert_distance
tastatham Jul 15, 2021
d8a2d46
Update hilbert curve dependency in 39-dev.yaml
tastatham Jul 15, 2021
dbfd011
Use latest version of numba
tastatham Jul 15, 2021
7d750a2
Use latest version of numba
tastatham Jul 15, 2021
a9cf761
Use latest version of numba
tastatham Jul 15, 2021
922addd
Explicitly call x & y mids in _continuous_to_discrete_coords
tastatham Jul 15, 2021
e193f87
Update dask_geopandas/hilbert_distance.py
tastatham Jul 15, 2021
aa47fdd
Drop unnecessary () from pytest.fixture in test_core.py
tastatham Jul 15, 2021
cdf9458
call total bounds and x&y mids in _continuous_discrete
tastatham Jul 15, 2021
98780cd
merge _hilbert_distance and _calculate_hilbert_distance
tastatham Jul 15, 2021
1e1c196
Use latest version of numba
tastatham Jul 15, 2021
8a8b114
add assert statement to check whether gdf indexes are equal
tastatham Jul 15, 2021
7c19310
Merge remote-tracking branch 'origin/master' into hilbert_distance
tastatham Jul 16, 2021
7ae61ca
add numba to requirements
tastatham Jul 16, 2021
bc41634
update docstring
tastatham Jul 21, 2021
e88f940
Update setup.py
martinfleis Jul 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions dask_geopandas/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import geopandas
from shapely.geometry.base import BaseGeometry
from shapely.geometry import box
from .hilbert_distance import _hilbert_distance


def _set_crs(df, crs, allow_override):
Expand Down Expand Up @@ -306,6 +307,30 @@ def explode(self):
def cx(self):
return _CoordinateIndexer(self)

def hilbert_distance(self, p=15):

"""
A function that calculates hilbert distance for each geometry
in each partition of a Dask-GeoDataFrame
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a sentence explaining what the Hilbert distance is and why it is useful, so that users do not have to google it to understand what the function does?


Parameters
----------
p : Hilbert curve parameter

Returns
----------
Distances for each partition
"""

# Compute total bounds of all partitions rather than each partition
total_bounds = self.total_bounds.compute()
tastatham marked this conversation as resolved.
Show resolved Hide resolved
tastatham marked this conversation as resolved.
Show resolved Hide resolved
# Calculate hilbert distances for each partition
distances = self.map_partitions(
lambda s: _hilbert_distance(s, total_bounds=total_bounds, p=p)
tastatham marked this conversation as resolved.
Show resolved Hide resolved
)

return distances


class GeoSeries(_Frame, dd.core.Series):
_partition_type = geopandas.GeoSeries
Expand Down
261 changes: 261 additions & 0 deletions dask_geopandas/hilbert_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
import geopandas
import numpy as np

# from numba import jit # optional - do we want numba as a dependency for dask-geopandas?
# ngjit = jit(nopython=True, nogil=True)
tastatham marked this conversation as resolved.
Show resolved Hide resolved


def _hilbert_distance(gdf, total_bounds, p):

"""
Calculate the hilbert distance for a GeoDataFrame based on the mid-point of
the bounds for each geom and total bounds of the collection of geoms

Based on: https://github.com/holoviz/spatialpandas/blob/
9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/dask.py#L172

Parameters
----------
gdf : GeoDataFrame
tastatham marked this conversation as resolved.
Show resolved Hide resolved

total_bounds : Total bounds of GeoDataFrame

p : Hilbert curve parameter

Returns
---------
Array of hilbert distances for each geom
"""

if total_bounds is None:
total_bounds = gdf.total_bounds

# Calculate bounds of each geom
bounds = gdf.bounds.to_numpy()
martinfleis marked this conversation as resolved.
Show resolved Hide resolved

# Hilbert Side len
side_length = 2 ** p

# Calculate x and y range of total bound coords - returns array
geom_ranges = [
(total_bounds[0], total_bounds[2]),
(total_bounds[1], total_bounds[3]),
]
tastatham marked this conversation as resolved.
Show resolved Hide resolved
# Calculate mid points for x and y bound coords - returns array
geom_mids = [
((bounds[:, 0] + bounds[:, 2]) / 2.0),
((bounds[:, 1] + bounds[:, 3]) / 2.0),
]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In general, do we know if there is an advantage to using such midpoints vs "centroid" vs "representative point" ?

(if there is no theoretical/practical reason for one or the other, we should maybe check which one is typically the cheapest to compute. EDIT: and based on a quick check, calculating the mids based on the bounds seems much faster)

Copy link
Contributor Author

@tastatham tastatham Jul 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My prior was that this approach would be faster - but I should have checked manually.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This result makes sense: the operations are in increasing levels of complexity. Midpoint is simple math based on bounds, centroid is more complex based on calculating the "center of mass" of the geometry (varies by geometry type), and "representative point" (point on surface) is probably yet more complex since it needs to ensure the result intersects the polygon.

More of a theoretical question (longer term): the key thing to consider here is how representative the midpoints are for ordinating geometries along the Hilbert curve: what is the tradeoff for how well the points represent the locations of the geometries for partitioning (e.g., suboptimal partitions) vs spatial operations performed against those partitions. Put differently: using midpoint for Hilbert may produce partitions quickly, but if those are suboptimal for overlay operations and make those much slower, then maybe it is worth a somewhat more expensive method for getting the representative points. To do this, one would need to compare the full compute time of calculating the Hilbert curve, repartitioning, and overlay. I'm thinking of a case like the admin boundaries of France, which includes overseas territories. Midpoint of bounds will be far away from any of those boundaries. Centroid will be maybe a bit better but still far away from those boundaries. Representative point would be in continental France (I think?) and thus be more optimal for repartitioning and then overlay with other European polygons. Though - this could easily be solved by exploding into single-part geometries...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Midpoints (and the same for centroids / representative points) might also give suboptimal results if you have a mix of large and small polygons. I was thinking it could be something to explore (later) if you could calculate the hilbert distance for eg the bounding box points, and consider those 4 points per row together when deciding the partitions. But then of course it's not a simple "sort the hilbert_distance column" to determine the partitions.

tastatham marked this conversation as resolved.
Show resolved Hide resolved

# Empty coord array
coords = np.zeros((bounds.shape[0], 2), dtype=np.int64)
# Transform continuous int to discrete int for each dimension
coords[:, 0] = _continuous_int_to_discrete_int(
geom_mids[0], geom_ranges[0], side_length
)
coords[:, 1] = _continuous_int_to_discrete_int(
geom_mids[1], geom_ranges[1], side_length
)
martinfleis marked this conversation as resolved.
Show resolved Hide resolved

# Calculate hilbert distance
hilbert_distances = _distances_from_coordinates(p, coords)

return hilbert_distances
tastatham marked this conversation as resolved.
Show resolved Hide resolved


# @ngjit
def _continuous_int_to_discrete_int(vals, val_range, n):
tastatham marked this conversation as resolved.
Show resolved Hide resolved

"""
Convert an array of values from continuous data coordinates to discrete
martinfleis marked this conversation as resolved.
Show resolved Hide resolved
int coordinates
martinfleis marked this conversation as resolved.
Show resolved Hide resolved

Based on: https://github.com/holoviz/spatialpandas/blob/
9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/utils.py#L9
tastatham marked this conversation as resolved.
Show resolved Hide resolved

Parameters
----------
vals : Array of continuous coordinates to be
([([val_1, val_2,..., val_n]), array([val_1, val_2,..., val_n])])

val_range : Ranges of x and y values ([(xmin, xmax), (ymin, ymax)])

n : Number of discrete coords (int)

Returns
---------
Array of discrete int coords
"""

x_width = val_range[1] - val_range[0]
martinfleis marked this conversation as resolved.
Show resolved Hide resolved
res = ((vals - val_range[0]) * (n / x_width)).astype(np.int64)
tastatham marked this conversation as resolved.
Show resolved Hide resolved

# clip
res[res < 0] = 0
res[res > n - 1] = n - 1
return res


def _distances_from_coordinates(p, coords):
    """
    Calculate the hilbert distance of every coordinate in an array

    Based on: https://github.com/holoviz/spatialpandas/blob/
    9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/spatialindex/hilbert_curve.py#L173

    Parameters
    ----------
    p : Hilbert curve param

    coords : 2D array of discrete int coordinates, one row per geom

    Returns
    ---------
    Array (int64) of hilbert distances, one per row of coords
    """

    n_rows = coords.shape[0]
    distances = np.zeros(n_rows, dtype=np.int64)
    # Rows are handed over by basic slicing; for a NumPy array that is a
    # view, so any in-place change made by the callee shows up in coords
    for row in range(n_rows):
        distances[row] = _distance_from_coordinate(p, coords[row, :])
    return distances


# @ngjit
def _distance_from_coordinate(p, coord):
    """
    Calculate hilbert distance for a single coord

    The coordinate is first transformed into its "transposed" hilbert form
    and then handed to ``_transpose_to_hilbert_integer`` to obtain the
    distance. The input is left untouched.

    Based on: https://github.com/holoviz/spatialpandas/blob/
    9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/spatialindex/rtree.py#L50

    Parameters
    ----------
    p : Hilbert curve param (number of bits used per dimension)

    coord : Array of discrete int coordinates, one value per dimension

    Returns
    ---------
    Hilbert distance of the coord, as returned by
    _transpose_to_hilbert_integer
    """

    # Work on a private copy: the transformation below is done in place and
    # must not clobber the caller's data (e.g. a row view of a 2D array)
    coord = np.array(coord)

    n = len(coord)
    M = 1 << (p - 1)

    # Walk the bit planes from the most significant (M) down to bit 1
    Q = M
    while Q > 1:
        P = Q - 1
        for i in range(n):
            if coord[i] & Q:
                # Bit set in dimension i: invert the low bits of dimension 0
                coord[0] ^= P
            else:
                # Bit clear: swap the differing low bits of dims 0 and i
                t = (coord[0] ^ coord[i]) & P
                coord[0] ^= t
                coord[i] ^= t
        Q >>= 1

    # Gray encode
    for i in range(1, n):
        coord[i] ^= coord[i - 1]
    t = 0
    Q = M
    while Q > 1:
        if coord[n - 1] & Q:
            t ^= Q - 1
        Q >>= 1
    for i in range(n):
        coord[i] ^= t

    h = _transpose_to_hilbert_integer(p, coord)
    return h


# @ngjit
def _transpose_to_hilbert_integer(p, coord):
    """
    Interleave the bits of a transposed coord into one hilbert integer

    Based on: https://github.com/holoviz/spatialpandas/blob/
    9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/spatialindex/hilbert_curve.py#L53


    Parameters
    ----------
    p : Hilbert curve param (number of bits taken from each value)

    coord : Array of coordinates

    Returns
    ---------
    Hilbert distance for the coord, as returned by _binary_2_int
    """

    n_dims = len(coord)
    bit_rows = [_int_2_binary(value, p) for value in coord]

    interleaved = np.zeros(n_dims * p, dtype=np.uint8)
    # Slot n_dims * i + j holds bit i of dimension j, i.e. dimension j
    # occupies every n_dims-th slot starting at offset j
    for j, bits in enumerate(bit_rows):
        interleaved[j::n_dims] = bits

    return _binary_2_int(interleaved)


# @ngjit
def _int_2_binary(v, width):

"""
Convert an array of values from discrete int coordinates to binary byte

Based on: https://github.com/holoviz/spatialpandas/blob/
9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/spatialindex/hilbert_curve.py#L12

Parameters
----------
p : Hilbert curve param

coord : Array of coordinates

Returns
---------
# Returns binary byte
"""

res = np.zeros(width, dtype=np.uint8)
for i in range(width):
res[width - i - 1] = v % 2 # zero-passed to width
v = v >> 1
return res


# @ngjit
def _binary_2_int(bin_vec):

"""
Convert binary byte to int

Based on: https://github.com/holoviz/spatialpandas/blob/
9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/spatialindex/hilbert_curve.py#L23

Parameters
----------
p : Hilbert curve param

coord : Array of coordinates

Returns
---------
# Returns discrete int
"""

res = 0
next_val = 1
width = len(bin_vec)
for i in range(width):
res += next_val * bin_vec[width - i - 1]
next_val <<= 1
return res
11 changes: 11 additions & 0 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,14 @@ def test_geoseries_apply(geoseries_polygons):
def test_geodataframe_html_repr(geodf_points):
dask_obj = dask_geopandas.from_geopandas(geodf_points, npartitions=2)
assert "Dask-GeoPandas GeoDataFrame" in dask_obj._repr_html_()


def test_hilbert_distance(geodf_points):
    """Partitioned hilbert distances must match the single-frame result."""
    # Imported locally so the test does not rely on a module-level import
    from dask_geopandas.hilbert_distance import _hilbert_distance

    df = geodf_points
    dask_obj = dask_geopandas.from_geopandas(df, npartitions=10)

    # Both computations must use the same curve parameter and the bounds of
    # the whole frame, otherwise the distances are not comparable
    p = 15
    expected = _hilbert_distance(df, df.total_bounds, p=p)
    result = dask_obj.hilbert_distance(p=p).compute()

    assert list(result) == list(expected)