Skip to content

Commit

Permalink
ENH: Implement Hilbert distance (#70)
Browse files Browse the repository at this point in the history
* calculate hilbert distances for geopandas

* calculate hilbert distances for dask-geopandas

* add test to check whether calculated hilbert distances match between geopandas as dask-geopandas

* reformatted previous commits using black

* updated _hilbert_distance with no for loops

* drop test_hilbert_distance tmp

* add numba acceleration

* update hilbert_distance.py docstring

* updated black to avoid failing

* add numba to continuous env yml files

* updated hilbert_distance to lazily evaluate total_bounds

* updated core.py docstring for hilbert_distance

* reformat calculating hilbert distance for numba & cleaner syntax

* remove bounds_to_numpy

* change x_width to width

* update docstring for _continuous_to_discrete

* split _hilbert_distance for testing/debugging

* output pd.series instead of array

* add test for calc hilbert_distance

* add hilbert_curve to CI env & fix test error

* Update continuous_integration/envs/38-latest.yaml

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>

* update hilbert test & ci env

* Preserve original gdf index using hilbert_distance

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Update hilbert curve dependency in 39-dev.yaml

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>

* Use latest version of numba

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Use latest version of numba

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Use latest version of numba

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Explicitly call x & y mids in _continuous_to_discrete_coords

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Update dask_geopandas/hilbert_distance.py

Explicitly call total_bounds in _continuous_to_discrete_coords

Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>

* Drop unnecessary () from pytest.fixture in test_core.py

* call total bounds and x&y mids in _continuous_discrete

* merge _hilbert_distance and _calculate_hilbert_distance

* Use latest version of numba

* add assert statement to check whether gdf indexes are equal

* add numba to requirements

* update docstring

* Update setup.py

Co-authored-by: Martin Fleischmann <martin@martinfleischmann.net>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
  • Loading branch information
3 people committed Jul 22, 2021
1 parent 5984a6f commit 837f2a3
Show file tree
Hide file tree
Showing 8 changed files with 359 additions and 0 deletions.
2 changes: 2 additions & 0 deletions continuous_integration/envs/37-latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ dependencies:
# required dependencies
- python=3.7
- dask=2.18
- numba
- distributed
- geopandas
- pygeos
- pyproj=2.6 # unpin when pyproj 3.1 is released with multithreading fix
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
2 changes: 2 additions & 0 deletions continuous_integration/envs/38-latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ dependencies:
# required dependencies
- python=3.8
- dask=2021.05.0
- numba
- distributed
- geopandas
- pygeos
- pyproj=2.6 # unpin when pyproj 3.1 is released with multithreading fix
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
# optional dependencies
- pyarrow
2 changes: 2 additions & 0 deletions continuous_integration/envs/39-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ channels:
dependencies:
# required dependencies
- python=3.9
- numba
- distributed
- pandas
- shapely
Expand All @@ -13,6 +14,7 @@ dependencies:
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
# optional dependencies
- pyarrow
- pip
Expand Down
2 changes: 2 additions & 0 deletions continuous_integration/envs/39-latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ dependencies:
# required dependencies
- python=3.9
- dask
- numba
- distributed
- geopandas
- pygeos
- pyproj=2.6 # unpin when pyproj 3.1 is released with multithreading fix
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
# optional dependencies
- pyarrow
32 changes: 32 additions & 0 deletions dask_geopandas/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import geopandas
from shapely.geometry.base import BaseGeometry
from shapely.geometry import box
from .hilbert_distance import _hilbert_distance


def _set_crs(df, crs, allow_override):
Expand Down Expand Up @@ -304,6 +305,37 @@ def explode(self):
def cx(self):
return _CoordinateIndexer(self)

def hilbert_distance(self, p=15):

"""
A function that calculates the Hilbert distance between the geometry bounds
and total bounds of a Dask-GeoDataFrame.
The Hilbert distance can be used to spatially partition Dask-GeoPandas objects,
by mapping two dimensional geometries along the Hilbert curve.
Parameters
----------
p : The number of iterations used in constructing the Hilbert curve.
Returns
----------
Distances for each partition
"""

# Compute total bounds of all partitions rather than each partition
total_bounds = self.total_bounds

# Calculate hilbert distances for each partition
distances = self.map_partitions(
_hilbert_distance,
total_bounds=total_bounds,
p=p,
meta=pd.Series([], name="hilbert_distance", dtype="int"),
)

return distances


class GeoSeries(_Frame, dd.core.Series):
_partition_type = geopandas.GeoSeries
Expand Down
256 changes: 256 additions & 0 deletions dask_geopandas/hilbert_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
import numpy as np
import pandas as pd
from numba import jit

ngjit = jit(nopython=True, nogil=True)

# Based on: https://github.com/holoviz/spatialpandas/blob/
# 9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/dask.py


def _hilbert_distance(gdf, total_bounds, p):

"""
Calculate hilbert distance for a GeoDataFrame
int coordinates
Parameters
----------
gdf : GeoDataFrame
total_bounds : Total bounds of geometries - array
p : The number of iterations used in constructing the Hilbert curve
Returns
---------
Pandas Series containing hilbert distances
"""

# Compute bounds as array
bounds = gdf.bounds.to_numpy()
# Compute hilbert distances
coords = _continuous_to_discrete_coords(total_bounds, bounds, p)
distances = _distances_from_coordinates(p, coords)

return pd.Series(distances, index=gdf.index, name="hilbert_distance")


@ngjit
def _continuous_to_discrete_coords(total_bounds, bounds, p):

"""
Calculates mid points & ranges of geoms and returns
as discrete coords
Parameters
----------
total_bounds : Total bounds of geometries - array
bounds : Bounds of each geometry - array
p : The number of iterations used in constructing the Hilbert curve
Returns
---------
Discrete two-dimensional numpy array
Two-dimensional array Array of hilbert distances for each geom
"""

# Hilbert Side len
side_length = 2 ** p

# Calculate x and y range of total bound coords - returns array
xmin, ymin, xmax, ymax = total_bounds

# Calculate mid points for x and y bound coords - returns array
x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0
y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0

# Transform continuous int to discrete int for each dimension
x_int = _continuous_to_discrete(x_mids, (xmin, xmax), side_length)
y_int = _continuous_to_discrete(y_mids, (ymin, ymax), side_length)
# Stack x and y discrete ints
coords = np.stack((x_int, y_int), axis=1)

return coords


@ngjit
def _continuous_to_discrete(vals, val_range, n):

"""
Convert a continuous one-dimensional array to discrete int
based on values and their ranges
Parameters
----------
vals : Array of continuous values
val_range : Tuple containing range of continuous values
n : Number of discrete values
Returns
---------
One-dimensional array of discrete ints
"""

width = val_range[1] - val_range[0]
res = ((vals - val_range[0]) * (n / width)).astype(np.int64)

# TO DO: When numba 0.54 releases - used.clip(res, min=0, max=n, out=res)
# clip
res[res < 0] = 0
res[res > n - 1] = n - 1

return res


@ngjit
def _distances_from_coordinates(p, coords):

"""
Calculate hilbert distance for a set of coords
Parameters
----------
p : The number of iterations used in constructing the Hilbert curve.
coords : Array of coordinates
Returns
---------
Array of hilbert distances for each geom
"""

result = np.zeros(coords.shape[0], dtype=np.int64)
# For each coord calculate hilbert distance
for i in range(coords.shape[0]):
coord = coords[i, :]
result[i] = _distance_from_coordinate(p, coord)
return result


@ngjit
def _distance_from_coordinate(p, coord):

"""
Calculate hilbert distance for a single coord
Parameters
----------
p : The number of iterations used in constructing the Hilbert curve
coord : Array of coordinates
Returns
---------
Array of hilbert distances for a single coord
"""

n = len(coord)
M = 1 << (p - 1)
Q = M
while Q > 1:
P = Q - 1
for i in range(n):
if coord[i] & Q:
coord[0] ^= P
else:
t = (coord[0] ^ coord[i]) & P
coord[0] ^= t
coord[i] ^= t
Q >>= 1
# Gray encode
for i in range(1, n):
coord[i] ^= coord[i - 1]
t = 0
Q = M
while Q > 1:
if coord[n - 1] & Q:
t ^= Q - 1
Q >>= 1
for i in range(n):
coord[i] ^= t
h = _transpose_to_hilbert_integer(p, coord)
return h


@ngjit
def _transpose_to_hilbert_integer(p, coord):

"""
Calculate hilbert distance for a single coord
Parameters
----------
p : The number of iterations used in constructing the Hilbert curve
coord : Array of coordinates
Returns
---------
Array of hilbert distances for a single coord
"""

n = len(coord)
bins = [_int_2_binary(v, p) for v in coord]
concat = np.zeros(n * p, dtype=np.uint8)
for i in range(p):
for j in range(n):
concat[n * i + j] = bins[j][i]

h = _binary_2_int(concat)
return h


@ngjit
def _int_2_binary(v, width):

"""
Convert an array of values from discrete int coordinates to binary byte
Parameters
----------
p : The number of iterations used in constructing the Hilbert curve
coord : Array of coordinates
Returns
---------
Binary byte
"""

res = np.zeros(width, dtype=np.uint8)
for i in range(width):
res[width - i - 1] = v % 2 # zero-passed to width
v = v >> 1
return res


@ngjit
def _binary_2_int(bin_vec):

"""
Convert binary byte to int
Parameters
----------
p : The number of iterations used in constructing the Hilbert curve
coord : Array of coordinates
Returns
---------
Discrete int
"""

res = 0
next_val = 1
width = len(bin_vec)
for i in range(width):
res += next_val * bin_vec[width - i - 1]
next_val <<= 1
return res
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"geopandas",
"dask>=2.18.0,!=2021.05.1",
"distributed>=2.18.0,!=2021.05.1",
"numba",
]


Expand Down
Loading

0 comments on commit 837f2a3

Please sign in to comment.