Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Implement Hilbert distance #70

Merged
merged 38 commits into from
Jul 22, 2021
Merged
Show file tree
Hide file tree
Changes from 34 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
78403c0
calculate hilbert distances for geopandas
tastatham Jul 7, 2021
825e31e
calculate hilbert distances for dask-geopandas
tastatham Jul 7, 2021
cb173d6
add test to check whether calculated hilbert distances match between …
tastatham Jul 7, 2021
f5d5d52
reformatted previous commits using black
tastatham Jul 7, 2021
bc6fd49
updated _hilbert_distance with no for loops
tastatham Jul 7, 2021
d0f8802
drop test_hilbert_distance tmp
tastatham Jul 12, 2021
bc9b632
add numba acceleration
tastatham Jul 12, 2021
ce52794
update hilbert_distance.py docstring
tastatham Jul 12, 2021
92849da
updated black to avoid failing
tastatham Jul 12, 2021
fc898fa
add numba to continuous env yml files
tastatham Jul 12, 2021
0fba9a8
updated hilbert_distance to lazily evaluate total_bounds
tastatham Jul 12, 2021
9cdb098
updated core.py docstring for hilbert_distance
tastatham Jul 12, 2021
251ae60
reformat calculating hilbert distance for numba & cleaner syntax
tastatham Jul 12, 2021
4511b94
remove bounds_to_numpy
tastatham Jul 13, 2021
1266357
change x_width to width
tastatham Jul 13, 2021
8971210
update docstring for _continuous_to_discrete
tastatham Jul 13, 2021
f7eb6d5
split _hilbert_distance for testing/debugging
tastatham Jul 14, 2021
91465f3
output pd.series instead of array
tastatham Jul 14, 2021
8737e55
add test for calc hilbert_distance
tastatham Jul 14, 2021
b86b5d2
add hilbert_curve to CI env & fix test error
tastatham Jul 14, 2021
947bf77
Update continuous_integration/envs/38-latest.yaml
tastatham Jul 14, 2021
c8fbd7f
update hilbert test & ci env
tastatham Jul 14, 2021
3fa1d01
Preserve original gdf index using hilbert_distance
tastatham Jul 15, 2021
d8a2d46
Update hilbert curve dependency in 39-dev.yaml
tastatham Jul 15, 2021
dbfd011
Use latest version of numba
tastatham Jul 15, 2021
7d750a2
Use latest version of numba
tastatham Jul 15, 2021
a9cf761
Use latest version of numba
tastatham Jul 15, 2021
922addd
Explicitly call x & y mids in _continuous_to_discrete_coords
tastatham Jul 15, 2021
e193f87
Update dask_geopandas/hilbert_distance.py
tastatham Jul 15, 2021
aa47fdd
Drop unnecessary () from pytest.fixture in test_core.py
tastatham Jul 15, 2021
cdf9458
call total bounds and x&y mids in _continuous_discrete
tastatham Jul 15, 2021
98780cd
merge _hilbert_distance and _calculate_hilbert_distance
tastatham Jul 15, 2021
1e1c196
Use latest version of numba
tastatham Jul 15, 2021
8a8b114
add assert statement to check whether gdf indexes are equal
tastatham Jul 15, 2021
7c19310
Merge remote-tracking branch 'origin/master' into hilbert_distance
tastatham Jul 16, 2021
7ae61ca
add numba to requirements
tastatham Jul 16, 2021
bc41634
update docstring
tastatham Jul 21, 2021
e88f940
Update setup.py
martinfleis Jul 22, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
repos:
- repo: https://github.com/psf/black
rev: stable
rev: 21.6b0
hooks:
- id: black
language_version: python3

- repo: https://gitlab.com/pycqa/flake8
rev: 3.7.9
rev: 3.9.2
tastatham marked this conversation as resolved.
Show resolved Hide resolved
hooks:
- id: flake8
language_version: python3
2 changes: 2 additions & 0 deletions continuous_integration/envs/37-latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ dependencies:
# required dependencies
- python=3.7
- dask=2.18
- numba
- distributed
- geopandas
- pygeos
- pyproj=2.6 # unpin when pyproj 3.1 is released with multithreading fix
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
2 changes: 2 additions & 0 deletions continuous_integration/envs/38-latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ dependencies:
# required dependencies
- python=3.8
- dask=2021.05.0
- numba
- distributed
- geopandas
- pygeos
- pyproj=2.6 # unpin when pyproj 3.1 is released with multithreading fix
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
# optional dependencies
- pyarrow
2 changes: 2 additions & 0 deletions continuous_integration/envs/39-dev.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ channels:
dependencies:
# required dependencies
- python=3.9
- numba
- distributed
- pandas
- shapely
Expand All @@ -13,6 +14,7 @@ dependencies:
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
# optional dependencies
- pyarrow
- pip
Expand Down
2 changes: 2 additions & 0 deletions continuous_integration/envs/39-latest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ dependencies:
# required dependencies
- python=3.9
- dask
- numba
- distributed
- geopandas
- pygeos
- pyproj=2.6 # unpin when pyproj 3.1 is released with multithreading fix
# test dependencies
- pytest
- pytest-cov
- hilbertcurve
# optional dependencies
- pyarrow
29 changes: 29 additions & 0 deletions dask_geopandas/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import geopandas
from shapely.geometry.base import BaseGeometry
from shapely.geometry import box
from .hilbert_distance import _hilbert_distance


def _set_crs(df, crs, allow_override):
Expand Down Expand Up @@ -306,6 +307,34 @@ def explode(self):
def cx(self):
return _CoordinateIndexer(self)

def hilbert_distance(self, p=15):

"""
A function that calculates hilbert distance for each geometry
in each partition of a Dask-GeoDataFrame
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a sentence explaining what the Hilbert distance is and why it is useful? So that user does not have to google it to understand what the function does.


Parameters
----------
p : Hilbert curve parameter
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to add some explanation of what this is, how it influences the result, and some guidance on whether you should set this or not


Returns
----------
Distances for each partition
"""

# Compute total bounds of all partitions rather than each partition
total_bounds = self.total_bounds
martinfleis marked this conversation as resolved.
Show resolved Hide resolved

# Calculate hilbert distances for each partition
distances = self.map_partitions(
_hilbert_distance,
total_bounds=total_bounds,
p=p,
meta=pd.Series([], name="hilbert_distance", dtype="int"),
)

return distances


class GeoSeries(_Frame, dd.core.Series):
_partition_type = geopandas.GeoSeries
Expand Down
256 changes: 256 additions & 0 deletions dask_geopandas/hilbert_distance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
import numpy as np
import pandas as pd
from numba import jit

ngjit = jit(nopython=True, nogil=True)

# Based on: https://github.com/holoviz/spatialpandas/blob/
# 9252a7aba5f8bc7a435fffa2c31018af8d92942c/spatialpandas/dask.py


def _hilbert_distance(gdf, total_bounds, p):

"""
Calculate hilbert distance for a GeoDataFrame
int coordinates

Parameters
----------
gdf : GeoDataFrame

total_bounds : Total bounds of geometries - array

p : Hilbert curve parameter

Returns
---------
Pandas Series containing hilbert distances
"""

# Compute bounds as array
bounds = gdf.bounds.to_numpy()
martinfleis marked this conversation as resolved.
Show resolved Hide resolved
# Compute hilbert distances
coords = _continuous_to_discrete_coords(total_bounds, bounds, p)
distances = _distances_from_coordinates(p, coords)

return pd.Series(distances, index=gdf.index, name="hilbert_distance")


@ngjit
def _continuous_to_discrete_coords(total_bounds, bounds, p):

"""
Calculates mid points & ranges of geoms and returns
as discrete coords

Parameters
----------

total_bounds : Total bounds of geometries - array

bounds : Bounds of each geometry - array

p : Hilbert curve parameter

Returns
---------
Discrete two-dimensional numpy array
Two-dimensional array Array of hilbert distances for each geom
"""

# Hilbert Side len
side_length = 2 ** p

# Calculate x and y range of total bound coords - returns array
xmin, ymin, xmax, ymax = total_bounds

# Calculate mid points for x and y bound coords - returns array
x_mids = (bounds[:, 0] + bounds[:, 2]) / 2.0
y_mids = (bounds[:, 1] + bounds[:, 3]) / 2.0

# Transform continuous int to discrete int for each dimension
x_int = _continuous_to_discrete(x_mids, (xmin, xmax), side_length)
y_int = _continuous_to_discrete(y_mids, (ymin, ymax), side_length)
# Stack x and y discrete ints
coords = np.stack((x_int, y_int), axis=1)

return coords


@ngjit
def _continuous_to_discrete(vals, val_range, n):

"""
Convert a continuous one-dimensional array to discrete int
based on values and their ranges

Parameters
----------
vals : Array of continuous values

val_range : Tuple containing range of continuous values

n : Number of discrete values

Returns
---------
One-dimensional array of discrete ints
"""

width = val_range[1] - val_range[0]
res = ((vals - val_range[0]) * (n / width)).astype(np.int64)

# TO DO: When numba 0.54 releases - used.clip(res, min=0, max=n, out=res)
martinfleis marked this conversation as resolved.
Show resolved Hide resolved
# clip
res[res < 0] = 0
res[res > n - 1] = n - 1

return res


@ngjit
def _distances_from_coordinates(p, coords):
tastatham marked this conversation as resolved.
Show resolved Hide resolved

"""
Calculate hilbert distance for a set of coords

Parameters
----------
p : Hilbert curve param

coords : Array of coordinates

Returns
---------
Array of hilbert distances for each geom
"""

result = np.zeros(coords.shape[0], dtype=np.int64)
# For each coord calculate hilbert distance
for i in range(coords.shape[0]):
coord = coords[i, :]
result[i] = _distance_from_coordinate(p, coord)
return result


@ngjit
def _distance_from_coordinate(p, coord):

"""
Calculate hilbert distance for a single coord

Parameters
----------
p : Hilbert curve param

coord : Array of coordinates

Returns
---------
Array of hilbert distances for a single coord
"""

n = len(coord)
M = 1 << (p - 1)
Q = M
while Q > 1:
P = Q - 1
for i in range(n):
if coord[i] & Q:
coord[0] ^= P
else:
t = (coord[0] ^ coord[i]) & P
coord[0] ^= t
coord[i] ^= t
Q >>= 1
# Gray encode
for i in range(1, n):
coord[i] ^= coord[i - 1]
t = 0
Q = M
while Q > 1:
if coord[n - 1] & Q:
t ^= Q - 1
Q >>= 1
for i in range(n):
coord[i] ^= t
h = _transpose_to_hilbert_integer(p, coord)
return h


@ngjit
def _transpose_to_hilbert_integer(p, coord):

"""
Calculate hilbert distance for a single coord

Parameters
----------
p : Hilbert curve param

coord : Array of coordinates

Returns
---------
Array of hilbert distances for a single coord
"""

n = len(coord)
bins = [_int_2_binary(v, p) for v in coord]
concat = np.zeros(n * p, dtype=np.uint8)
for i in range(p):
for j in range(n):
concat[n * i + j] = bins[j][i]

h = _binary_2_int(concat)
return h


@ngjit
def _int_2_binary(v, width):

"""
Convert an array of values from discrete int coordinates to binary byte

Parameters
----------
p : Hilbert curve param

coord : Array of coordinates

Returns
---------
Binary byte
"""

res = np.zeros(width, dtype=np.uint8)
for i in range(width):
res[width - i - 1] = v % 2 # zero-passed to width
v = v >> 1
return res


@ngjit
def _binary_2_int(bin_vec):

"""
Convert binary byte to int

Parameters
----------
p : Hilbert curve param

coord : Array of coordinates

Returns
---------
Discrete int
"""

res = 0
next_val = 1
width = len(bin_vec)
for i in range(width):
res += next_val * bin_vec[width - i - 1]
next_val <<= 1
return res