Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add spatial-join algorithm #475

Merged
merged 14 commits into from Aug 20, 2017
110 changes: 110 additions & 0 deletions geopandas/algos.c
@@ -0,0 +1,110 @@

#include <geos_c.h>
#include "kvec.h"
#include "algos.h"
#include <stdio.h>
#include <time.h>
#include <stdbool.h>

typedef char (*GEOSPreparedPredicate)(GEOSContextHandle_t handle, const GEOSPreparedGeometry *left, const GEOSGeometry *right);


/* Callback to give to strtree_query
* Given the value returned from each intersecting geometry it inserts that
* value (typically an index) into the given size_vector */
void strtree_query_callback(void *item, void *vec)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add some more comments/docstrings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added some more docstrings. Please feel free to point out further areas where this can be improved.

{
kv_push(size_t, *((size_vector*) vec), (size_t) item);
}


/* Create STRTree spatial index from an array of Geometries */
GEOSSTRtree *create_index(GEOSContextHandle_t handle, GEOSGeometry **geoms, size_t n)
{
int i;
GEOSSTRtree* tree = GEOSSTRtree_create_r(handle, n);

for (i = 0; i < n ; i++)
{
if (geoms[i] != NULL)
{
GEOSSTRtree_insert_r(handle, tree, geoms[i], (void*) i);
}
}
return tree;
}


/* Spatial join of two arrays of geometries over the predicate
*
* This places all right-side geometries into an STRTree spatial index
* And then iterates through the left side and compares them against the index
* This produces a rough intersection of all geometry pairs that might interact
* Then we filter those pairs by the more precise spatial predicate
* like intersects, contains, covers, etc..
* This returns an array of indices in each side that match
* Organized in a [left_0, right_0, left_1, right_1, ... ] order
*/
size_vector sjoin(GEOSContextHandle_t handle,
GEOSPreparedPredicate predicate,
GEOSGeometry **left, size_t nleft,
GEOSGeometry **right, size_t nright)
{
// clock_t begin, begin_1, begin_2;
// clock_t end, end_1, end_2;
// double time_spent = 0, time_spent_1 = 0, time_spent_2 = 0;

size_t l, r; // indices for left and right sides
GEOSPreparedGeometry* prepared; // Temporary prepared geometry for right side
size_vector out; // Resizable output array of matching indices
size_vector vec; // Temporary array for matches for each geometry
kv_init(out);
kv_init(vec);

// begin = clock();

GEOSSTRtree* tree = create_index(handle, right, nright);

// end = clock();
// time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
// printf("Create tree, %f\n", time_spent);
// begin = end;

for (l = 0; l < nleft; l++)
{
if (left[l] != NULL)
{
// begin_1 = clock();
// Find all geometries of the right side that are close. Store in vec
GEOSSTRtree_query_r(handle, tree, left[l], strtree_query_callback, &vec);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this stores indices of all intersecting geoms of the right array that intersect with left[l] in temporary array vec, correct?

// end_1 = clock();
// time_spent_1 += (double)(end_1 - begin_1) / CLOCKS_PER_SEC;

// Prepare left side for fine-grained predicate
prepared = GEOSPrepare_r(handle, left[l]);

// begin_2 = clock();
// Iterate over vec and compare with predicate
while (vec.n)
{
r = kv_pop(vec);
if (predicate(handle, prepared, right[r])){
kv_push(size_t, out, l);
kv_push(size_t, out, r);
}
}
GEOSPreparedGeom_destroy_r(handle, prepared);
// end_2 = clock();
// time_spent_2 += (double)(end_2 - begin_2) / CLOCKS_PER_SEC;
}
}
// end = clock();
// time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
// printf("Intersect, %f\n", time_spent);
// printf("query, %f\n", time_spent_1);
// printf("intersect, %f\n", time_spent_2);

GEOSSTRtree_destroy_r(handle, tree);
kv_destroy(vec);
return out;
}
26 changes: 26 additions & 0 deletions geopandas/algos.h
@@ -0,0 +1,26 @@
#include <geos_c.h>
#include "kvec.h"

#ifndef GEOPANDAS_ALGOS_C
#define GEOPANDAS_ALGOS_C

typedef char (*GEOSPredicate)(GEOSContextHandle_t handle, const GEOSGeometry *left, const GEOSGeometry *right);
typedef char (*GEOSPreparedPredicate)(GEOSContextHandle_t handle, const GEOSPreparedGeometry *left, const GEOSGeometry *right);

/* A resizable vector
* This is the same as kvec_t(size_t)
*/
typedef struct
{
size_t n, m;
size_t *a;
} size_vector;

void sjoin_callback(void *item, void *vec);

size_vector sjoin(GEOSContextHandle_t handle,
GEOSPreparedPredicate predicate,
GEOSGeometry **left, size_t nleft,
GEOSGeometry **right, size_t nright);

#endif
2 changes: 1 addition & 1 deletion geopandas/geodataframe.py
Expand Up @@ -83,7 +83,7 @@ def __init__(self, *args, **kwargs):
if isinstance(v, GeoSeries):
if 'columns' in kwargs:
columns = kwargs['columns'].copy()
columns.pop(i)
columns.remove(k)
kwargs['columns'] = columns
gs[k] = arg.pop(k)
kwargs['index'] = v.index # TODO: assumes consistent index
Expand Down
92 changes: 92 additions & 0 deletions geopandas/kvec.h
@@ -0,0 +1,92 @@
/* This contains a standalone resizable array implementation in C */

/* The MIT License
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you put at the top of this file a short note about why this file is includes (what it provides (implementation of resizeable vector) and why we need it (used to store matching indices in the spatial join implementation in algos.c)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a small note to the top of kvec.h


Copyright (c) 2008, by Attractive Chaos <attractor@live.co.uk>

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/

/*
An example:

#include "kvec.h"
int main() {
kvec_t(int) array;
kv_init(array);
kv_push(int, array, 10); // append
kv_a(int, array, 20) = 5; // dynamic
kv_A(array, 20) = 4; // static
kv_destroy(array);
return 0;
}
*/

/*
2008-09-22 (0.1.0):

* The initial version.

*/

#ifndef AC_KVEC_H
#define AC_KVEC_H

#include <stdlib.h>

#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))

#define kvec_t(type) struct { size_t n, m; type *a; }
#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0)
#define kv_destroy(v) free((v).a)
#define kv_A(v, i) ((v).a[(i)])
#define kv_pop(v) ((v).a[--(v).n])
#define kv_size(v) ((v).n)
#define kv_max(v) ((v).m)

#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m))

#define kv_copy(type, v1, v0) do { \
if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \
(v1).n = (v0).n; \
memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \
} while (0) \

#define kv_push(type, v, x) do { \
if ((v).n == (v).m) { \
(v).m = (v).m? (v).m<<1 : 2; \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \
} \
(v).a[(v).n++] = (x); \
} while (0)

#define kv_pushp(type, v) (((v).n == (v).m)? \
((v).m = ((v).m? (v).m<<1 : 2), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: 0), ((v).a + ((v).n++))

#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \
((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \
(v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \
: (v).n <= (size_t)(i)? (v).n = (i) + 1 \
: 0), (v).a[(i)])

#endif
12 changes: 12 additions & 0 deletions geopandas/tests/test_geodataframe.py
Expand Up @@ -520,3 +520,15 @@ def test_set_geometry_null():
def test_constructor_without_geometries():
gdf = GeoDataFrame({'x': [1]})
assert list(gdf.x) == [1]


def test_constructor_column_ordering():
geoms = [Point(1, 1), Point(2, 2), Point(3, 3)]
gs = GeoSeries(geoms)
gdf = GeoDataFrame({'a': [1, 2, 3], 'geometry': gs},
columns=['geometry', 'a'],
index=pd.Index([0, 0, 1]),
geometry='geometry')

assert gdf.geometry._geometry_array is gs._geometry_array
assert gdf.geometry._geometry_array.data.all()
29 changes: 28 additions & 1 deletion geopandas/tests/test_vectorized.py
Expand Up @@ -3,7 +3,7 @@
import random
import shapely
from geopandas.vectorized import (GeometryArray, points_from_xy,
from_shapely, serialize, deserialize)
from_shapely, serialize, deserialize, cysjoin)
from shapely.geometry.base import (CAP_STYLE, JOIN_STYLE)

import pytest
Expand Down Expand Up @@ -333,6 +333,33 @@ def test_pickle():
assert T.equals(T2).all()


@pytest.mark.parametrize('predicate', [
'contains',
'covers',
'crosses',
'disjoint',
'intersects',
'overlaps',
'touches',
'within',
])
def test_sjoin(predicate):
left, right = cysjoin(T.data, P.data, predicate)

assert isinstance(left, np.ndarray)
assert isinstance(right, np.ndarray)
assert left.dtype == T.data.dtype
assert right.dtype == T.data.dtype
assert left.shape == right.shape
(n,) = left.shape
assert n < (len(T) * len(P))

for i, j in zip(left, right):
left = triangles[i]
right = points[j]
assert getattr(left, predicate)(right)


def test_raise_on_bad_sizes():
with pytest.raises(ValueError) as info:
T.contains(P)
Expand Down