-
Notifications
You must be signed in to change notification settings - Fork 905
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #188 from mdbartos/spatial_join
ENH: Optimized spatial join
- Loading branch information
Showing
2 changed files
with
122 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,62 +1,128 @@ | ||
import numpy as np | ||
import pandas as pd | ||
from geopandas import GeoDataFrame | ||
from .overlay import _uniquify | ||
import rtree | ||
from shapely import prepared | ||
import geopandas as gpd | ||
|
||
def sjoin(left_df, right_df, how="left", op="intersects", use_sindex=True, **kwargs): | ||
def sjoin(left_df, right_df, how='inner', op='intersects', | ||
lsuffix='left', rsuffix='right', **kwargs): | ||
"""Spatial join of two GeoDataFrames. | ||
left_df, right_df are GeoDataFrames | ||
how: type of join | ||
left -> use keys from left_df; retain only left_df geometry column | ||
right -> use keys from right_df; retain only right_df geometry column | ||
inner -> use intersection of keys from both dfs; retain only left_df geometry column | ||
inner -> use intersection of keys from both dfs; | ||
retain only left_df geometry column | ||
op: binary predicate {'intersects', 'contains', 'within'} | ||
see http://toblerity.org/shapely/manual.html#binary-predicates | ||
use_sindex : Use the spatial index to speed up operation? Default is True | ||
kwargs: passed to op method | ||
lsuffix: suffix to apply to overlapping column names (left GeoDataFrame) | ||
rsuffix: suffix to apply to overlapping column names (right GeoDataFrame) | ||
""" | ||
|
||
# CHECK VALIDITY OF JOIN TYPE | ||
allowed_hows = ['left', 'right', 'inner'] | ||
|
||
if how not in allowed_hows: | ||
raise ValueError("`how` was \"%s\" but is expected to be in %s" % \ | ||
(how, allowed_hows)) | ||
|
||
# CHECK VALIDITY OF PREDICATE OPERATION | ||
allowed_ops = ['contains', 'within', 'intersects'] | ||
|
||
if op not in allowed_ops: | ||
raise ValueError("`op` was \"%s\" but is expected to be in %s" % \ | ||
(op, allowed_ops)) | ||
|
||
# IF WITHIN, SWAP NAMES | ||
if op == "within": | ||
# within implemented as the inverse of contains; swap names | ||
left_df, right_df = right_df, left_df | ||
|
||
# CONVERT CRS IF NOT EQUAL | ||
if left_df.crs != right_df.crs: | ||
print('Warning: CRS does not match!') | ||
|
||
# CONSTRUCT SPATIAL INDEX FOR RIGHT DATAFRAME | ||
tree_idx = rtree.index.Index() | ||
right_df_bounds = right_df['geometry'].apply(lambda x: x.bounds) | ||
for i in right_df_bounds.index: | ||
tree_idx.insert(i, right_df_bounds[i]) | ||
|
||
# FIND INTERSECTION OF SPATIAL INDEX | ||
idxmatch = (left_df['geometry'].apply(lambda x: x.bounds) | ||
.apply(lambda x: list(tree_idx.intersection(x)))) | ||
idxmatch = idxmatch[idxmatch.str.len() > 0] | ||
|
||
r_idx = np.concatenate(idxmatch.values) | ||
l_idx = idxmatch.index.values.repeat(idxmatch.str.len().values) | ||
|
||
# VECTORIZE PREDICATE OPERATIONS | ||
def find_intersects(a1, a2): | ||
return a1.intersects(a2) | ||
|
||
def find_contains(a1, a2): | ||
return a1.contains(a2) | ||
|
||
predicate_d = {'intersects': find_intersects, | ||
'contains': find_contains, | ||
'within': find_contains} | ||
|
||
check_predicates = np.vectorize(predicate_d[op]) | ||
|
||
# CHECK PREDICATES | ||
result = ( | ||
pd.DataFrame( | ||
np.column_stack( | ||
[l_idx, | ||
r_idx, | ||
check_predicates( | ||
left_df['geometry'] | ||
.apply(lambda x: prepared.prep(x)).values[l_idx], | ||
right_df['geometry'].values[r_idx]) | ||
])) | ||
) | ||
|
||
result.columns = ['index_%s' % lsuffix, 'index_%s' % rsuffix, 'match_bool'] | ||
result = ( | ||
pd.DataFrame(result[result['match_bool']==1]) | ||
.drop('match_bool', axis=1) | ||
) | ||
|
||
if how == "right": | ||
# right outer join just implemented as the inverse of left; swap names | ||
# IF 'WITHIN', SWAP NAMES AGAIN | ||
if op == "within": | ||
# within implemented as the inverse of contains; swap names | ||
left_df, right_df = right_df, left_df | ||
result = result.rename(columns={ | ||
'index_%s' % (lsuffix): 'index_%s' % (rsuffix), | ||
'index_%s' % (rsuffix): 'index_%s' % (lsuffix)}) | ||
|
||
collection = [] | ||
for i, feat in left_df.iterrows(): | ||
geom = feat.geometry | ||
|
||
if use_sindex and right_df.sindex: | ||
candidates = [x.object for x in | ||
right_df.sindex.intersection(geom.bounds, objects=True)] | ||
else: | ||
candidates = [i for i, x in right_df.iterrows()] | ||
|
||
feature_hits = 0 | ||
for cand_id in candidates: | ||
candidate = right_df.ix[cand_id] | ||
if getattr(geom, op)(candidate.geometry, **kwargs): | ||
newseries = candidate.drop(right_df._geometry_column_name) | ||
newfeat = pd.concat([feat, newseries]) | ||
newfeat.index = _uniquify(newfeat.index) | ||
collection.append(newfeat) | ||
feature_hits += 1 | ||
|
||
# TODO Should we perform aggregation if feature_hit > 1? | ||
# Advantage: single step and possible performance improvement | ||
# Disadvantage: Pandas already has groupby so user can do this later | ||
|
||
# If left does not spatially join with any right features, | ||
# Fill in the right columns with NA | ||
if how != 'inner' and feature_hits == 0: | ||
empty = pd.Series(dict.fromkeys(right_df.columns, None)) | ||
empty.drop(right_df._geometry_column_name, inplace=True) | ||
|
||
newfeat = pd.concat([feat, empty]) | ||
newfeat.index = _uniquify(newfeat.index) | ||
collection.append(newfeat) | ||
|
||
return GeoDataFrame(collection, index=range(len(collection))) | ||
# APPLY JOIN | ||
if how == 'inner': | ||
result = result.set_index('index_%s' % lsuffix) | ||
return ( | ||
left_df | ||
.merge(result, left_index=True, right_index=True) | ||
.merge(right_df.drop('geometry', axis=1), | ||
left_on='index_%s' % rsuffix, right_index=True, | ||
suffixes=('_%s' % lsuffix, '_%s' % rsuffix)) | ||
) | ||
elif how == 'left': | ||
result = result.set_index('index_%s' % lsuffix) | ||
return ( | ||
left_df | ||
.merge(result, left_index=True, right_index=True, how='left') | ||
.merge(right_df.drop('geometry', axis=1), | ||
how='left', left_on='index_%s' % rsuffix, right_index=True, | ||
suffixes=('_%s' % lsuffix, '_%s' % rsuffix)) | ||
) | ||
elif how == 'right': | ||
return ( | ||
left_df | ||
.drop('geometry', axis=1) | ||
.merge(result.merge(right_df, | ||
left_on='index_%s' % rsuffix, right_index=True, | ||
how='right'), left_index=True, | ||
right_on='index_%s' % lsuffix, how='right') | ||
.set_index('index_%s' % rsuffix) | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters