Skip to content

Commit

Permalink
Merge pull request #188 from mdbartos/spatial_join
Browse files Browse the repository at this point in the history
ENH: Optimized spatial join
  • Loading branch information
kjordahl committed Apr 13, 2015
2 parents cf4eb4d + 2af7022 commit 3ffef9e
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 54 deletions.
152 changes: 109 additions & 43 deletions geopandas/tools/sjoin.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,128 @@
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from .overlay import _uniquify
import rtree
from shapely import prepared
import geopandas as gpd

def sjoin(left_df, right_df, how="left", op="intersects", use_sindex=True, **kwargs):
def sjoin(left_df, right_df, how='inner', op='intersects',
lsuffix='left', rsuffix='right', **kwargs):
"""Spatial join of two GeoDataFrames.
left_df, right_df are GeoDataFrames
how: type of join
left -> use keys from left_df; retain only left_df geometry column
right -> use keys from right_df; retain only right_df geometry column
inner -> use intersection of keys from both dfs; retain only left_df geometry column
inner -> use intersection of keys from both dfs;
retain only left_df geometry column
op: binary predicate {'intersects', 'contains', 'within'}
see http://toblerity.org/shapely/manual.html#binary-predicates
use_sindex : Use the spatial index to speed up operation? Default is True
kwargs: passed to op method
lsuffix: suffix to apply to overlapping column names (left GeoDataFrame)
rsuffix: suffix to apply to overlapping column names (right GeoDataFrame)
"""

# CHECK VALIDITY OF JOIN TYPE
allowed_hows = ['left', 'right', 'inner']

if how not in allowed_hows:
raise ValueError("`how` was \"%s\" but is expected to be in %s" % \
(how, allowed_hows))

# CHECK VALIDITY OF PREDICATE OPERATION
allowed_ops = ['contains', 'within', 'intersects']

if op not in allowed_ops:
raise ValueError("`op` was \"%s\" but is expected to be in %s" % \
(op, allowed_ops))

# IF WITHIN, SWAP NAMES
if op == "within":
# within implemented as the inverse of contains; swap names
left_df, right_df = right_df, left_df

# CONVERT CRS IF NOT EQUAL
if left_df.crs != right_df.crs:
print('Warning: CRS does not match!')

# CONSTRUCT SPATIAL INDEX FOR RIGHT DATAFRAME
tree_idx = rtree.index.Index()
right_df_bounds = right_df['geometry'].apply(lambda x: x.bounds)
for i in right_df_bounds.index:
tree_idx.insert(i, right_df_bounds[i])

# FIND INTERSECTION OF SPATIAL INDEX
idxmatch = (left_df['geometry'].apply(lambda x: x.bounds)
.apply(lambda x: list(tree_idx.intersection(x))))
idxmatch = idxmatch[idxmatch.str.len() > 0]

r_idx = np.concatenate(idxmatch.values)
l_idx = idxmatch.index.values.repeat(idxmatch.str.len().values)

# VECTORIZE PREDICATE OPERATIONS
def find_intersects(a1, a2):
return a1.intersects(a2)

def find_contains(a1, a2):
return a1.contains(a2)

predicate_d = {'intersects': find_intersects,
'contains': find_contains,
'within': find_contains}

check_predicates = np.vectorize(predicate_d[op])

# CHECK PREDICATES
result = (
pd.DataFrame(
np.column_stack(
[l_idx,
r_idx,
check_predicates(
left_df['geometry']
.apply(lambda x: prepared.prep(x)).values[l_idx],
right_df['geometry'].values[r_idx])
]))
)

result.columns = ['index_%s' % lsuffix, 'index_%s' % rsuffix, 'match_bool']
result = (
pd.DataFrame(result[result['match_bool']==1])
.drop('match_bool', axis=1)
)

if how == "right":
# right outer join just implemented as the inverse of left; swap names
# IF 'WITHIN', SWAP NAMES AGAIN
if op == "within":
# within implemented as the inverse of contains; swap names
left_df, right_df = right_df, left_df
result = result.rename(columns={
'index_%s' % (lsuffix): 'index_%s' % (rsuffix),
'index_%s' % (rsuffix): 'index_%s' % (lsuffix)})

collection = []
for i, feat in left_df.iterrows():
geom = feat.geometry

if use_sindex and right_df.sindex:
candidates = [x.object for x in
right_df.sindex.intersection(geom.bounds, objects=True)]
else:
candidates = [i for i, x in right_df.iterrows()]

feature_hits = 0
for cand_id in candidates:
candidate = right_df.ix[cand_id]
if getattr(geom, op)(candidate.geometry, **kwargs):
newseries = candidate.drop(right_df._geometry_column_name)
newfeat = pd.concat([feat, newseries])
newfeat.index = _uniquify(newfeat.index)
collection.append(newfeat)
feature_hits += 1

# TODO Should we perform aggregation if feature_hit > 1?
# Advantage: single step and possible performance improvement
# Disadvantage: Pandas already has groupby so user can do this later

# If left does not spatially join with any right features,
# Fill in the right columns with NA
if how != 'inner' and feature_hits == 0:
empty = pd.Series(dict.fromkeys(right_df.columns, None))
empty.drop(right_df._geometry_column_name, inplace=True)

newfeat = pd.concat([feat, empty])
newfeat.index = _uniquify(newfeat.index)
collection.append(newfeat)

return GeoDataFrame(collection, index=range(len(collection)))
# APPLY JOIN
if how == 'inner':
result = result.set_index('index_%s' % lsuffix)
return (
left_df
.merge(result, left_index=True, right_index=True)
.merge(right_df.drop('geometry', axis=1),
left_on='index_%s' % rsuffix, right_index=True,
suffixes=('_%s' % lsuffix, '_%s' % rsuffix))
)
elif how == 'left':
result = result.set_index('index_%s' % lsuffix)
return (
left_df
.merge(result, left_index=True, right_index=True, how='left')
.merge(right_df.drop('geometry', axis=1),
how='left', left_on='index_%s' % rsuffix, right_index=True,
suffixes=('_%s' % lsuffix, '_%s' % rsuffix))
)
elif how == 'right':
return (
left_df
.drop('geometry', axis=1)
.merge(result.merge(right_df,
left_on='index_%s' % rsuffix, right_index=True,
how='right'), left_index=True,
right_on='index_%s' % lsuffix, how='right')
.set_index('index_%s' % rsuffix)
)
24 changes: 13 additions & 11 deletions tests/test_sjoin.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@

from __future__ import absolute_import
import tempfile
import shutil
import numpy as np
from shapely.geometry import Point
from geopandas import GeoDataFrame, read_file
from geopandas.tools import sjoin
Expand All @@ -25,8 +27,8 @@ def tearDown(self):
shutil.rmtree(self.tempdir)

def test_sjoin_left(self):
df = sjoin(self.pointdf, self.polydf)
self.assertEquals(df.shape, (21,7))
df = sjoin(self.pointdf, self.polydf, how='left')
self.assertEquals(df.shape, (21,8))
for i, row in df.iterrows():
self.assertEquals(row.geometry.type, 'Point')
self.assertTrue('pointattr1' in df.columns)
Expand All @@ -36,7 +38,7 @@ def test_sjoin_right(self):
# the inverse of left
df = sjoin(self.pointdf, self.polydf, how="right")
df2 = sjoin(self.polydf, self.pointdf, how="left")
self.assertEquals(df.shape, (12, 7))
self.assertEquals(df.shape, (12, 8))
self.assertEquals(df.shape, df2.shape)
for i, row in df.iterrows():
self.assertEquals(row.geometry.type, 'MultiPolygon')
Expand All @@ -45,31 +47,31 @@ def test_sjoin_right(self):

def test_sjoin_inner(self):
df = sjoin(self.pointdf, self.polydf, how="inner")
self.assertEquals(df.shape, (11, 7))
self.assertEquals(df.shape, (11, 8))

def test_sjoin_op(self):
# points within polygons
df = sjoin(self.pointdf, self.polydf, how="left", op="within")
self.assertEquals(df.shape, (21,7))
self.assertEquals(df.shape, (21,8))
self.assertAlmostEquals(df.ix[1]['Shape_Leng'], 330454.175933)

# points contain polygons? never happens so we should have nulls
df = sjoin(self.pointdf, self.polydf, how="left", op="contains")
self.assertEquals(df.shape, (21, 7))
self.assertEquals(df.ix[1]['Shape_Area'], None)
self.assertEquals(df.shape, (21, 8))
self.assertTrue(np.isnan(df.ix[1]['Shape_Area']))

def test_sjoin_bad_op(self):
# AttributeError: 'Point' object has no attribute 'spandex'
self.assertRaises(AttributeError, sjoin,
self.assertRaises(ValueError, sjoin,
self.pointdf, self.polydf, how="left", op="spandex")

def test_sjoin_duplicate_column_name(self):
pointdf2 = self.pointdf.rename(columns={'pointattr1': 'Shape_Area'})
df = sjoin(pointdf2, self.polydf, how="left")
self.assertTrue('Shape_Area' in df.columns)
self.assertTrue('Shape_Area_2' in df.columns)
self.assertTrue('Shape_Area_left' in df.columns)
self.assertTrue('Shape_Area_right' in df.columns)

@unittest.skip("Not implemented")
def test_sjoin_outer(self):
df = sjoin(self.pointdf, self.polydf, how="outer")
self.assertEquals(df.shape, (21,7))
self.assertEquals(df.shape, (21,8))

0 comments on commit 3ffef9e

Please sign in to comment.