Merge pull request #188 from mdbartos/spatial_join

ENH: Optimized spatial join
geopandas · Apr 13, 2015 · 3ffef9e · 3ffef9e
2 parents cf4eb4d + 2af7022
commit 3ffef9e
Show file tree

Hide file tree

Showing 2 changed files with 122 additions and 54 deletions.
diff --git a/geopandas/tools/sjoin.py b/geopandas/tools/sjoin.py
@@ -1,62 +1,128 @@
+import numpy as np
 import pandas as pd
-from geopandas import GeoDataFrame
-from .overlay import _uniquify
+import rtree
+from shapely import prepared
+import geopandas as gpd
 
-def sjoin(left_df, right_df, how="left", op="intersects", use_sindex=True, **kwargs):
+def sjoin(left_df, right_df, how='inner', op='intersects',
+          lsuffix='left', rsuffix='right', **kwargs):
     """Spatial join of two GeoDataFrames.
 
     left_df, right_df are GeoDataFrames
     how: type of join
         left -> use keys from left_df; retain only left_df geometry column
         right -> use keys from right_df; retain only right_df geometry column
-        inner -> use intersection of keys from both dfs; retain only left_df geometry column
+        inner -> use intersection of keys from both dfs;
+                 retain only left_df geometry column
     op: binary predicate {'intersects', 'contains', 'within'}
         see http://toblerity.org/shapely/manual.html#binary-predicates
-    use_sindex : Use the spatial index to speed up operation? Default is True
-    kwargs: passed to op method
+    lsuffix: suffix to apply to overlapping column names (left GeoDataFrame)
+    rsuffix: suffix to apply to overlapping column names (right GeoDataFrame)
     """
+
+    # CHECK VALIDITY OF JOIN TYPE
     allowed_hows = ['left', 'right', 'inner']
 
     if how not in allowed_hows:
         raise ValueError("`how` was \"%s\" but is expected to be in %s" % \
             (how, allowed_hows))
+
+    # CHECK VALIDITY OF PREDICATE OPERATION
+    allowed_ops = ['contains', 'within', 'intersects']
+
+    if op not in allowed_ops:
+        raise ValueError("`op` was \"%s\" but is expected to be in %s" % \
+            (op, allowed_ops))
+
+    # IF WITHIN, SWAP NAMES
+    if op == "within":
+        # within implemented as the inverse of contains; swap names
+        left_df, right_df = right_df, left_df
+
+    # CONVERT CRS IF NOT EQUAL
+    if left_df.crs != right_df.crs:
+        print('Warning: CRS does not match!')
+
+    # CONSTRUCT SPATIAL INDEX FOR RIGHT DATAFRAME
+    tree_idx = rtree.index.Index()
+    right_df_bounds = right_df['geometry'].apply(lambda x: x.bounds)
+    for i in right_df_bounds.index:
+        tree_idx.insert(i, right_df_bounds[i])
+
+    # FIND INTERSECTION OF SPATIAL INDEX
+    idxmatch = (left_df['geometry'].apply(lambda x: x.bounds)
+                .apply(lambda x: list(tree_idx.intersection(x))))
+    idxmatch = idxmatch[idxmatch.str.len() > 0]
+
+    r_idx = np.concatenate(idxmatch.values)
+    l_idx = idxmatch.index.values.repeat(idxmatch.str.len().values) 
+
+    # VECTORIZE PREDICATE OPERATIONS
+    def find_intersects(a1, a2):
+        return a1.intersects(a2)
+
+    def find_contains(a1, a2):
+        return a1.contains(a2)
+
+    predicate_d = {'intersects': find_intersects,
+                   'contains': find_contains,
+                   'within': find_contains}
+
+    check_predicates = np.vectorize(predicate_d[op])
+
+    # CHECK PREDICATES
+    result = (
+              pd.DataFrame(
+                  np.column_stack(
+                      [l_idx,
+                       r_idx,
+                       check_predicates(
+                           left_df['geometry']
+                           .apply(lambda x: prepared.prep(x)).values[l_idx],
+                           right_df['geometry'].values[r_idx])
+                       ]))
+               )
+
+    result.columns = ['index_%s' % lsuffix, 'index_%s' % rsuffix, 'match_bool']
+    result = (
+              pd.DataFrame(result[result['match_bool']==1])
+              .drop('match_bool', axis=1)
+              )
 
-    if how == "right":
-        # right outer join just implemented as the inverse of left; swap names
+    # IF 'WITHIN', SWAP NAMES AGAIN
+    if op == "within":
+        # within implemented as the inverse of contains; swap names
         left_df, right_df = right_df, left_df
+        result = result.rename(columns={
+                    'index_%s' % (lsuffix): 'index_%s' % (rsuffix),
+                    'index_%s' % (rsuffix): 'index_%s' % (lsuffix)})
 
-    collection = []
-    for i, feat in left_df.iterrows():
-        geom = feat.geometry
-
-        if use_sindex and right_df.sindex:
-            candidates = [x.object for x in
-                           right_df.sindex.intersection(geom.bounds, objects=True)]
-        else:
-            candidates = [i for i, x in right_df.iterrows()]
-
-        feature_hits = 0
-        for cand_id in candidates:
-            candidate = right_df.ix[cand_id]
-            if getattr(geom, op)(candidate.geometry, **kwargs):
-                newseries = candidate.drop(right_df._geometry_column_name) 
-                newfeat = pd.concat([feat, newseries])
-                newfeat.index = _uniquify(newfeat.index)
-                collection.append(newfeat)
-                feature_hits += 1
-
-        # TODO Should we perform aggregation if feature_hit > 1?
-        # Advantage: single step and possible performance improvement
-        # Disadvantage: Pandas already has groupby so user can do this later
-
-        # If left does not spatially join with any right features,
-        # Fill in the right columns with NA
-        if how != 'inner' and feature_hits == 0:
-            empty = pd.Series(dict.fromkeys(right_df.columns, None))
-            empty.drop(right_df._geometry_column_name, inplace=True) 
-
-            newfeat = pd.concat([feat, empty])
-            newfeat.index = _uniquify(newfeat.index)
-            collection.append(newfeat)
-
-    return GeoDataFrame(collection, index=range(len(collection)))
+    # APPLY JOIN
+    if how == 'inner':
+        result = result.set_index('index_%s' % lsuffix)
+        return (
+                left_df
+                .merge(result, left_index=True, right_index=True)
+                .merge(right_df.drop('geometry', axis=1),
+                    left_on='index_%s' % rsuffix, right_index=True,
+                    suffixes=('_%s' % lsuffix, '_%s' % rsuffix))
+                )
+    elif how == 'left':
+        result = result.set_index('index_%s' % lsuffix)
+        return (
+                left_df
+                .merge(result, left_index=True, right_index=True, how='left')
+                .merge(right_df.drop('geometry', axis=1),
+                    how='left', left_on='index_%s' % rsuffix, right_index=True,
+                    suffixes=('_%s' % lsuffix, '_%s' % rsuffix))
+                )
+    elif how == 'right':
+        return (
+                left_df
+                .drop('geometry', axis=1)
+                .merge(result.merge(right_df,
+                    left_on='index_%s' % rsuffix, right_index=True,
+                    how='right'), left_index=True,
+                    right_on='index_%s' % lsuffix, how='right')
+                .set_index('index_%s' % rsuffix)
+                )
diff --git a/tests/test_sjoin.py b/tests/test_sjoin.py
@@ -1,6 +1,8 @@
+
 from __future__ import absolute_import
 import tempfile
 import shutil
+import numpy as np
 from shapely.geometry import Point
 from geopandas import GeoDataFrame, read_file
 from geopandas.tools import sjoin
@@ -25,8 +27,8 @@ def tearDown(self):
         shutil.rmtree(self.tempdir)
 
     def test_sjoin_left(self):
-        df = sjoin(self.pointdf, self.polydf)
-        self.assertEquals(df.shape, (21,7))
+        df = sjoin(self.pointdf, self.polydf, how='left')
+        self.assertEquals(df.shape, (21,8))
         for i, row in df.iterrows():
             self.assertEquals(row.geometry.type, 'Point')
         self.assertTrue('pointattr1' in df.columns)
@@ -36,7 +38,7 @@ def test_sjoin_right(self):
         # the inverse of left
         df = sjoin(self.pointdf, self.polydf, how="right")
         df2 = sjoin(self.polydf, self.pointdf, how="left")
-        self.assertEquals(df.shape, (12, 7))
+        self.assertEquals(df.shape, (12, 8))
         self.assertEquals(df.shape, df2.shape)
         for i, row in df.iterrows():
             self.assertEquals(row.geometry.type, 'MultiPolygon')
@@ -45,31 +47,31 @@ def test_sjoin_right(self):
 
     def test_sjoin_inner(self):
         df = sjoin(self.pointdf, self.polydf, how="inner")
-        self.assertEquals(df.shape, (11, 7))
+        self.assertEquals(df.shape, (11, 8))
 
     def test_sjoin_op(self):
         # points within polygons
         df = sjoin(self.pointdf, self.polydf, how="left", op="within")
-        self.assertEquals(df.shape, (21,7))
+        self.assertEquals(df.shape, (21,8))
         self.assertAlmostEquals(df.ix[1]['Shape_Leng'], 330454.175933)
 
         # points contain polygons? never happens so we should have nulls
         df = sjoin(self.pointdf, self.polydf, how="left", op="contains")
-        self.assertEquals(df.shape, (21, 7))
-        self.assertEquals(df.ix[1]['Shape_Area'], None)
+        self.assertEquals(df.shape, (21, 8))
+        self.assertTrue(np.isnan(df.ix[1]['Shape_Area']))
 
     def test_sjoin_bad_op(self):
         # AttributeError: 'Point' object has no attribute 'spandex'
-        self.assertRaises(AttributeError, sjoin,
+        self.assertRaises(ValueError, sjoin,
             self.pointdf, self.polydf, how="left", op="spandex")
 
     def test_sjoin_duplicate_column_name(self):
         pointdf2 = self.pointdf.rename(columns={'pointattr1': 'Shape_Area'})
         df = sjoin(pointdf2, self.polydf, how="left")
-        self.assertTrue('Shape_Area' in df.columns)
-        self.assertTrue('Shape_Area_2' in df.columns)
+        self.assertTrue('Shape_Area_left' in df.columns)
+        self.assertTrue('Shape_Area_right' in df.columns)
 
     @unittest.skip("Not implemented")
     def test_sjoin_outer(self):
         df = sjoin(self.pointdf, self.polydf, how="outer")
-        self.assertEquals(df.shape, (21,7))
+        self.assertEquals(df.shape, (21,8))