From 15c0000882bced2a151c0fe63b3c35ce9608a5c3 Mon Sep 17 00:00:00 2001
From: Philipp Kats <casyfill@gmail.com>
Date: Sun, 7 Aug 2016 11:21:38 -0400
Subject: [PATCH] separate spatial_index function, might be of use for
 "millions_to_hundreds" sjoin, where we split data into chunks and don't want
 to generate an rtree.Index for each chunk once again

---
 geopandas/tools/sjoin.py | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/geopandas/tools/sjoin.py b/geopandas/tools/sjoin.py
index 9dd16b9de9..af92507ba3 100644
--- a/geopandas/tools/sjoin.py
+++ b/geopandas/tools/sjoin.py
@@ -3,8 +3,29 @@
 from shapely import prepared
 
 
+def rtree_index(geom):
+    '''Build an rtree spatial index from the bounds of each geometry.
+
+    Parameters
+    ----------
+    geom : pd.Series of shapely geometry objects (each must expose `.bounds`)
+
+    Returns
+    -------
+    rtree.index.Index : entries are the bounds, with the Series labels as ids
+    '''
+    import rtree
+
+    tree_idx = rtree.index.Index()
+    geom_bounds = geom.apply(lambda x: x.bounds)
+    for i in geom_bounds.index:
+        tree_idx.insert(i, geom_bounds[i])
+
+    return tree_idx
+
+
 def sjoin(left_df, right_df, how='inner', op='intersects',
-          lsuffix='left', rsuffix='right'):
+          lsuffix='left', rsuffix='right', tree_idx=None):
     """Spatial join of two GeoDataFrames.
 
     Parameters
@@ -24,9 +45,9 @@ def sjoin(left_df, right_df, how='inner', op='intersects',
         Suffix to apply to overlapping column names (left GeoDataFrame).
     rsuffix : string, default 'right'
         Suffix to apply to overlapping column names (right GeoDataFrame).
+    tree_idx : rtree.index for right GeoDataFrame, if None, will be created.
 
     """
-    import rtree
 
     allowed_hows = ['left', 'right', 'inner']
     if how not in allowed_hows:
@@ -45,10 +66,9 @@ def sjoin(left_df, right_df, how='inner', op='intersects',
     if left_df.crs != right_df.crs:
         print('Warning: CRS does not match!')
 
-    tree_idx = rtree.index.Index()
-    right_df_bounds = right_df['geometry'].apply(lambda x: x.bounds)
-    for i in right_df_bounds.index:
-        tree_idx.insert(i, right_df_bounds[i])
+    if tree_idx is None:
+        tree_idx = rtree_index(right_df['geometry'])
+
 
     idxmatch = (left_df['geometry'].apply(lambda x: x.bounds)
                 .apply(lambda x: list(tree_idx.intersection(x))))