From f58b5f46ade33a59f25bebf0523a4cfb716cd540 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Fri, 27 May 2016 13:04:40 -0700 Subject: [PATCH] dissolve bug fixes (#323) --- geopandas/geodataframe.py | 36 ++++++++++---------- {tests => geopandas/tests}/test_dissolve.py | 37 ++++++++++++++++----- 2 files changed, 46 insertions(+), 27 deletions(-) rename {tests => geopandas/tests}/test_dissolve.py (56%) diff --git a/geopandas/geodataframe.py b/geopandas/geodataframe.py index 12e24b9deb..8c9e5b38fc 100644 --- a/geopandas/geodataframe.py +++ b/geopandas/geodataframe.py @@ -449,9 +449,14 @@ def plot(self, *args, **kwargs): plot.__doc__ = plot_dataframe.__doc__ - def dissolve(self, by=None, aggfunc='first'): + def dissolve(self, by=None, aggfunc='first', as_index=True): """ Dissolve geometries within `groupby` into single observation. + This is accomplished by applying the `unary_union` method + to all geometries within a group. + + Observations associated with each `groupby` group will be aggregated + using the `aggfunc`. Parameters ---------- @@ -460,6 +465,8 @@ def dissolve(self, by=None, aggfunc='first'): aggfunc : function or string, default "first" Aggregation function for manipulation of data associated with each group. Passed to pandas `groupby.agg` method. + as_index : boolean, default True + If true, groupby columns become index of result. 
Returns ------- @@ -467,33 +474,26 @@ def dissolve(self, by=None, aggfunc='first'): """ # Process non-spatial component - data = self.drop(labels=self.geometry.name, axis=1).copy() + data = self.drop(labels=self.geometry.name, axis=1) aggregated_data = data.groupby(by=by).agg(aggfunc) # Process spatial component - groupby_plus_geometry_cols = [self.geometry.name] - groupby_plus_geometry_cols.append(by) - geometry = self[groupby_plus_geometry_cols].copy() - def merge_geometries(block): - merged_geom = block.unary_union + return merged_geom - new_index = block.drop(self.geometry.name, axis=1).iloc[0][by] - merged_w_index = GeoSeries(merged_geom, index=Index(Series(new_index),name=by), - name=self.geometry.name) - return merged_w_index - + g = self.groupby(by=by, group_keys=False)[self.geometry.name].agg(merge_geometries) - g = geometry.groupby(by=by, group_keys=False).apply(merge_geometries) - - aggregated_geometry = GeoDataFrame(g, - index=g.index, - geometry=self.geometry.name) + # Aggregate + aggregated_geometry = GeoDataFrame(g, geometry=self.geometry.name) # Recombine aggregated = aggregated_geometry.join(aggregated_data) - aggregated = aggregated.set_geometry(self.geometry.name) + + # Reset if requested + if not as_index: + aggregated = aggregated.reset_index() + return aggregated def _dataframe_set_geometry(self, col, drop=False, inplace=False, crs=None): diff --git a/tests/test_dissolve.py b/geopandas/tests/test_dissolve.py similarity index 56% rename from tests/test_dissolve.py rename to geopandas/tests/test_dissolve.py index 58c91ac115..45cc8774be 100644 --- a/tests/test_dissolve.py +++ b/geopandas/tests/test_dissolve.py @@ -8,6 +8,11 @@ from .util import unittest, download_nybb from pandas.util.testing import assert_frame_equal from pandas import Index +from distutils.version import LooseVersion +import pandas as pd + +pandas_0_15_problem = 'fails under pandas < 0.16 due to issue 324,'\ + 'not problem with dissolve.' 
class TestDataFrame(unittest.TestCase): @@ -28,7 +33,7 @@ def setUp(self): others = self.polydf.loc[0:2,] collapsed = [others.geometry.unary_union, manhattan_bronx.geometry.unary_union] - merged_shapes = GeoDataFrame({'myshapes': collapsed}, geometry='myshapes', + merged_shapes = GeoDataFrame({'myshapes': collapsed}, geometry='myshapes', index=Index([5,6], name='manhattan_bronx')) # Different expected results @@ -40,25 +45,39 @@ def setUp(self): self.mean['BoroCode'] = [4,1.5] + @unittest.skipIf(str(pd.__version__) < LooseVersion('0.16'), pandas_0_15_problem) def test_geom_dissolve(self): test = self.polydf.dissolve('manhattan_bronx') self.assertTrue(test.geometry.name == 'myshapes') self.assertTrue(test.geom_almost_equals(self.first).all()) + @unittest.skipIf(str(pd.__version__) < LooseVersion('0.16'), pandas_0_15_problem) def test_first_dissolve(self): test = self.polydf.dissolve('manhattan_bronx') - test = test.drop('myshapes', axis=1) - first = self.first.drop('myshapes', axis=1) - assert_frame_equal(first, test) + assert_frame_equal(self.first, test, check_column_type=False) + @unittest.skipIf(str(pd.__version__) < LooseVersion('0.16'), pandas_0_15_problem) def test_mean_dissolve(self): test = self.polydf.dissolve('manhattan_bronx', aggfunc='mean') - test = test.drop('myshapes', axis=1) - mean = self.mean.drop('myshapes', axis=1) - assert_frame_equal(mean, test) + assert_frame_equal(self.mean, test, check_column_type=False) test = self.polydf.dissolve('manhattan_bronx', aggfunc=np.mean) - test = test.drop('myshapes', axis=1) - assert_frame_equal(mean, test) + assert_frame_equal(self.mean, test, check_column_type=False) + + @unittest.skipIf(str(pd.__version__) < LooseVersion('0.16'), pandas_0_15_problem) + def test_multicolumn_dissolve(self): + multi = self.polydf.copy() + multi['dup_col'] = multi.manhattan_bronx + multi_test = multi.dissolve(['manhattan_bronx', 'dup_col'], aggfunc='first') + + first = self.first.copy() + first['dup_col'] = first.index + first 
= first.set_index([first.index, 'dup_col']) + assert_frame_equal(multi_test, first, check_column_type=False) + @unittest.skipIf(str(pd.__version__) < LooseVersion('0.16'), pandas_0_15_problem) + def test_reset_index(self): + test = self.polydf.dissolve('manhattan_bronx', as_index=False) + comparison = self.first.reset_index() + assert_frame_equal(comparison, test, check_column_type=False)