From e9ec04f8ca008a5d6b6808711585c1ae10b8e643 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Wed, 19 Aug 2020 15:18:17 +0530 Subject: [PATCH 1/5] DataFrame.agg() with single non-list aggregation name now returns series --- eland/dataframe.py | 9 ++++++--- eland/tests/dataframe/test_aggs_pytest.py | 23 ++++++++++++++++++++++- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index 875cdb1c..ee7178c6 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1386,9 +1386,12 @@ def aggregate(self, func, axis=0, *args, **kwargs): # ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] if isinstance(func, str): - # wrap in list - func = [func] - return self._query_compiler.aggs(func) + try: + return getattr(self, func)() + except AttributeError: + # If all are implemented, this has to be changed to + # print(f"{func} is invalid operation") + print(f"{func} is not implemented/invalid") elif is_list_like(func): # we have a list! return self._query_compiler.aggs(func) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 2f1546f3..7d404602 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -18,7 +18,7 @@ # File called _pytest for PyCharm compatability import numpy as np -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal from eland.tests.common import TestData @@ -94,3 +94,24 @@ def test_aggs_median_var(self): # TODO - investigate this more pd_aggs = pd_aggs.astype("float64") assert_frame_equal(pd_aggs, ed_aggs, check_exact=False, check_less_precise=2) + + # If Aggregate is given a string and series is returned. + def test_terms_aggs_series(self): + pd_flights = self.pd_flights() + ed_flights = self.ed_flights() + + pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) + + # Eland returns all float values for all metric aggs, pandas can return int + # TODO - investigate this more + pd_sum_min = pd_sum_min.astype("float64") + assert_frame_equal(pd_sum_min, ed_sum_min, check_exact=False) + + pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg("mean") + ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg("mean") + + print(pd_sum_min_std.dtypes) + print(ed_sum_min_std.dtypes) + + assert_series_equal(pd_sum_min_std, ed_sum_min_std) From cf5779c03221b5e3fa550d10da8b541a1062f097 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Wed, 19 Aug 2020 16:32:45 +0530 Subject: [PATCH 2/5] Refactor and added test --- eland/dataframe.py | 4 +++- eland/tests/dataframe/test_aggs_pytest.py | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index ee7178c6..1cc8c8c7 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1387,11 +1387,13 @@ def aggregate(self, func, axis=0, *args, **kwargs): # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] if isinstance(func, str): try: - return getattr(self, func)() + result = getattr(self, func)() except AttributeError: # If all are implemented, this has to be changed to # print(f"{func} is invalid operation") print(f"{func} is not implemented/invalid") + raise + return result elif is_list_like(func): # we have a list! return self._query_compiler.aggs(func) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 7d404602..5aa8e08d 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -19,7 +19,7 @@ import numpy as np from pandas.testing import assert_frame_equal, assert_series_equal - +import pytest from eland.tests.common import TestData @@ -115,3 +115,11 @@ def test_terms_aggs_series(self): print(ed_sum_min_std.dtypes) assert_series_equal(pd_sum_min_std, ed_sum_min_std) + + # If Wrong Aggregate value is given. + def test_terms_wrongaggs(self): + ed_flights = self.ed_flights() + + match = "'DataFrame' object has no attribute 'abc'" + with pytest.raises(AttributeError, match=match): + ed_flights.select_dtypes(include=[np.number]).agg("abc") From 24172cf6c91440a822d45575704d721f30c5c347 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Thu, 20 Aug 2020 16:52:57 +0530 Subject: [PATCH 3/5] Update logic --- .gitignore | 1 + eland/dataframe.py | 10 ++++------ eland/tests/dataframe/test_aggs_pytest.py | 15 ++------------- 3 files changed, 7 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index f149ff4f..8c004149 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,4 @@ venv/ ENV/ env.bak/ venv.bak/ +.mypy_cache \ No newline at end of file diff --git a/eland/dataframe.py b/eland/dataframe.py index 1cc8c8c7..69265bc9 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1386,14 +1386,12 @@ def aggregate(self, func, axis=0, *args, **kwargs): # ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] if isinstance(func, str): - try: - result = getattr(self, func)() - except AttributeError: + if hasattr(self, func): # If all are implemented, this has to be changed to # print(f"{func} is invalid operation") - print(f"{func} is not implemented/invalid") - raise - return result + return getattr(self, func)() + else: + raise AttributeError(f"{func} is not implemented/invalid") elif is_list_like(func): # we have a list! return self._query_compiler.aggs(func) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 5aa8e08d..9a3bad66 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -100,26 +100,15 @@ def test_terms_aggs_series(self): pd_flights = self.pd_flights() ed_flights = self.ed_flights() - pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) - ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["sum", "min"]) - - # Eland returns all float values for all metric aggs, pandas can return int - # TODO - investigate this more - pd_sum_min = pd_sum_min.astype("float64") - assert_frame_equal(pd_sum_min, ed_sum_min, check_exact=False) - pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg("mean") ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg("mean") - print(pd_sum_min_std.dtypes) - print(ed_sum_min_std.dtypes) - assert_series_equal(pd_sum_min_std, ed_sum_min_std) # If Wrong Aggregate value is given. def test_terms_wrongaggs(self): - ed_flights = self.ed_flights() + ed_flights = self.ed_flights()[["FlightDelayMin"]] - match = "'DataFrame' object has no attribute 'abc'" + match = "abc is not implemented/invalid" with pytest.raises(AttributeError, match=match): ed_flights.select_dtypes(include=[np.number]).agg("abc") From d777fe01c1875b1eebe03febc5eab97d6229c3ef Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Tue, 25 Aug 2020 21:29:13 +0530 Subject: [PATCH 4/5] Requested changes --- .gitignore | 2 +- eland/dataframe.py | 10 ++++------ eland/tests/dataframe/test_aggs_pytest.py | 11 ++++++----- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/.gitignore b/.gitignore index 8c004149..2a08cbfd 100644 --- a/.gitignore +++ b/.gitignore @@ -49,4 +49,4 @@ venv/ ENV/ env.bak/ venv.bak/ -.mypy_cache \ No newline at end of file +.mypy_cache diff --git a/eland/dataframe.py b/eland/dataframe.py index 69265bc9..a8c9907b 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1386,12 +1386,10 @@ def aggregate(self, func, axis=0, *args, **kwargs): # ['count', 'mad', 'max', 'mean', 'median', 'min', 'mode', 'quantile', # 'rank', 'sem', 'skew', 'sum', 'std', 'var', 'nunique'] if isinstance(func, str): - if hasattr(self, func): - # If all are implemented, this has to be changed to - # print(f"{func} is invalid operation") - return getattr(self, func)() - else: - raise AttributeError(f"{func} is not implemented/invalid") + # Wrap in list + return self._query_compiler.aggs([func]).squeeze().rename(None) + elif is_list_like(func) and len(func) == 1: + return self._query_compiler.aggs(func).squeeze().rename(None) elif is_list_like(func): # we have a list! return self._query_compiler.aggs(func) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 9a3bad66..99ca95f7 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -95,13 +95,14 @@ def test_aggs_median_var(self): pd_aggs = pd_aggs.astype("float64") assert_frame_equal(pd_aggs, ed_aggs, check_exact=False, check_less_precise=2) - # If Aggregate is given a string and series is returned. - def test_terms_aggs_series(self): + # If Aggregate is given a string or list with one agg then series is returned. + @pytest.mark.parametrize("agg", [["mean"], "mean"]) + def test_terms_aggs_series(self, agg): pd_flights = self.pd_flights() ed_flights = self.ed_flights() pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg("mean") - ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg("mean") + ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg) assert_series_equal(pd_sum_min_std, ed_sum_min_std) @@ -109,6 +110,6 @@ def test_terms_aggs_series(self): def test_terms_wrongaggs(self): ed_flights = self.ed_flights()[["FlightDelayMin"]] - match = "abc is not implemented/invalid" - with pytest.raises(AttributeError, match=match): + match = "('abc', ' not currently implemented')" + with pytest.raises(NotImplementedError, match=match): ed_flights.select_dtypes(include=[np.number]).agg("abc") From b654656e90662b1850cb920b7176a224bf3d40d4 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Tue, 25 Aug 2020 22:22:34 +0530 Subject: [PATCH 5/5] Add test for single list agg --- eland/dataframe.py | 2 -- eland/tests/dataframe/test_aggs_pytest.py | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/eland/dataframe.py b/eland/dataframe.py index a8c9907b..f7406876 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1388,8 +1388,6 @@ def aggregate(self, func, axis=0, *args, **kwargs): if isinstance(func, str): # Wrap in list return self._query_compiler.aggs([func]).squeeze().rename(None) - elif is_list_like(func) and len(func) == 1: - return self._query_compiler.aggs(func).squeeze().rename(None) elif is_list_like(func): # we have a list! return self._query_compiler.aggs(func) diff --git a/eland/tests/dataframe/test_aggs_pytest.py b/eland/tests/dataframe/test_aggs_pytest.py index 99ca95f7..330fb61a 100644 --- a/eland/tests/dataframe/test_aggs_pytest.py +++ b/eland/tests/dataframe/test_aggs_pytest.py @@ -95,17 +95,27 @@ def test_aggs_median_var(self): pd_aggs = pd_aggs.astype("float64") assert_frame_equal(pd_aggs, ed_aggs, check_exact=False, check_less_precise=2) - # If Aggregate is given a string or list with one agg then series is returned. - @pytest.mark.parametrize("agg", [["mean"], "mean"]) + # If Aggregate is given a string then series is returned. + @pytest.mark.parametrize("agg", ["mean", "min", "max"]) def test_terms_aggs_series(self, agg): pd_flights = self.pd_flights() ed_flights = self.ed_flights() - pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg("mean") + pd_sum_min_std = pd_flights.select_dtypes(include=[np.number]).agg(agg) ed_sum_min_std = ed_flights.select_dtypes(include=[np.number]).agg(agg) assert_series_equal(pd_sum_min_std, ed_sum_min_std) + def test_terms_aggs_series_with_single_list_agg(self): + # aggs list with single agg should return dataframe. + pd_flights = self.pd_flights() + ed_flights = self.ed_flights() + + pd_sum_min = pd_flights.select_dtypes(include=[np.number]).agg(["mean"]) + ed_sum_min = ed_flights.select_dtypes(include=[np.number]).agg(["mean"]) + + assert_frame_equal(pd_sum_min, ed_sum_min) + # If Wrong Aggregate value is given. def test_terms_wrongaggs(self): ed_flights = self.ed_flights()[["FlightDelayMin"]]