From 7da1e1eef0bb3660981a72c7d2c27824e3139b9a Mon Sep 17 00:00:00 2001 From: Seth Michael Larson Date: Mon, 17 Aug 2020 08:49:21 -0500 Subject: [PATCH] Fix Series.describe(), median agg dtype --- docs/source/reference/supported_apis.rst | 2 +- eland/operations.py | 2 +- eland/series.py | 43 ++++++++++++++++++-- eland/tests/dataframe/test_metrics_pytest.py | 22 ++++++++++ eland/tests/series/test_describe_pytest.py | 40 ++++++++++++++++++ 5 files changed, 103 insertions(+), 6 deletions(-) create mode 100644 eland/tests/series/test_describe_pytest.py diff --git a/docs/source/reference/supported_apis.rst b/docs/source/reference/supported_apis.rst index b526d8ea..25c80a86 100644 --- a/docs/source/reference/supported_apis.rst +++ b/docs/source/reference/supported_apis.rst @@ -714,7 +714,7 @@ script instead of being modified manually. +---------------------------------------+------------+ | ``ed.Series.dropna()`` | No | +---------------------------------------+------------+ -| ``ed.Series.dtype`` | No | +| ``ed.Series.dtype`` | **Yes** | +---------------------------------------+------------+ | ``ed.Series.dtypes`` | **Yes** | +---------------------------------------+------------+ diff --git a/eland/operations.py b/eland/operations.py index 170e7113..9728be74 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -301,7 +301,7 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr ) # These aggregations maintain the column datatype - elif pd_agg in ("max", "min"): + elif pd_agg in {"max", "min", "median"}: agg_value = field.np_dtype.type(agg_value) values.append(agg_value) diff --git a/eland/series.py b/eland/series.py index 9623a54b..c1551dc8 100644 --- a/eland/series.py +++ b/eland/series.py @@ -425,7 +425,7 @@ def to_pandas(self, show_progress: bool = False) -> pd.Series: return self._query_compiler.to_pandas(show_progress=show_progress)[self.name] @property - def _dtype(self) -> np.dtype: + def dtype(self) -> np.dtype: # DO NOT MAKE 
PUBLIC (i.e. def dtype) as this breaks query eval implementation return self._query_compiler.dtypes[0] @@ -1192,7 +1192,7 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series": self._query_compiler.check_arithmetics(right._query_compiler) right_object = ArithmeticSeries( - right._query_compiler, right.name, right._dtype + right._query_compiler, right.name, right.dtype ) display_name = None elif np.issubdtype(np.dtype(type(right)), np.number): @@ -1204,11 +1204,11 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series": else: raise TypeError( f"unsupported operation type(s) [{method_name!r}] " - f"for operands ['{type(self)}' with dtype '{self._dtype}', " + f"for operands ['{type(self)}' with dtype '{self.dtype}', " f"'{type(right).__name__}']" ) - left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype) + left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype) left_object.arithmetic_operation(method_name, right_object) series = Series( @@ -1430,6 +1430,41 @@ def mad(self, numeric_only=None): results = super().mad(numeric_only=numeric_only) return results.squeeze() + def describe(self) -> pd.Series: + """ + Generate descriptive statistics that summarize the central tendency, dispersion and shape of a + dataset’s distribution, excluding NaN values. + + Analyzes both numeric and object series, as well as DataFrame column sets of mixed data types. + The output will vary depending on what is provided. Refer to the notes below for more detail. + + TODO - add additional arguments (currently only numeric values are supported) + + Returns + ------- + pandas.Series: + Summary information + + See Also + -------- + :pandas_api_docs:`pandas.Series.describe` + + Examples + -------- + >>> df = ed.DataFrame('localhost', 'flights') + >>> df.AvgTicketPrice.describe() # ignoring percentiles as they don't generate consistent results + count 13059.000000 + mean 628.253689 + std 266.386661 + min 100.020531 + ... + ... + ... 
+ max 1199.729004 + Name: AvgTicketPrice, dtype: float64 + """ + return super().describe().squeeze() + # def values TODO - not implemented as causes current implementation of query to fail def to_numpy(self): diff --git a/eland/tests/dataframe/test_metrics_pytest.py b/eland/tests/dataframe/test_metrics_pytest.py index 4a1c826b..494e42d6 100644 --- a/eland/tests/dataframe/test_metrics_pytest.py +++ b/eland/tests/dataframe/test_metrics_pytest.py @@ -228,3 +228,25 @@ def test_flights_datetime_metrics_median(self): <= median <= pd.to_datetime("2018-01-01 12:00:00.000") ) + + def test_metric_agg_keep_dtypes(self): + # max, min, and median maintain their dtypes + df = self.ed_flights_small()[["AvgTicketPrice", "Cancelled", "dayOfWeek"]] + assert df.min().tolist() == [131.81910705566406, False, 0] + assert df.max().tolist() == [989.9527587890625, True, 0] + assert df.median().tolist() == [550.276123046875, False, 0] + all_agg = df.agg(["min", "max", "median"]) + assert all_agg.dtypes.tolist() == [ + np.dtype("float64"), + np.dtype("bool"), + np.dtype("int64"), + ] + assert all_agg.to_dict() == { + "AvgTicketPrice": { + "max": 989.9527587890625, + "median": 550.276123046875, + "min": 131.81910705566406, + }, + "Cancelled": {"max": True, "median": False, "min": False}, + "dayOfWeek": {"max": 0, "median": 0, "min": 0}, + } diff --git a/eland/tests/series/test_describe_pytest.py b/eland/tests/series/test_describe_pytest.py new file mode 100644 index 00000000..2f6f6ec7 --- /dev/null +++ b/eland/tests/series/test_describe_pytest.py @@ -0,0 +1,40 @@ +# Licensed to Elasticsearch B.V. under one or more contributor +# license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright +# ownership. Elasticsearch B.V. licenses this file to you under +# the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pandas as pd +from eland.tests.common import TestData, assert_series_equal + + +class TestSeriesDescribe(TestData): + def test_series_describe(self): + ed_df = self.ed_flights_small() + pd_df = self.pd_flights_small() + + ed_desc = ed_df.AvgTicketPrice.describe() + pd_desc = pd_df.AvgTicketPrice.describe() + + assert isinstance(ed_desc, pd.Series) + assert ed_desc.shape == pd_desc.shape + assert ed_desc.dtype == pd_desc.dtype + assert ed_desc.index.equals(pd_desc.index) + + # Percentile calculations vary for Elasticsearch + assert_series_equal( + ed_desc[["count", "mean", "std", "min", "max"]], + pd_desc[["count", "mean", "std", "min", "max"]], + rtol=0.2, + )