Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Series.describe(), median agg dtype #258

Merged
merged 1 commit into from Aug 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/reference/supported_apis.rst
Expand Up @@ -714,7 +714,7 @@ script instead of being modified manually.
+---------------------------------------+------------+
| ``ed.Series.dropna()`` | No |
+---------------------------------------+------------+
| ``ed.Series.dtype`` | No |
| ``ed.Series.dtype`` | **Yes** |
+---------------------------------------+------------+
| ``ed.Series.dtypes`` | **Yes** |
+---------------------------------------+------------+
Expand Down
2 changes: 1 addition & 1 deletion eland/operations.py
Expand Up @@ -301,7 +301,7 @@ def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=Tr
)

# These aggregations maintain the column datatype
elif pd_agg in ("max", "min"):
elif pd_agg in {"max", "min", "median"}:
agg_value = field.np_dtype.type(agg_value)

values.append(agg_value)
Expand Down
43 changes: 39 additions & 4 deletions eland/series.py
Expand Up @@ -425,7 +425,7 @@ def to_pandas(self, show_progress: bool = False) -> pd.Series:
return self._query_compiler.to_pandas(show_progress=show_progress)[self.name]

@property
def _dtype(self) -> np.dtype:
def dtype(self) -> np.dtype:
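# NOTE: this was previously kept private (as ``_dtype``) because a public ``dtype`` was believed to break the query eval implementation; it is now public to mirror pandas.Series.dtype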
return self._query_compiler.dtypes[0]

Expand Down Expand Up @@ -1192,7 +1192,7 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series":
self._query_compiler.check_arithmetics(right._query_compiler)

right_object = ArithmeticSeries(
right._query_compiler, right.name, right._dtype
right._query_compiler, right.name, right.dtype
)
display_name = None
elif np.issubdtype(np.dtype(type(right)), np.number):
Expand All @@ -1204,11 +1204,11 @@ def _numeric_op(self, right: Any, method_name: str) -> "Series":
else:
raise TypeError(
f"unsupported operation type(s) [{method_name!r}] "
f"for operands ['{type(self)}' with dtype '{self._dtype}', "
f"for operands ['{type(self)}' with dtype '{self.dtype}', "
f"'{type(right).__name__}']"
)

left_object = ArithmeticSeries(self._query_compiler, self.name, self._dtype)
left_object = ArithmeticSeries(self._query_compiler, self.name, self.dtype)
left_object.arithmetic_operation(method_name, right_object)

series = Series(
Expand Down Expand Up @@ -1430,6 +1430,41 @@ def mad(self, numeric_only=None):
results = super().mad(numeric_only=numeric_only)
return results.squeeze()

def describe(self) -> pd.Series:
    """
    Generate descriptive statistics that summarize the central tendency, dispersion and shape of
    the series' distribution, excluding NaN values.

    The summary is computed by the parent class ``describe()`` and the result is then
    squeezed before being returned.

    TODO - add additional arguments (currently only numeric values are supported)

    Returns
    -------
    pandas.Series:
        Summary information

    See Also
    --------
    :pandas_api_docs:`pandas.Series.describe`

    Examples
    --------
    >>> df = ed.DataFrame('localhost', 'flights')
    >>> df.AvgTicketPrice.describe()  # ignoring percentiles as they don't generate consistent results
    count    13059.000000
    mean       628.253689
    std        266.386661
    min        100.020531
    ...
    ...
    ...
    max       1199.729004
    Name: AvgTicketPrice, dtype: float64
    """
    # Let the parent implementation build the summary, then collapse it
    summary = super().describe()
    return summary.squeeze()

# def values TODO - not implemented as causes current implementation of query to fail

def to_numpy(self):
Expand Down
22 changes: 22 additions & 0 deletions eland/tests/dataframe/test_metrics_pytest.py
Expand Up @@ -228,3 +228,25 @@ def test_flights_datetime_metrics_median(self):
<= median
<= pd.to_datetime("2018-01-01 12:00:00.000")
)

def test_metric_agg_keep_dtypes(self):
    """Check that the min, max and median aggregations keep each column's dtype."""
    columns = ["AvgTicketPrice", "Cancelled", "dayOfWeek"]
    df = self.ed_flights_small()[columns]

    # Expected values are listed in the same order as ``columns``
    expected_min = [131.81910705566406, False, 0]
    expected_max = [989.9527587890625, True, 0]
    expected_median = [550.276123046875, False, 0]

    assert df.min().tolist() == expected_min
    assert df.max().tolist() == expected_max
    assert df.median().tolist() == expected_median

    # Running all three aggregations together must still keep the float/bool/int dtypes
    all_agg = df.agg(["min", "max", "median"])
    expected_dtypes = [np.dtype("float64"), np.dtype("bool"), np.dtype("int64")]
    assert all_agg.dtypes.tolist() == expected_dtypes

    expected_values = {
        "AvgTicketPrice": {
            "min": 131.81910705566406,
            "max": 989.9527587890625,
            "median": 550.276123046875,
        },
        "Cancelled": {"min": False, "max": True, "median": False},
        "dayOfWeek": {"min": 0, "max": 0, "median": 0},
    }
    assert all_agg.to_dict() == expected_values
40 changes: 40 additions & 0 deletions eland/tests/series/test_describe_pytest.py
@@ -0,0 +1,40 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pandas as pd
from eland.tests.common import TestData, assert_series_equal


class TestSeriesDescribe(TestData):
    """Compare ``describe()`` on an eland Series with the pandas equivalent."""

    def test_series_describe(self):
        """Compare the eland and pandas summaries of the ``AvgTicketPrice`` column."""
        ed_desc = self.ed_flights_small().AvgTicketPrice.describe()
        pd_desc = self.pd_flights_small().AvgTicketPrice.describe()

        # Both summaries must have the same structure
        assert isinstance(ed_desc, pd.Series)
        assert ed_desc.shape == pd_desc.shape
        assert ed_desc.dtype == pd_desc.dtype
        assert ed_desc.index.equals(pd_desc.index)

        # Percentiles calculations vary for Elasticsearch, so only the
        # remaining statistics are compared (with a loose tolerance)
        stable_stats = ["count", "mean", "std", "min", "max"]
        assert_series_equal(
            ed_desc[stable_stats],
            pd_desc[stable_stats],
            rtol=0.2,
        )