From 02037c1c294d4d03c093c38d4407220c3da8db2f Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Thu, 12 Nov 2020 14:33:39 +0530 Subject: [PATCH 1/6] Add mode to df, series --- docs/sphinx/reference/dataframe.rst | 1 + docs/sphinx/reference/series.rst | 1 + eland/dataframe.py | 62 +++++++++++++ eland/field_mappings.py | 3 + eland/groupby.py | 3 + eland/operations.py | 120 ++++++++++++++++++++++--- eland/query.py | 19 +++- eland/query_compiler.py | 18 +++- eland/series.py | 42 +++++++++ tests/dataframe/test_groupby_pytest.py | 6 ++ tests/dataframe/test_metrics_pytest.py | 15 ++++ tests/series/test_metrics_pytest.py | 23 +++++ 12 files changed, 299 insertions(+), 14 deletions(-) diff --git a/docs/sphinx/reference/dataframe.rst b/docs/sphinx/reference/dataframe.rst index 72869aa8..391c66ac 100644 --- a/docs/sphinx/reference/dataframe.rst +++ b/docs/sphinx/reference/dataframe.rst @@ -89,6 +89,7 @@ Computations / Descriptive Stats DataFrame.var DataFrame.sum DataFrame.nunique + DataFrame.mode Reindexing / Selection / Label Manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/sphinx/reference/series.rst b/docs/sphinx/reference/series.rst index 3e34a2f3..b4355fb4 100644 --- a/docs/sphinx/reference/series.rst +++ b/docs/sphinx/reference/series.rst @@ -79,6 +79,7 @@ Computations / Descriptive Stats Series.var Series.nunique Series.value_counts + Series.mode Reindexing / Selection / Label Manipulation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/eland/dataframe.py b/eland/dataframe.py index 7578a4f7..fe24ab26 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1628,6 +1628,68 @@ def groupby( by=by, query_compiler=self._query_compiler.copy(), dropna=dropna ) + def mode( + self, + numeric_only: bool = False, + dropna: bool = True, + es_size: int = 10, + ) -> pd.DataFrame: + """ + Calculate mode of a DataFrame + + Parameters + ---------- + numeric_only: {True, False} Default is False + Which datatype to be returned + - True: Returns all numeric or timestamp columns + - False: Returns all columns + dropna: {True, False} Default is True + - True: Don’t consider counts of NaN/NaT. + - False: Consider counts of NaN/NaT. + es_size: default 10 + number of rows to be returned if mode has multiple values + + See Also + -------- + :pandas_api_docs:`pandas.DataFrame.mode` + + Examples + -------- + >>> ed_ecommerce = ed.DataFrame('localhost', 'ecommerce') + >>> ed_df = ed_ecommerce.filter(["total_quantity", "geoip.city_name", "customer_birth_date", "day_of_week", "taxful_total_price"]) + >>> ed_df.mode(numeric_only=False) + total_quantity geoip.city_name customer_birth_date day_of_week taxful_total_price + 0 2 New York NaT Thursday 53.98 + + >>> ed_df.mode(numeric_only=True) + total_quantity taxful_total_price + 0 2 53.98 + + >>> ed_df = ed_ecommerce.filter(["products.tax_amount","order_date"]) + >>> ed_df.mode() + products.tax_amount order_date + 0 0.0 2016-12-02 20:36:58 + 1 NaN 2016-12-04 23:44:10 + 2 NaN 2016-12-08 06:21:36 + 3 NaN 2016-12-08 09:38:53 + 4 NaN 2016-12-12 11:38:24 + 5 NaN 2016-12-12 19:46:34 + 6 NaN 2016-12-14 18:00:00 + 7 NaN 2016-12-15 11:38:24 + 8 NaN 2016-12-22 19:39:22 + 9 NaN 2016-12-24 06:21:36 + + >>> ed_df.mode(es_size = 3) + products.tax_amount order_date + 0 0.0 2016-12-02 20:36:58 + 1 NaN 2016-12-04 23:44:10 + 2 NaN 2016-12-08 06:21:36 + """ + # TODO dropna=False + return self._query_compiler.mode( + numeric_only=numeric_only, dropna=True, is_dataframe=True, es_size=es_size + ) + def query(self, expr) -> "DataFrame": """ Query the columns of a DataFrame with a boolean expression. diff --git a/eland/field_mappings.py b/eland/field_mappings.py index e2c3b577..9a123248 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -109,6 +109,9 @@ def is_es_agg_compatible(self, es_agg) -> bool: # Timestamps also work for 'min', 'max' and 'avg' if es_agg in {"min", "max", "avg", "percentiles"} and self.is_timestamp: return True + # All datatypes work for mode + if es_agg == "mode": + return True return False @property diff --git a/eland/groupby.py b/eland/groupby.py index d57ad939..2183039b 100644 --- a/eland/groupby.py +++ b/eland/groupby.py @@ -617,3 +617,6 @@ def count(self) -> "pd.DataFrame": numeric_only=False, is_dataframe_agg=False, ) + + def mode(self) -> None: + raise NotImplementedError("Currently mode is not supported for groupby") diff --git a/eland/operations.py b/eland/operations.py index 2a12986c..21c0e360 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -181,7 +181,7 @@ def _metric_agg_series( dtype = "object" return build_pd_series(results, index=results.keys(), dtype=dtype) - def value_counts(self, query_compiler, es_size): + def value_counts(self, query_compiler: "QueryCompiler", es_size: int) -> pd.Series: return self._terms_aggs(query_compiler, "terms", es_size) def hist(self, query_compiler, bins): @@ -195,12 +195,54 @@ def aggs(self, query_compiler, pd_aggs, numeric_only=None) -> pd.DataFrame: results, index=pd_aggs, dtype=(np.float64 if numeric_only else None) ) + def mode( + self, + query_compiler: "QueryCompiler", + pd_aggs: List[str], + is_dataframe: bool, + es_size: int, + numeric_only: bool = False, + dropna: bool = True, + ) -> Union[pd.DataFrame, pd.Series]: + + results = self._metric_aggs( + query_compiler, + pd_aggs=pd_aggs, + numeric_only=numeric_only, + dropna=dropna, + es_mode_size=es_size, + ) + + pd_dict: Dict[str, Any] = {} + row_diff: Optional[int] = None + + if is_dataframe: + # If multiple values of mode is returned for a particular column + # find the maximum length and use that to fill dataframe with NaN/NaT + rows_len = max([len(value) for value in results.values()]) + for key, values in results.items(): + row_diff = rows_len - len(values) + # Convert np.ndarray to list + values = list(values) + if row_diff: + if isinstance(values[0], pd.Timestamp): + values.extend([pd.NaT] * row_diff) + else: + values.extend([np.NaN] * row_diff) + pd_dict[key] = values + + return pd.DataFrame(pd_dict) + else: + return pd.DataFrame(results.values()).iloc[0].rename() + def _metric_aggs( self, query_compiler: "QueryCompiler", pd_aggs: List[str], numeric_only: Optional[bool] = None, is_dataframe_agg: bool = False, + es_mode_size: Optional[int] = None, + dropna: bool = True, ) -> Dict[str, Any]: """ Used to calculate metric aggregations @@ -216,6 +258,10 @@ def _metric_aggs( return either all numeric values or NaN/NaT is_dataframe_agg: know if this method is called from single-agg or aggreagation method + es_mode_size: + number of rows to return when multiple mode values are present. + dropna: + drop NaN/NaT for a dataframe Returns ------- @@ -252,6 +298,15 @@ def _metric_aggs( es_agg[0], field.aggregatable_es_field_name, ) + elif es_agg == "mode": + # TODO for dropna=False, Check If field is timestamp or boolean or numeric, + # then use missing parameter for terms aggregation. + body.terms_aggs( + f"{es_agg}_{field.es_field_name}", + "terms", + field.aggregatable_es_field_name, + es_mode_size, + ) else: body.metric_aggs( f"{es_agg}_{field.es_field_name}", @@ -280,7 +335,9 @@ def _metric_aggs( is_dataframe_agg=is_dataframe_agg, ) - def _terms_aggs(self, query_compiler, func, es_size=None): + def _terms_aggs( + self, query_compiler: "QueryCompiler", func: str, es_size: int + ) -> pd.Series: """ Parameters ---------- @@ -499,13 +556,42 @@ def _unpack_metric_aggs( agg_value = np.sqrt( (count / (count - 1.0)) * agg_value * agg_value ) + elif es_agg == "mode": + # For terms aggregation buckets are returned + # agg_value will be of type list + agg_value = response["aggregations"][ + f"{es_agg}_{field.es_field_name}" + ]["buckets"] else: agg_value = response["aggregations"][ f"{es_agg}_{field.es_field_name}" ]["value"] + if isinstance(agg_value, list): + # include top-terms in the result. + if not agg_value: + # If the all the documents for a field are empty + agg_value = [np.NaN] + else: + max_doc_count = agg_value[0]["doc_count"] + # We need only keys which are equal to max_doc_count + # lesser values are ignored + agg_value = [ + agg_value[i]["key"] + for i in range(len(agg_value)) + if agg_value[i]["doc_count"] == max_doc_count + ] + # Maintain datatype by default because pandas does the same + # text are returned as-is + if field.is_bool or field.is_numeric: + agg_value = [ + field.np_dtype.type(value) for value in agg_value + ] + # Null usually means there were no results. - if agg_value is None or np.isnan(agg_value): + if not isinstance(agg_value, list) and ( + agg_value is None or np.isnan(agg_value) + ): if is_dataframe_agg and not numeric_only: agg_value = np.NaN elif not is_dataframe_agg and numeric_only is False: @@ -517,13 +603,22 @@ def _unpack_metric_aggs( # If this is a non-null timestamp field convert to a pd.Timestamp() elif field.is_timestamp: - agg_value = elasticsearch_date_to_pandas_date( - agg_value, field.es_date_format - ) + if isinstance(agg_value, list): + # convert to timestamp results for mode + agg_value = [ + elasticsearch_date_to_pandas_date( + value, field.es_date_format + ) + for value in agg_value + ] + else: + agg_value = elasticsearch_date_to_pandas_date( + agg_value, field.es_date_format + ) # If numeric_only is False | None then maintain column datatype elif not numeric_only: # we're only converting to bool for lossless aggs like min, max, and median. - if pd_agg in {"max", "min", "median", "sum"}: + if pd_agg in {"max", "min", "median", "sum", "mode"}: # 'sum' isn't representable with bool, use int64 if pd_agg == "sum" and field.is_bool: agg_value = np.int64(agg_value) @@ -791,10 +886,15 @@ def _map_pd_aggs_to_es_aggs(pd_aggs): elif pd_agg == "median": es_aggs.append(("percentiles", "50.0")) - # Not implemented elif pd_agg == "mode": - # We could do this via top term - raise NotImplementedError(pd_agg, " not currently implemented") + if len(pd_aggs) != 1: + raise NotImplementedError( + "Currently mode is not supported in df.agg[...]. Try df.mode()" + ) + else: + es_aggs.append("mode") + + # Not implemented elif pd_agg == "quantile": # TODO raise NotImplementedError(pd_agg, " not currently implemented") diff --git a/eland/query.py b/eland/query.py index 5f7fe2ea..68b89754 100644 --- a/eland/query.py +++ b/eland/query.py @@ -101,7 +101,14 @@ def regexp(self, field: str, value: str) -> None: else: self._query = self._query & Rlike(field, value) - def terms_aggs(self, name: str, func: str, field: str, es_size: int) -> None: + def terms_aggs( + self, + name: str, + func: str, + field: str, + es_size: Optional[int] = None, + missing: Optional[Any] = None, + ) -> None: """ Add terms agg e.g @@ -109,12 +116,18 @@ def terms_aggs(self, name: str, func: str, field: str, es_size: int) -> None: "name": { "terms": { "field": "Airline", - "size": 10 + "size": 10, + "missing": "null" } } } """ - agg = {func: {"field": field, "size": es_size}} + agg = {func: {"field": field}} + if es_size: + agg[func]["size"] = str(es_size) + + if missing: + agg[func]["missing"] = missing self._aggs[name] = agg def metric_aggs(self, name: str, func: str, field: str) -> None: diff --git a/eland/query_compiler.py b/eland/query_compiler.py index f3a4b077..5b41f04e 100644 --- a/eland/query_compiler.py +++ b/eland/query_compiler.py @@ -621,6 +621,22 @@ def nunique(self): self, ["nunique"], numeric_only=False ) + def mode( + self, + es_size: int, + numeric_only: bool = False, + dropna: bool = True, + is_dataframe: bool = True, + ) -> Union[pd.DataFrame, pd.Series]: + return self._operations.mode( + self, + pd_aggs=["mode"], + numeric_only=numeric_only, + dropna=dropna, + is_dataframe=is_dataframe, + es_size=es_size, + ) + def aggs_groupby( self, by: List[str], @@ -638,7 +654,7 @@ def aggs_groupby( numeric_only=numeric_only, ) - def value_counts(self, es_size): + def value_counts(self, es_size: int) -> pd.Series: return self._operations.value_counts(self, es_size) def es_info(self, buf): diff --git a/eland/series.py b/eland/series.py index 020fb67a..c023b647 100644 --- a/eland/series.py +++ b/eland/series.py @@ -637,6 +637,48 @@ def filter( ) return Series(_query_compiler=new_query_compiler) + def mode(self, es_size: int = 10) -> pd.Series: + """ + Calculate mode of a series + + Parameters + ---------- + es_size: default 10 + number of rows to be returned if mode has multiple values + + See Also + -------- + :pandas_api_docs:`pandas.Series.mode` + + Examples + -------- + >>> ed_ecommerce = ed.DataFrame('localhost', 'ecommerce') + >>> ed_ecommerce["day_of_week"].mode() + 0 Thursday + dtype: object + + >>> ed_ecommerce["order_date"].mode() + 0 2016-12-02 20:36:58 + 1 2016-12-04 23:44:10 + 2 2016-12-08 06:21:36 + 3 2016-12-08 09:38:53 + 4 2016-12-12 11:38:24 + 5 2016-12-12 19:46:34 + 6 2016-12-14 18:00:00 + 7 2016-12-15 11:38:24 + 8 2016-12-22 19:39:22 + 9 2016-12-24 06:21:36 + dtype: datetime64[ns] + + >>> ed_ecommerce["order_date"].mode(es_size=3) + 0 2016-12-02 20:36:58 + 1 2016-12-04 23:44:10 + 2 2016-12-08 06:21:36 + dtype: datetime64[ns] + + """ + return self._query_compiler.mode(is_dataframe=False, es_size=es_size) + def es_match( self, text: str, diff --git a/tests/dataframe/test_groupby_pytest.py b/tests/dataframe/test_groupby_pytest.py index 9dda3c32..1ba1b81a 100644 --- a/tests/dataframe/test_groupby_pytest.py +++ b/tests/dataframe/test_groupby_pytest.py @@ -194,3 +194,9 @@ def test_groupby_dataframe_mad(self): assert_index_equal(pd_min_mad.columns, ed_min_mad.columns) assert_index_equal(pd_min_mad.index, ed_min_mad.index) assert_series_equal(pd_min_mad.dtypes, ed_min_mad.dtypes) + + def test_groupby_mode(self): + ed_flights = self.ed_flights() + match = "Currently mode is not supported for groupby" + with pytest.raises(NotImplementedError, match=match): + ed_flights.groupby("Cancelled").mode() diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index 71050fbb..c067baca 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -426,3 +426,18 @@ def test_aggs_count(self): ed_count = ed_flights.agg(["count"]) assert_frame_equal(pd_count, ed_count) + + @pytest.mark.parametrize("numeric_only", [True, False]) + @pytest.mark.parametrize("es_size", [2, 10, 20]) + def test_aggs_mode(self, es_size, numeric_only): + pd_flights = self.pd_flights().filter( + ["Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ) + ed_flights = self.ed_flights().filter( + ["Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ) + + pd_mode = pd_flights.mode(numeric_only=numeric_only)[:es_size] + ed_mode = ed_flights.mode(numeric_only=numeric_only, es_size=es_size) + + assert_frame_equal(pd_mode, ed_mode) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index 35244bab..ad6aacb4 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -22,6 +22,7 @@ import numpy as np import pandas as pd import pytest +from pandas.testing import assert_series_equal from tests.common import TestData, assert_almost_equal @@ -114,3 +115,25 @@ def test_flights_datetime_median_metric(self): <= median <= pd.to_datetime("2018-01-01 12:00:00.000") ) + + @pytest.mark.parametrize( + "column", ["day_of_week", "geoip.region_name", "taxful_total_price", "user"] + ) + def test_ecommerce_mode(self, column): + ed_series = self.ed_ecommerce() + pd_series = self.pd_ecommerce() + + ed_mode = ed_series[column].mode() + pd_mode = pd_series[column].mode() + + assert_series_equal(ed_mode, pd_mode) + + @pytest.mark.parametrize("es_size", [2, 10, 20]) + def test_ecommerce_mode_es_size(self, es_size): + ed_series = self.ed_ecommerce() + pd_series = self.pd_ecommerce() + + pd_mode = pd_series["order_date"].mode()[:es_size] + ed_mode = ed_series["order_date"].mode(es_size) + + assert_series_equal(pd_mode, ed_mode) From 27f7fd3c0326d585d6937b576aa9f9dd986b0d9a Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Tue, 17 Nov 2020 16:07:20 +0530 Subject: [PATCH 2/6] Changes Requested --- eland/field_mappings.py | 11 ++++++----- eland/operations.py | 9 +++++---- tests/dataframe/test_metrics_pytest.py | 7 ++++--- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 9a123248..40b71a1a 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -102,16 +102,17 @@ def is_es_agg_compatible(self, es_agg) -> bool: # Except "median_absolute_deviation" which doesn't support bool if es_agg == "median_absolute_deviation" and self.is_bool: return False - # Cardinality and Count work for all types + # Cardinality, Count and mode work for all types # Numerics and bools work for all aggs - if es_agg in ("cardinality", "value_count") or self.is_numeric or self.is_bool: + if ( + es_agg in ("cardinality", "value_count", "mode") + or self.is_numeric + or self.is_bool + ): return True # Timestamps also work for 'min', 'max' and 'avg' if es_agg in {"min", "max", "avg", "percentiles"} and self.is_timestamp: return True - # All datatypes work for mode - if es_agg == "mode": - return True return False @property diff --git a/eland/operations.py b/eland/operations.py index 21c0e360..a3be8841 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -571,16 +571,17 @@ def _unpack_metric_aggs( # include top-terms in the result. if not agg_value: # If the all the documents for a field are empty - agg_value = [np.NaN] + agg_value = [field.nan_value] else: max_doc_count = agg_value[0]["doc_count"] # We need only keys which are equal to max_doc_count # lesser values are ignored agg_value = [ - agg_value[i]["key"] - for i in range(len(agg_value)) - if agg_value[i]["doc_count"] == max_doc_count + item["key"] + for item in agg_value + if item["doc_count"] == max_doc_count ] + # Maintain datatype by default because pandas does the same # text are returned as-is if field.is_bool or field.is_numeric: diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index c067baca..ce688014 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -428,13 +428,14 @@ def test_aggs_count(self): assert_frame_equal(pd_count, ed_count) @pytest.mark.parametrize("numeric_only", [True, False]) - @pytest.mark.parametrize("es_size", [2, 10, 20]) + @pytest.mark.parametrize("es_size", [2, 20, 100, 5000, 3000]) def test_aggs_mode(self, es_size, numeric_only): + # FlightNum has unique values, so we can test `fill` NaN/NaT for remaining columns pd_flights = self.pd_flights().filter( - ["Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ["Cancelled", "dayOfWeek", "timestamp", "DestCountry", "FlightNum"] ) ed_flights = self.ed_flights().filter( - ["Cancelled", "dayOfWeek", "timestamp", "DestCountry"] + ["Cancelled", "dayOfWeek", "timestamp", "DestCountry", "FlightNum"] ) pd_mode = pd_flights.mode(numeric_only=numeric_only)[:es_size] From 3af0b39744e59aaf3b391cd2da3083c12c9afc4c Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Thu, 31 Dec 2020 15:15:33 +0530 Subject: [PATCH 3/6] Add eland in manifest.in --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 345d551a..fe043a09 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ include LICENSE.txt include README.md include eland/py.typed +recursive-include eland From c0e723b1ebe3194863c6c848a5dcee2fcc74244b Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Thu, 31 Dec 2020 15:26:55 +0530 Subject: [PATCH 4/6] make tuple into set --- eland/field_mappings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eland/field_mappings.py b/eland/field_mappings.py index 40b71a1a..45724a43 100644 --- a/eland/field_mappings.py +++ b/eland/field_mappings.py @@ -105,7 +105,7 @@ def is_es_agg_compatible(self, es_agg) -> bool: # Cardinality, Count and mode work for all types # Numerics and bools work for all aggs if ( - es_agg in ("cardinality", "value_count", "mode") + es_agg in {"cardinality", "value_count", "mode"} or self.is_numeric or self.is_bool ): From 015e532ef4fe2e007b7d8d8fabe69477a9198530 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Tue, 5 Jan 2021 23:48:35 +0530 Subject: [PATCH 5/6] Requested changes --- MANIFEST.in | 1 - eland/dataframe.py | 2 +- eland/operations.py | 2 +- tests/dataframe/test_metrics_pytest.py | 8 ++++++-- tests/series/test_metrics_pytest.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index fe043a09..345d551a 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,3 @@ include LICENSE.txt include README.md include eland/py.typed -recursive-include eland diff --git a/eland/dataframe.py b/eland/dataframe.py index fe24ab26..9b8dc4c6 100644 --- a/eland/dataframe.py +++ b/eland/dataframe.py @@ -1635,7 +1635,7 @@ def mode( es_size: int = 10, ) -> pd.DataFrame: """ - Calculate mode of a DataFrame + Calculate mode of a DataFrame Parameters ---------- diff --git a/eland/operations.py b/eland/operations.py index a3be8841..c2615512 100644 --- a/eland/operations.py +++ b/eland/operations.py @@ -890,7 +890,7 @@ def _map_pd_aggs_to_es_aggs(pd_aggs): elif pd_agg == "mode": if len(pd_aggs) != 1: raise NotImplementedError( - "Currently mode is not supported in df.agg[...]. Try df.mode()" + "Currently mode is not supported in df.agg(...). Try df.mode()" ) else: es_aggs.append("mode") diff --git a/tests/dataframe/test_metrics_pytest.py b/tests/dataframe/test_metrics_pytest.py index ce688014..7a907dd6 100644 --- a/tests/dataframe/test_metrics_pytest.py +++ b/tests/dataframe/test_metrics_pytest.py @@ -428,7 +428,7 @@ def test_aggs_count(self): assert_frame_equal(pd_count, ed_count) @pytest.mark.parametrize("numeric_only", [True, False]) - @pytest.mark.parametrize("es_size", [2, 20, 100, 5000, 3000]) + @pytest.mark.parametrize("es_size", [1, 2, 20, 100, 5000, 3000]) def test_aggs_mode(self, es_size, numeric_only): # FlightNum has unique values, so we can test `fill` NaN/NaT for remaining columns pd_flights = self.pd_flights().filter( @@ -441,4 +441,8 @@ def test_aggs_mode(self, es_size, numeric_only): pd_mode = pd_flights.mode(numeric_only=numeric_only)[:es_size] ed_mode = ed_flights.mode(numeric_only=numeric_only, es_size=es_size) - assert_frame_equal(pd_mode, ed_mode) + # Skipping dtype check because eland is giving Cancelled dtype as bool + # but pandas is referring it as object + assert_frame_equal( + pd_mode, ed_mode, check_dtype=(False if es_size == 1 else True) + ) diff --git a/tests/series/test_metrics_pytest.py b/tests/series/test_metrics_pytest.py index ad6aacb4..c471d97e 100644 --- a/tests/series/test_metrics_pytest.py +++ b/tests/series/test_metrics_pytest.py @@ -128,7 +128,7 @@ def test_ecommerce_mode(self, column): assert_series_equal(ed_mode, pd_mode) - @pytest.mark.parametrize("es_size", [2, 10, 20]) + @pytest.mark.parametrize("es_size", [1, 2, 10, 20]) def test_ecommerce_mode_es_size(self, es_size): ed_series = self.ed_ecommerce() pd_series = self.pd_ecommerce() From 0c0a1a08be03707a7f0dee06f78e4b3482810565 Mon Sep 17 00:00:00 2001 From: "P. Sai Vinay" Date: Wed, 6 Jan 2021 10:53:50 +0530 Subject: [PATCH 6/6] Add missing files --- docs/sphinx/reference/api/eland.DataFrame.mode.rst | 6 ++++++ docs/sphinx/reference/api/eland.Series.mode.rst | 6 ++++++ 2 files changed, 12 insertions(+) create mode 100644 docs/sphinx/reference/api/eland.DataFrame.mode.rst create mode 100644 docs/sphinx/reference/api/eland.Series.mode.rst diff --git a/docs/sphinx/reference/api/eland.DataFrame.mode.rst b/docs/sphinx/reference/api/eland.DataFrame.mode.rst new file mode 100644 index 00000000..56c55aaa --- /dev/null +++ b/docs/sphinx/reference/api/eland.DataFrame.mode.rst @@ -0,0 +1,6 @@ +eland.DataFrame.mode +==================== + +.. currentmodule:: eland + +.. automethod:: DataFrame.mode \ No newline at end of file diff --git a/docs/sphinx/reference/api/eland.Series.mode.rst b/docs/sphinx/reference/api/eland.Series.mode.rst new file mode 100644 index 00000000..866f5535 --- /dev/null +++ b/docs/sphinx/reference/api/eland.Series.mode.rst @@ -0,0 +1,6 @@ +eland.Series.mode +==================== + +.. currentmodule:: eland + +.. automethod:: Series.mode \ No newline at end of file