diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 61aaab1120..e968172c76 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -1996,6 +1996,31 @@ def _generate_resample_label( Literal["epoch", "start", "start_day", "end", "end_day"], ] = "start_day", ) -> Block: + if not isinstance(rule, str): + raise NotImplementedError( + f"Only offset strings are currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}" + ) + + if rule in ("ME", "YE", "QE", "BME", "BA", "BQE", "W"): + raise NotImplementedError( + f"Offset strings 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', 'W' are not currently supported for rule, but got {repr(rule)}. {constants.FEEDBACK_LINK}" + ) + + if closed == "right": + raise NotImplementedError( + f"Only closed='left' is currently supported. {constants.FEEDBACK_LINK}", + ) + + if label == "right": + raise NotImplementedError( + f"Only label='left' is currently supported. {constants.FEEDBACK_LINK}", + ) + + if origin not in ("epoch", "start", "start_day"): + raise NotImplementedError( + f"Only origin='epoch', 'start', 'start_day' are currently supported, but got {repr(origin)}. 
{constants.FEEDBACK_LINK}" + ) + # Validate and resolve the index or column to use for grouping if on is None: if len(self.index_columns) == 0: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index df8c87416f..7471cf587b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4182,10 +4182,12 @@ def _split( return [DataFrame(block) for block in blocks] @validations.requires_ordering() - def _resample( + def resample( self, rule: str, *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, on: blocks.Label = None, level: Optional[LevelsType] = None, origin: Union[ @@ -4195,64 +4197,10 @@ def _resample( Literal["epoch", "start", "start_day", "end", "end_day"], ] = "start_day", ) -> bigframes.core.groupby.DataFrameGroupBy: - """Internal function to support resample. Resample time-series data. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> data = { - ... "timestamp_col": pd.date_range( - ... start="2021-01-01 13:00:00", periods=30, freq="1s" - ... ), - ... "int64_col": range(30), - ... "int64_too": range(10, 40), - ... } - - Resample on a DataFrame with index: - - >>> df = bpd.DataFrame(data).set_index("timestamp_col") - >>> df._resample(rule="7s").min() - int64_col int64_too - 2021-01-01 12:59:55 0 10 - 2021-01-01 13:00:02 2 12 - 2021-01-01 13:00:09 9 19 - 2021-01-01 13:00:16 16 26 - 2021-01-01 13:00:23 23 33 - - [5 rows x 2 columns] - - Resample with column and origin set to 'start': - - >>> df = bpd.DataFrame(data) - >>> df._resample(rule="7s", on = "timestamp_col", origin="start").min() - int64_col int64_too - 2021-01-01 13:00:00 0 10 - 2021-01-01 13:00:07 7 17 - 2021-01-01 13:00:14 14 24 - 2021-01-01 13:00:21 21 31 - 2021-01-01 13:00:28 28 38 - - [5 rows x 2 columns] - - Args: - rule (str): - The offset string representing target conversion. - on (str, default None): - For a DataFrame, column to use instead of index for resampling. Column - must be datetime-like. 
- level (str or int, default None): - For a MultiIndex, level (name or number) to use for resampling. - level must be datetime-like. - origin(str, default 'start_day'): - The timestamp on which to adjust the grouping. Must be one of the following: - 'epoch': origin is 1970-01-01 - 'start': origin is the first value of the timeseries - 'start_day': origin is the first day at midnight of the timeseries - Returns: - DataFrameGroupBy: DataFrameGroupBy object. - """ block = self._block._generate_resample_label( rule=rule, + closed=closed, + label=label, on=on, level=level, origin=origin, diff --git a/bigframes/series.py b/bigframes/series.py index ef0da32dfc..c11cc48394 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -2505,7 +2505,7 @@ def explode(self, *, ignore_index: Optional[bool] = False) -> Series: ) @validations.requires_ordering() - def _resample( + def resample( self, rule: str, *, @@ -2519,43 +2519,6 @@ def _resample( Literal["epoch", "start", "start_day", "end", "end_day"], ] = "start_day", ) -> bigframes.core.groupby.SeriesGroupBy: - """Internal function to support resample. Resample time-series data. - - **Examples:** - - >>> import bigframes.pandas as bpd - >>> data = { - ... "timestamp_col": pd.date_range( - ... start="2021-01-01 13:00:00", periods=30, freq="1s" - ... ), - ... "int64_col": range(30), - ... } - >>> s = bpd.DataFrame(data).set_index("timestamp_col") - >>> s._resample(rule="7s", origin="epoch").min() - int64_col - 2021-01-01 12:59:56 0 - 2021-01-01 13:00:03 3 - 2021-01-01 13:00:10 10 - 2021-01-01 13:00:17 17 - 2021-01-01 13:00:24 24 - - [5 rows x 1 columns] - - - Args: - rule (str): - The offset string representing target conversion. - level (str or int, default None): - For a MultiIndex, level (name or number) to use for resampling. - level must be datetime-like. - origin(str, default 'start_day'): - The timestamp on which to adjust the grouping. 
Must be one of the following: - 'epoch': origin is 1970-01-01 - 'start': origin is the first value of the timeseries - 'start_day': origin is the first day at midnight of the timeseries - Returns: - SeriesGroupBy: SeriesGroupBy object. - """ block = self._block._generate_resample_label( rule=rule, closed=closed, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 79f8efd00f..475f98407b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -5915,21 +5915,15 @@ def test_dataframe_explode_xfail(col_names): pytest.param("datetime_col", "5M", "epoch"), pytest.param("datetime_col", "3Q", "start_day"), pytest.param("datetime_col", "3YE", "start"), - pytest.param( - "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) - ), - pytest.param( - "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) - ), ], ) -def test__resample_with_column( +def test_resample_with_column( scalars_df_index, scalars_pandas_df_index, on, rule, origin ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") bf_result = ( - scalars_df_index._resample(rule=rule, on=on, origin=origin)[ + scalars_df_index.resample(rule=rule, on=on, origin=origin)[ ["int64_col", "int64_too"] ] .max() @@ -5943,30 +5937,54 @@ def test__resample_with_column( ) +@pytest.mark.parametrize("index_col", ["timestamp_col", "datetime_col"]) +@pytest.mark.parametrize( + ("index_append", "level"), + [(True, 1), (False, None), (False, 0)], +) @pytest.mark.parametrize( - ("append", "level", "col", "rule"), + "rule", [ - pytest.param(False, None, "timestamp_col", "100d"), - pytest.param(True, 1, "timestamp_col", "1200h"), - pytest.param(False, None, "datetime_col", "100d"), + # TODO(tswast): support timedeltas and dataoffsets. + # TODO(tswast): support bins that default to "right". 
+ "100d", + "1200h", ], ) -def test__resample_with_index( - scalars_df_index, scalars_pandas_df_index, append, level, col, rule +# TODO(tswast): support "right" +@pytest.mark.parametrize("closed", ["left", None]) +# TODO(tswast): support "right" +@pytest.mark.parametrize("label", ["left", None]) +@pytest.mark.parametrize( + "origin", + ["epoch", "start", "start_day"], # TODO(tswast): support end, end_day. +) +def test_resample_with_index( + scalars_df_index, + scalars_pandas_df_index, + index_append, + level, + index_col, + rule, + closed, + origin, + label, ): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.set_index(col, append=append) - scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) + scalars_df_index = scalars_df_index.set_index(index_col, append=index_append) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + index_col, append=index_append + ) bf_result = ( scalars_df_index[["int64_col", "int64_too"]] - ._resample(rule=rule, level=level) + .resample(rule=rule, level=level, closed=closed, origin=origin, label=label) .min() .to_pandas() ) pd_result = ( scalars_pandas_df_index[["int64_col", "int64_too"]] - .resample(rule=rule, level=level) + .resample(rule=rule, level=level, closed=closed, origin=origin, label=label) .min() ) assert_pandas_df_equal(bf_result, pd_result) @@ -6010,7 +6028,7 @@ def test__resample_with_index( ), ], ) -def test__resample_start_time(rule, origin, data): +def test_resample_start_time(rule, origin, data): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" @@ -6018,7 +6036,7 @@ def test__resample_start_time(rule, origin, data): scalars_pandas_df_index = pd.DataFrame(data).set_index(col) scalars_pandas_df_index.index.name = None - bf_result = scalars_df_index._resample(rule=rule, 
origin=origin).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, origin=origin).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 5ace3f54d8..4df257423f 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4856,14 +4856,14 @@ def test_series_explode_null(data): pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), ], ) -def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): +def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ "int64_col" ] - bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/tests/system/small/test_unordered.py b/tests/system/small/test_unordered.py index 9cfa54146a..07fdb215df 100644 --- a/tests/system/small/test_unordered.py +++ b/tests/system/small/test_unordered.py @@ -248,7 +248,7 @@ def test_unordered_mode_no_ambiguity_warning(unordered_session): ), ], ) -def test__resample_with_index(unordered_session, rule, origin, data): +def test_resample_with_index(unordered_session, rule, origin, data): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") col = "timestamp_col" @@ -256,12 +256,16 @@ def test__resample_with_index(unordered_session, rule, origin, data): scalars_pandas_df_index = 
pd.DataFrame(data).set_index(col) scalars_pandas_df_index.index.name = None - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() - + bf_result = scalars_df_index.resample(rule=rule, origin=origin).min() pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() + assert isinstance(bf_result.index, bpd.DatetimeIndex) + assert isinstance(pd_result.index, pd.DatetimeIndex) pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False + bf_result.to_pandas(), + pd_result, + check_index_type=False, + check_dtype=False, ) diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index 2326f2595b..015dbd030e 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -42,6 +42,68 @@ def test_dataframe_repr_with_uninitialized_object(): assert "DataFrame" in got +@pytest.mark.parametrize( + "rule", + [ + pd.DateOffset(weeks=1), + pd.Timedelta(hours=8), + # According to + # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html + # these all default to "right" for closed and label, which isn't yet supported. 
+ "ME", + "YE", + "QE", + "BME", + "BA", + "BQE", + "W", + ], +) +def test_dataframe_rule_not_implememented( + monkeypatch: pytest.MonkeyPatch, + rule, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="rule"): + dataframe.resample(rule=rule) + + +def test_dataframe_closed_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="Only closed='left'"): + dataframe.resample(rule="1d", closed="right") + + +def test_dataframe_label_not_implememented( + monkeypatch: pytest.MonkeyPatch, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="Only label='left'"): + dataframe.resample(rule="1d", label="right") + + +@pytest.mark.parametrize( + "origin", + [ + "end", + "end_day", + ], +) +def test_dataframe_origin_not_implememented( + monkeypatch: pytest.MonkeyPatch, + origin, +): + dataframe = mocks.create_dataframe(monkeypatch) + + with pytest.raises(NotImplementedError, match="origin"): + dataframe.resample(rule="1d", origin=origin) + + def test_dataframe_setattr_with_uninitialized_object(): """Ensures DataFrame can be subclassed without trying to set attributes as columns.""" # Avoid calling __init__ since it might be called later in a subclass. 
diff --git a/tests/unit/test_series_polars.py b/tests/unit/test_series_polars.py index 55bc048bcd..6f729b0df0 100644 --- a/tests/unit/test_series_polars.py +++ b/tests/unit/test_series_polars.py @@ -5006,14 +5006,14 @@ def test_series_explode_null(data): pytest.param(True, "timestamp_col", "timestamp_col", "1YE"), ], ) -def test__resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): +def test_resample(scalars_df_index, scalars_pandas_df_index, append, level, col, rule): # TODO: supply a reason why this isn't compatible with pandas 1.x pytest.importorskip("pandas", minversion="2.0.0") scalars_df_index = scalars_df_index.set_index(col, append=append)["int64_col"] scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append)[ "int64_col" ] - bf_result = scalars_df_index._resample(rule=rule, level=level).min().to_pandas() + bf_result = scalars_df_index.resample(rule=rule, level=level).min().to_pandas() pd_result = scalars_pandas_df_index.resample(rule=rule, level=level).min() pd.testing.assert_series_equal(bf_result, pd_result) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index b434b51fb3..1e90e2e210 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -11,6 +11,7 @@ """ from __future__ import annotations +import datetime from typing import Hashable, Iterable, Literal, Optional, Sequence, Union from bigframes_vendored import constants @@ -4734,6 +4735,86 @@ def merge( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def resample( + self, + rule: str, + *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, + on=None, + level=None, + origin: Union[ + Union[pd.Timestamp, datetime.datetime, np.datetime64, int, float, str], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ): + 
"""Resample time-series data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... "int64_too": range(10, 40), + ... } + + Resample on a DataFrame with index: + + >>> df = bpd.DataFrame(data).set_index("timestamp_col") + >>> df.resample(rule="7s").min() + int64_col int64_too + 2021-01-01 12:59:55 0 10 + 2021-01-01 13:00:02 2 12 + 2021-01-01 13:00:09 9 19 + 2021-01-01 13:00:16 16 26 + 2021-01-01 13:00:23 23 33 + + [5 rows x 2 columns] + + Resample with column and origin set to 'start': + + >>> df = bpd.DataFrame(data) + >>> df.resample(rule="7s", on = "timestamp_col", origin="start").min() + int64_col int64_too + 2021-01-01 13:00:00 0 10 + 2021-01-01 13:00:07 7 17 + 2021-01-01 13:00:14 14 24 + 2021-01-01 13:00:21 21 31 + 2021-01-01 13:00:28 28 38 + + [5 rows x 2 columns] + + Args: + rule (str): + The offset string representing target conversion. + Offsets 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', and 'W' are *not* + supported. + closed (Literal['left'] | None): + Which side of bin interval is closed. The default is 'left' for + all supported frequency offsets. + label (Literal['left'] | None): + Which bin edge label to label bucket with. The default is 'left' + for all supported frequency offsets. + on (str, default None): + For a DataFrame, column to use instead of index for resampling. Column + must be datetime-like. + level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin (str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Origin values 'end' and 'end_day' are *not* supported. 
+ Returns: + DataFrameGroupBy: DataFrameGroupBy object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def round(self, decimals): """ Round a DataFrame to a variable number of decimal places. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8de1c10f93..2c0f493d81 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3,6 +3,7 @@ """ from __future__ import annotations +import datetime from typing import ( Hashable, IO, @@ -19,6 +20,7 @@ from bigframes_vendored.pandas.core.generic import NDFrame import numpy import numpy as np +import pandas as pd from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer from pandas.api import extensions as pd_ext @@ -2502,6 +2504,68 @@ def replace( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def resample( + self, + rule: str, + *, + closed: Optional[Literal["right", "left"]] = None, + label: Optional[Literal["right", "left"]] = None, + level=None, + origin: Union[ + Union[pd.Timestamp, datetime.datetime, numpy.datetime64, int, float, str], + Literal["epoch", "start", "start_day", "end", "end_day"], + ] = "start_day", + ): + """Resample time-series data. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> data = { + ... "timestamp_col": pd.date_range( + ... start="2021-01-01 13:00:00", periods=30, freq="1s" + ... ), + ... "int64_col": range(30), + ... } + >>> s = bpd.DataFrame(data).set_index("timestamp_col") + >>> s.resample(rule="7s", origin="epoch").min() + int64_col + 2021-01-01 12:59:56 0 + 2021-01-01 13:00:03 3 + 2021-01-01 13:00:10 10 + 2021-01-01 13:00:17 17 + 2021-01-01 13:00:24 24 + + [5 rows x 1 columns] + + Args: + rule (str): + The offset string representing target conversion. + Offsets 'ME', 'YE', 'QE', 'BME', 'BA', 'BQE', and 'W' are *not* + supported. 
+ closed (Literal['left'] | None): + Which side of bin interval is closed. The default is 'left' for + all supported frequency offsets. + label (Literal['left'] | None): + Which bin edge label to label bucket with. The default is 'left' + for all supported frequency offsets. + on (str, default None): + For a DataFrame, column to use instead of index for resampling. Column + must be datetime-like. + level (str or int, default None): + For a MultiIndex, level (name or number) to use for resampling. + level must be datetime-like. + origin (str, default 'start_day'): + The timestamp on which to adjust the grouping. Must be one of the following: + 'epoch': origin is 1970-01-01 + 'start': origin is the first value of the timeseries + 'start_day': origin is the first day at midnight of the timeseries + Origin values 'end' and 'end_day' are *not* supported. + Returns: + SeriesGroupBy: SeriesGroupBy object. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: """ Return a new Series with missing values removed.