From ae03756f5ee45e0e74e0c0bdd4777e018eba2273 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 28 Nov 2023 12:53:51 -0800 Subject: [PATCH] fix: make to_pandas override enable_downsampling when sampling_method is manually set. (#200) * fix: make to_pandas override enable_downsampling when sampling_method is manually set. * fix: make to_pandas override enable_downsampling when sampling_method is manually set. * fix: make to_pandas override enable_downsampling when sampling_method is manually set. --- bigframes/core/blocks.py | 41 ++++++++++++++++------------ tests/system/small/test_dataframe.py | 11 ++++++++ 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f1113d938..34913872e 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -389,23 +389,6 @@ def to_pandas( ordered: bool = True, ) -> Tuple[pd.DataFrame, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame.""" - if max_download_size is None: - max_download_size = bigframes.options.sampling.max_download_size - if sampling_method is None: - sampling_method = ( - bigframes.options.sampling.sampling_method - if bigframes.options.sampling.sampling_method is not None - else _UNIFORM - ) - if random_state is None: - random_state = bigframes.options.sampling.random_state - - sampling_method = sampling_method.lower() - if sampling_method not in _SAMPLING_METHODS: - raise NotImplementedError( - f"The downsampling method {sampling_method} is not implemented, " - f"please choose from {','.join(_SAMPLING_METHODS)}." - ) df, _, query_job = self._compute_and_count( value_keys=value_keys, @@ -453,6 +436,28 @@ def _compute_and_count( ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: """Run query and download results as a pandas DataFrame. Return the total number of results as well.""" # TODO(swast): Allow for dry run and timeout. + enable_downsampling = ( + True + if sampling_method is not None + else bigframes.options.sampling.enable_downsampling + ) + + max_download_size = ( + max_download_size or bigframes.options.sampling.max_download_size + ) + + random_state = random_state or bigframes.options.sampling.random_state + + if sampling_method is None: + sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM + sampling_method = sampling_method.lower() + + if sampling_method not in _SAMPLING_METHODS: + raise NotImplementedError( + f"The downsampling method {sampling_method} is not implemented, " + f"please choose from {','.join(_SAMPLING_METHODS)}." + ) + expr = self._apply_value_keys_to_expr(value_keys=value_keys) results_iterator, query_job = expr.start_query( @@ -469,7 +474,7 @@ def _compute_and_count( ) if fraction < 1: - if not bigframes.options.sampling.enable_downsampling: + if not enable_downsampling: raise RuntimeError( f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of " f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n" diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e25e9ce50..9b9567418 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -3546,3 +3546,14 @@ def test_df_dot_operator_series( bf_result, pd_result, ) + + +def test_to_pandas_downsampling_option_override(session): + df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting") + download_size = 1 + + df = df.to_pandas(max_download_size=download_size, sampling_method="head") + + total_memory_bytes = df.memory_usage(deep=True).sum() + total_memory_mb = total_memory_bytes / (1024 * 1024) + assert total_memory_mb == pytest.approx(download_size, rel=0.3)