fix: make to_pandas override enable_downsampling when sampling_method is manually set. (#200)

* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
googleapis · Nov 28, 2023 · ae03756 · ae03756
1 parent edd0522
commit ae03756
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 18 deletions.
diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
@@ -389,23 +389,6 @@ def to_pandas(
         ordered: bool = True,
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame."""
-        if max_download_size is None:
-            max_download_size = bigframes.options.sampling.max_download_size
-        if sampling_method is None:
-            sampling_method = (
-                bigframes.options.sampling.sampling_method
-                if bigframes.options.sampling.sampling_method is not None
-                else _UNIFORM
-            )
-        if random_state is None:
-            random_state = bigframes.options.sampling.random_state
-
-        sampling_method = sampling_method.lower()
-        if sampling_method not in _SAMPLING_METHODS:
-            raise NotImplementedError(
-                f"The downsampling method {sampling_method} is not implemented, "
-                f"please choose from {','.join(_SAMPLING_METHODS)}."
-            )
 
         df, _, query_job = self._compute_and_count(
             value_keys=value_keys,
@@ -453,6 +436,28 @@ def _compute_and_count(
     ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
+        enable_downsampling = (
+            True
+            if sampling_method is not None
+            else bigframes.options.sampling.enable_downsampling
+        )
+
+        max_download_size = (
+            max_download_size or bigframes.options.sampling.max_download_size
+        )
+
+        random_state = random_state or bigframes.options.sampling.random_state
+
+        if sampling_method is None:
+            sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
+        sampling_method = sampling_method.lower()
+
+        if sampling_method not in _SAMPLING_METHODS:
+            raise NotImplementedError(
+                f"The downsampling method {sampling_method} is not implemented, "
+                f"please choose from {','.join(_SAMPLING_METHODS)}."
+            )
+
         expr = self._apply_value_keys_to_expr(value_keys=value_keys)
 
         results_iterator, query_job = expr.start_query(
@@ -469,7 +474,7 @@ def _compute_and_count(
         )
 
         if fraction < 1:
-            if not bigframes.options.sampling.enable_downsampling:
+            if not enable_downsampling:
                 raise RuntimeError(
                     f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
                     f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"

diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py
@@ -3546,3 +3546,14 @@ def test_df_dot_operator_series(
         bf_result,
         pd_result,
     )
+
+
+def test_to_pandas_downsampling_option_override(session):
+    df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
+    download_size = 1  # cap in MB, passed as max_download_size below
+
+    df = df.to_pandas(max_download_size=download_size, sampling_method="head")  # explicit sampling_method should turn downsampling on
+
+    total_memory_bytes = df.memory_usage(deep=True).sum()
+    total_memory_mb = total_memory_bytes / (1024 * 1024)  # bytes -> MB (1024 * 1024 bytes each)
+    assert total_memory_mb == pytest.approx(download_size, rel=0.3)  # downloaded frame must be within 30% of the cap