Skip to content

Commit

Permalink
fix: make to_pandas override enable_downsampling when sampling_method…
Browse files Browse the repository at this point in the history
… is manually set. (#200)

* fix: make to_pandas override enable_downsampling when sampling_method is manually set.

* fix: make to_pandas override enable_downsampling when sampling_method is manually set.

* fix: make to_pandas override enable_downsampling when sampling_method is manually set.
  • Loading branch information
Genesis929 committed Nov 28, 2023
1 parent edd0522 commit ae03756
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 18 deletions.
41 changes: 23 additions & 18 deletions bigframes/core/blocks.py
Expand Up @@ -389,23 +389,6 @@ def to_pandas(
ordered: bool = True,
) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
"""Run query and download results as a pandas DataFrame."""
if max_download_size is None:
max_download_size = bigframes.options.sampling.max_download_size
if sampling_method is None:
sampling_method = (
bigframes.options.sampling.sampling_method
if bigframes.options.sampling.sampling_method is not None
else _UNIFORM
)
if random_state is None:
random_state = bigframes.options.sampling.random_state

sampling_method = sampling_method.lower()
if sampling_method not in _SAMPLING_METHODS:
raise NotImplementedError(
f"The downsampling method {sampling_method} is not implemented, "
f"please choose from {','.join(_SAMPLING_METHODS)}."
)

df, _, query_job = self._compute_and_count(
value_keys=value_keys,
Expand Down Expand Up @@ -453,6 +436,28 @@ def _compute_and_count(
) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
"""Run query and download results as a pandas DataFrame. Return the total number of results as well."""
# TODO(swast): Allow for dry run and timeout.
enable_downsampling = (
True
if sampling_method is not None
else bigframes.options.sampling.enable_downsampling
)

max_download_size = (
max_download_size or bigframes.options.sampling.max_download_size
)

random_state = random_state or bigframes.options.sampling.random_state

if sampling_method is None:
sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
sampling_method = sampling_method.lower()

if sampling_method not in _SAMPLING_METHODS:
raise NotImplementedError(
f"The downsampling method {sampling_method} is not implemented, "
f"please choose from {','.join(_SAMPLING_METHODS)}."
)

expr = self._apply_value_keys_to_expr(value_keys=value_keys)

results_iterator, query_job = expr.start_query(
Expand All @@ -469,7 +474,7 @@ def _compute_and_count(
)

if fraction < 1:
if not bigframes.options.sampling.enable_downsampling:
if not enable_downsampling:
raise RuntimeError(
f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"
Expand Down
11 changes: 11 additions & 0 deletions tests/system/small/test_dataframe.py
Expand Up @@ -3546,3 +3546,14 @@ def test_df_dot_operator_series(
bf_result,
pd_result,
)


def test_to_pandas_downsampling_option_override(session):
    """Check that an explicit ``sampling_method`` yields a downsampled result.

    Reads the ``batting`` test table, downloads it with a 1 MB
    ``max_download_size`` and ``sampling_method="head"``, and asserts that
    the in-memory size of the resulting pandas DataFrame is within 30% of
    the requested limit.
    """
    bf_df = session.read_gbq("bigframes-dev.bigframes_tests_sys.batting")
    limit_mb = 1

    pd_df = bf_df.to_pandas(max_download_size=limit_mb, sampling_method="head")

    # deep=True so object (string) columns are measured by their real size.
    size_in_bytes = pd_df.memory_usage(deep=True).sum()
    size_in_mb = size_in_bytes / (1024 * 1024)
    assert size_in_mb == pytest.approx(limit_mb, rel=0.3)

0 comments on commit ae03756

Please sign in to comment.