Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion bigframes/session/_io/bigquery/read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,7 @@ def get_index_cols(
| bigframes.enums.DefaultIndexKind,
*,
rename_to_schema: Optional[Dict[str, str]] = None,
default_index_type: bigframes.enums.DefaultIndexKind = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64,
) -> List[str]:
"""
If we can get a total ordering from the table, such as via primary key
Expand Down Expand Up @@ -471,7 +472,11 @@ def get_index_cols(
# find index_cols to use. This is to avoid unexpected performance and
# resource utilization because of the default sequential index. See
# internal issue 335727141.
if _is_table_clustered_or_partitioned(table) and not primary_keys:
if (
_is_table_clustered_or_partitioned(table)
and not primary_keys
and default_index_type == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64
):
msg = bfe.format_message(
f"Table '{str(table.reference)}' is clustered and/or "
"partitioned, but BigQuery DataFrames was not able to find a "
Expand Down
1 change: 1 addition & 0 deletions bigframes/session/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,6 +696,7 @@ def read_gbq_table(
table=table,
index_col=index_col,
rename_to_schema=rename_to_schema,
default_index_type=self._default_index_type,
)
_check_index_col_param(
index_cols,
Expand Down
43 changes: 43 additions & 0 deletions tests/unit/session/test_read_gbq_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,13 @@
"""Unit tests for read_gbq_table helper functions."""

import unittest.mock as mock
import warnings

import google.cloud.bigquery
import pytest

import bigframes.enums
import bigframes.exceptions
import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table
from bigframes.testing import mocks

Expand Down Expand Up @@ -143,3 +146,43 @@ def test_check_if_index_columns_are_unique(index_cols, values_distinct, expected
)

assert result == expected


def test_get_index_cols_warns_if_clustered_but_sequential_index():
    """A clustered table with no usable index must warn for SEQUENTIAL_INT64
    defaults but stay silent for NULL defaults (partial ordering mode).

    See: internal issue b/356872356.
    """
    api_repr = {
        "tableReference": {
            "projectId": "my-project",
            "datasetId": "my_dataset",
            "tableId": "my_table",
        },
        "clustering": {
            "fields": ["col1", "col2"],
        },
    }
    clustered_table = google.cloud.bigquery.Table.from_api_repr(api_repr)
    clustered_table.schema = tuple(
        google.cloud.bigquery.SchemaField(name, "INT64")
        for name in ("col1", "col2", "col3", "col4")
    )

    # Default sequential index on a clustered table: expect a warning about
    # potential performance / resource usage.
    with pytest.warns(bigframes.exceptions.DefaultIndexWarning, match="is clustered"):
        bf_read_gbq_table.get_index_cols(
            clustered_table,
            index_col=(),
            default_index_type=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64,
        )

    # NULL default index (e.g. partial ordering mode) must not warn; escalate
    # the warning to an error so any occurrence fails the test.
    with warnings.catch_warnings():
        warnings.simplefilter(
            "error", category=bigframes.exceptions.DefaultIndexWarning
        )
        bf_read_gbq_table.get_index_cols(
            clustered_table,
            index_col=(),
            default_index_type=bigframes.enums.DefaultIndexKind.NULL,
        )