diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py
index 465fa08187..e12fe502c0 100644
--- a/bigframes/session/_io/bigquery/read_gbq_table.py
+++ b/bigframes/session/_io/bigquery/read_gbq_table.py
@@ -402,6 +402,7 @@ def get_index_cols(
     | bigframes.enums.DefaultIndexKind,
     *,
     rename_to_schema: Optional[Dict[str, str]] = None,
+    default_index_type: bigframes.enums.DefaultIndexKind = bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64,
 ) -> List[str]:
     """
     If we can get a total ordering from the table, such as via primary key
@@ -471,7 +472,11 @@ def get_index_cols(
         # find index_cols to use. This is to avoid unexpected performance and
         # resource utilization because of the default sequential index. See
         # internal issue 335727141.
-        if _is_table_clustered_or_partitioned(table) and not primary_keys:
+        if (
+            _is_table_clustered_or_partitioned(table)
+            and not primary_keys
+            and default_index_type == bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64
+        ):
             msg = bfe.format_message(
                 f"Table '{str(table.reference)}' is clustered and/or "
                 "partitioned, but BigQuery DataFrames was not able to find a "
diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index 4e67eac9ae..1bebd460a9 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -696,6 +696,7 @@ def read_gbq_table(
             table=table,
             index_col=index_col,
             rename_to_schema=rename_to_schema,
+            default_index_type=self._default_index_type,
         )
         _check_index_col_param(
             index_cols,
diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py
index d21f0000a9..ce9b587d6b 100644
--- a/tests/unit/session/test_read_gbq_table.py
+++ b/tests/unit/session/test_read_gbq_table.py
@@ -15,10 +15,13 @@
 """Unit tests for read_gbq_table helper functions."""
 
 import unittest.mock as mock
+import warnings
 
 import google.cloud.bigquery
 import pytest
 
+import bigframes.enums
+import bigframes.exceptions
 import bigframes.session._io.bigquery.read_gbq_table as bf_read_gbq_table
 from bigframes.testing import mocks
 
@@ -143,3 +146,43 @@ def test_check_if_index_columns_are_unique(index_cols, values_distinct, expected
     )
 
     assert result == expected
+
+
+def test_get_index_cols_warns_if_clustered_but_sequential_index():
+    table = google.cloud.bigquery.Table.from_api_repr(
+        {
+            "tableReference": {
+                "projectId": "my-project",
+                "datasetId": "my_dataset",
+                "tableId": "my_table",
+            },
+            "clustering": {
+                "fields": ["col1", "col2"],
+            },
+        },
+    )
+    table.schema = (
+        google.cloud.bigquery.SchemaField("col1", "INT64"),
+        google.cloud.bigquery.SchemaField("col2", "INT64"),
+        google.cloud.bigquery.SchemaField("col3", "INT64"),
+        google.cloud.bigquery.SchemaField("col4", "INT64"),
+    )
+
+    with pytest.warns(bigframes.exceptions.DefaultIndexWarning, match="is clustered"):
+        bf_read_gbq_table.get_index_cols(
+            table,
+            index_col=(),
+            default_index_type=bigframes.enums.DefaultIndexKind.SEQUENTIAL_INT64,
+        )
+
+    # Ensure that we don't raise if using a NULL index by default, such as in
+    # partial ordering mode. See: internal issue b/356872356.
+    with warnings.catch_warnings():
+        warnings.simplefilter(
+            "error", category=bigframes.exceptions.DefaultIndexWarning
+        )
+        bf_read_gbq_table.get_index_cols(
+            table,
+            index_col=(),
+            default_index_type=bigframes.enums.DefaultIndexKind.NULL,
+        )