fix: allow multi-part dataset IDs to support BigLake tables (#17137)

tswast · web-flow · commit f93911c0a7f1 · 2026-05-14T14:45:17.000-07:00
Relaxes DatasetReference.from_string and TableReference.from_string validation. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/google-cloud-python/issues) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/512823729 🦕
diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py b/packages/google-cloud-bigquery/google/cloud/bigquery/_string_references.py
@@ -0,0 +1,166 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helper to turn string references into REST resources."""
+
+# TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes.
+
+from __future__ import annotations
+
+import re
+from typing import TypedDict, Union
+
+
+ParsedDatasetReference = TypedDict(
+    "ParsedDatasetReference",
+    {
+        "projectId": str,  # May carry a legacy "domain:" prefix.
+        "datasetId": str,  # May contain "." (catalog + namespace).
+    },
+)
+
+
+ParsedTableReference = TypedDict(
+    "ParsedTableReference",
+    {
+        "projectId": str,  # May carry a legacy "domain:" prefix.
+        "datasetId": str,  # May contain "." (catalog + namespace).
+        "tableId": str,  # Last dot-separated part; never contains ".".
+    },
+)
+
+
+_FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN = re.compile(
+    # In the past, organizations could prefix their project IDs with a domain
+    # name. Such projects still exist, especially at Google.
+    r"^(?P<legacy_project_domain>[^:]+:)?"
+    r"(?P<project>[^.]+)\."
+    # Match dataset or catalog + namespace.
+    #
+    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. There is
+    # no table group here, so every later "." stays inside this group, which
+    # runs to the end of the line ("." does not match a newline).
+    r"(?P<inner_parts>.*)"
+)
+
+
+_FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN = re.compile(
+    # In the past, organizations could prefix their project IDs with a domain
+    # name. Such projects still exist, especially at Google.
+    r"^(?P<legacy_project_domain>[^:]+:)?"
+    r"(?P<project>[^.]+)\."
+    # Match dataset or catalog + namespace.
+    #
+    # Namespace could be arbitrarily deeply nested in Iceberg/BigLake. Support
+    # this without catastrophic backtracking by using a single greedy group
+    # and matching the final "." separator together with the table part.
+    r"(?P<inner_parts>.*)"
+    # Table names can't contain ".", as that's used as the separator.
+    r"\.(?P<table>[^.]+)$"
+)
+
+
+_RELATIVE_TABLE_REFERENCE_PATTERN = re.compile(
+    # No project part; paired with a default project by the caller.
+    #
+    # Match dataset or catalog + namespace, which could be arbitrarily deeply
+    # nested in Iceberg/BigLake. A single greedy group avoids catastrophic
+    # backtracking; the final "." is matched together with the table part.
+    r"(?P<inner_parts>.*)"
+    # Table names can't contain ".", as that's used as the separator.
+    r"\.(?P<table>[^.]+)$"
+)
+
+
+def parse_dataset_reference(
+    dataset_id: str, *, default_project: Union[str, None]
+) -> ParsedDatasetReference:
+    """Split a dataset ID string into its project and dataset components.
+
+    Args:
+        dataset_id: ``project.dataset`` (the dataset part may itself contain
+            "."), or any other string to pair with ``default_project``.
+        default_project: Project used when no project can be read from
+            ``dataset_id``.
+
+    Returns:
+        ParsedDatasetReference: A typed dictionary (to avoid circular dependencies).
+
+    Raises:
+        ValueError: When a fully-qualified dataset ID can't be determined.
+    """
+    qualified = _FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN.match(dataset_id)
+    if qualified is None:
+        # No "project." prefix found; fall back to the default project.
+        if not default_project:
+            raise ValueError(
+                "When default_project is not set, dataset_id must be a "
+                "fully-qualified dataset ID in standard SQL format, "
+                'e.g., "project.dataset_id" got {}'.format(dataset_id)
+            )
+        return {"datasetId": dataset_id, "projectId": default_project}
+
+    # A legacy domain prefix (trailing ":" included) stays part of the project ID.
+    domain_prefix = qualified.group("legacy_project_domain") or ""
+    return {
+        "projectId": domain_prefix + qualified.group("project"),
+        "datasetId": qualified.group("inner_parts"),
+    }
+
+
+def parse_table_reference(
+    table_id: str, *, default_project: Union[str, None]
+) -> ParsedTableReference:
+    """Parse a table ID string.
+
+    Args:
+        table_id: ``project.dataset.table``, where the dataset part may contain
+            more "."; a two-part ``dataset.table`` needs ``default_project``.
+        default_project: Project used when ``table_id`` has only two parts.
+
+    Returns:
+        ParsedTableReference: A typed dictionary (to avoid circular dependencies).
+
+    Raises:
+        ValueError: When a fully-qualified table ID can't be determined.
+    """
+    regex_match = _FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN.match(table_id)
+    if regex_match:
+        # A legacy domain prefix (with its ":") stays part of the project ID.
+        legacy_project_domain = regex_match.group("legacy_project_domain") or ""
+        return {
+            "projectId": legacy_project_domain + regex_match.group("project"),
+            "datasetId": regex_match.group("inner_parts"),
+            "tableId": regex_match.group("table"),
+        }
+
+    if not default_project:
+        raise ValueError(
+            "Could not determine project ID. Supply a default project or a fully-qualified table ID, "
+            f"such as 'project.dataset.table'. Got {table_id}."
+        )
+
+    regex_match = _RELATIVE_TABLE_REFERENCE_PATTERN.match(table_id)
+    if not regex_match:
+        raise ValueError(
+            # Keep the trailing space: adjacent literals are concatenated as-is.
+            "Could not parse table_id. Expected a table ID "
+            f"such as 'project.dataset.table', but got {table_id}."
+        )
+
+    return {
+        "projectId": default_project,
+        "datasetId": regex_match.group("inner_parts"),
+        "tableId": regex_match.group("table"),
+    }
diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py b/packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py
@@ -30,6 +30,7 @@
 from google.cloud.bigquery.table import Table, TableReference
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
 from google.cloud.bigquery import external_config
+from google.cloud.bigquery import _string_references
 
 
 def _get_table_reference(self, table_id: str) -> TableReference:
@@ -123,7 +124,9 @@ def path(self):
     routine = _get_routine_reference
 
     @classmethod
-    def from_api_repr(cls, resource: dict) -> "DatasetReference":
+    def from_api_repr(
+        cls, resource: Union[dict, _string_references.ParsedDatasetReference]
+    ) -> "DatasetReference":
         """Factory: construct a dataset reference given its API representation
 
         Args:
@@ -166,28 +169,12 @@ def from_string(
                 If ``dataset_id`` is not a fully-qualified dataset ID in
                 standard SQL format.
         """
-        output_dataset_id = dataset_id
-        parts = _helpers._split_id(dataset_id)
-
-        if len(parts) == 1:
-            if default_project is not None:
-                output_project_id = default_project
-            else:
-                raise ValueError(
-                    "When default_project is not set, dataset_id must be a "
-                    "fully-qualified dataset ID in standard SQL format, "
-                    'e.g., "project.dataset_id" got {}'.format(dataset_id)
-                )
-        elif len(parts) == 2:
-            output_project_id, output_dataset_id = parts
-        else:
-            raise ValueError(
-                "Too many parts in dataset_id. Expected a fully-qualified "
-                "dataset ID in standard SQL format, "
-                'e.g. "project.dataset_id", got {}'.format(dataset_id)
+        return cls.from_api_repr(
+            _string_references.parse_dataset_reference(
+                dataset_id=dataset_id,
+                default_project=default_project,
             )
-
-        return cls(output_project_id, output_dataset_id)
+        )
 
     def to_api_repr(self) -> dict:
         """Construct the API resource representation of this dataset reference
diff --git a/packages/google-cloud-bigquery/google/cloud/bigquery/table.py b/packages/google-cloud-bigquery/google/cloud/bigquery/table.py
@@ -72,6 +72,7 @@
 from google.cloud.bigquery.schema import _parse_schema_resource
 from google.cloud.bigquery.schema import _to_schema_fields
 from google.cloud.bigquery import external_config
+from google.cloud.bigquery import _string_references
 
 if typing.TYPE_CHECKING:  # pragma: NO COVER
     # Unconditionally import optional dependencies again to tell pytype that
@@ -281,22 +282,17 @@ def from_string(
                 If ``table_id`` is not a fully-qualified table ID in
                 standard SQL format.
         """
-        from google.cloud.bigquery.dataset import DatasetReference
-
-        (
-            output_project_id,
-            output_dataset_id,
-            output_table_id,
-        ) = _helpers._parse_3_part_id(
-            table_id, default_project=default_project, property_name="table_id"
-        )
-
-        return cls(
-            DatasetReference(output_project_id, output_dataset_id), output_table_id
+        return cls.from_api_repr(
+            _string_references.parse_table_reference(
+                table_id=table_id,
+                default_project=default_project,
+            )
         )
 
     @classmethod
-    def from_api_repr(cls, resource: dict) -> "TableReference":
+    def from_api_repr(
+        cls, resource: Union[dict, _string_references.ParsedTableReference]
+    ) -> "TableReference":
         """Factory:  construct a table reference given its API representation
 
         Args:
diff --git a/packages/google-cloud-bigquery/tests/system/test_client.py b/packages/google-cloud-bigquery/tests/system/test_client.py
@@ -304,6 +304,18 @@ def test_get_dataset(self):
         self.assertEqual(got.friendly_name, "Friendly")
         self.assertEqual(got.description, "Description")
 
+    def test_get_dataset_w_public_biglake(self):
+        dataset_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data"
+        # Everything after the project is expected back as one dotted dataset ID.
+        dataset = Config.CLIENT.get_dataset(dataset_id)
+        self.assertEqual(
+            dataset.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
+        )
+        self.assertEqual(dataset.project, "bigquery-public-data")
+        self.assertGreater(
+            dataset.created, datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)
+        )
+
     def test_create_dataset_with_default_rounding_mode(self):
         DATASET_ID = _make_dataset_id("create_dataset_rounding_mode")
         dataset = self.temp_dataset(DATASET_ID, default_rounding_mode="ROUND_HALF_EVEN")
@@ -693,6 +705,18 @@ def test_delete_dataset_delete_contents_false(self):
         with self.assertRaises(exceptions.BadRequest):
             Config.CLIENT.delete_dataset(dataset)
 
+    def test_get_table_w_public_biglake(self):
+        table_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab"
+        # Last dotted part is the table; the middle parts form the dataset ID.
+        table = Config.CLIENT.get_table(table_id)
+        self.assertEqual(table.table_id, "nyc_taxicab")
+        self.assertEqual(
+            table.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
+        )
+        self.assertEqual(table.project, "bigquery-public-data")
+        schema_names = [field.name for field in table.schema]
+        self.assertGreater(len(schema_names), 0)
+
     def test_get_table_w_public_dataset(self):
         public = "bigquery-public-data"
         dataset_id = "samples"
diff --git a/packages/google-cloud-bigquery/tests/unit/test_dataset.py b/packages/google-cloud-bigquery/tests/unit/test_dataset.py
@@ -820,54 +820,6 @@ def test_from_api_repr(self):
 
         self.assertEqual(expected, got)
 
-    def test_from_string(self):
-        cls = self._get_target_class()
-        got = cls.from_string("string-project.string_dataset")
-        self.assertEqual(got.project, "string-project")
-        self.assertEqual(got.dataset_id, "string_dataset")
-
-    def test_from_string_w_prefix(self):
-        cls = self._get_target_class()
-        got = cls.from_string("google.com:string-project.string_dataset")
-        self.assertEqual(got.project, "google.com:string-project")
-        self.assertEqual(got.dataset_id, "string_dataset")
-
-    def test_from_string_legacy_string(self):
-        cls = self._get_target_class()
-        with self.assertRaises(ValueError):
-            cls.from_string("string-project:string_dataset")
-
-    def test_from_string_w_incorrect_prefix(self):
-        cls = self._get_target_class()
-        with self.assertRaises(ValueError):
-            cls.from_string("google.com.string-project.dataset_id")
-
-    def test_from_string_w_prefix_and_too_many_parts(self):
-        cls = self._get_target_class()
-        with self.assertRaises(ValueError):
-            cls.from_string("google.com:string-project.dataset_id.table_id")
-
-    def test_from_string_not_fully_qualified(self):
-        cls = self._get_target_class()
-        with self.assertRaises(ValueError):
-            cls.from_string("string_dataset")
-        with self.assertRaises(ValueError):
-            cls.from_string("a.b.c")
-
-    def test_from_string_with_default_project(self):
-        cls = self._get_target_class()
-        got = cls.from_string("string_dataset", default_project="default-project")
-        self.assertEqual(got.project, "default-project")
-        self.assertEqual(got.dataset_id, "string_dataset")
-
-    def test_from_string_ignores_default_project(self):
-        cls = self._get_target_class()
-        got = cls.from_string(
-            "string-project.string_dataset", default_project="default-project"
-        )
-        self.assertEqual(got.project, "string-project")
-        self.assertEqual(got.dataset_id, "string_dataset")
-
     def test___eq___wrong_type(self):
         dataset = self._make_one("project_1", "dataset_1")
         other = object()
diff --git a/packages/google-cloud-bigquery/tests/unit/test_magics.py b/packages/google-cloud-bigquery/tests/unit/test_magics.py
@@ -1337,12 +1337,13 @@ def test_context_with_no_query_cache_from_context(monkeypatch):
     ip = IPython.get_ipython()
     monkeypatch.setattr(bigquery, "bigquery_magics", None)
     bigquery.load_ipython_extension(ip)
+    context = magics.Context()
     conn = make_connection()
-    monkeypatch.setattr(magics.context, "_connection", conn)
-    monkeypatch.setattr(magics.context, "project", "project-from-context")
-    monkeypatch.setattr(
-        magics.context.default_query_job_config, "use_query_cache", False
-    )
+    context._connection = conn
+    context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
+    context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False)
+    context.project = "project-from-context"
+    monkeypatch.setattr(magics, "context", context)
 
     ip.run_cell_magic("bigquery", "", QUERY_STRING)
 
@@ -1415,12 +1416,17 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch):
     ip = IPython.get_ipython()
     monkeypatch.setattr(bigquery, "bigquery_magics", None)
     bigquery.load_ipython_extension(ip)
-    magics.context.progress_bar_type = None
+    context = magics.Context()
+    conn = make_connection()
+    context._connection = conn
+    context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
+    context.progress_bar_type = None
+    context.project = "unit-test-project"
+    monkeypatch.setattr(magics, "context", context)
 
     run_query_patch = mock.patch(
         "google.cloud.bigquery.magics.magics._run_query", autospec=True
     )
-    magics.context.project = "unit-test-project"
 
     with run_query_patch as run_query_mock:
         ip.run_cell_magic(
@@ -2045,7 +2051,7 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch):
     # considered a table name, thus we expect an error that the table ID is not valid.
     output = captured_io.stderr
     assert "ERROR:" in output
-    assert "must be a fully-qualified ID" in output
+    assert "Could not parse table_id." in output
 
 
 @pytest.mark.usefixtures("ipython_interactive")
diff --git a/packages/google-cloud-bigquery/tests/unit/test_string_references.py b/packages/google-cloud-bigquery/tests/unit/test_string_references.py
diff --git a/packages/google-cloud-bigquery/tests/unit/test_table.py b/packages/google-cloud-bigquery/tests/unit/test_table.py