Skip to content

Commit f93911c

Browse files
authored
fix: allow multi-part dataset IDs to support BigLake tables (#17137)
Relaxes DatasetReference.from_string and TableReference.from_string validation. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/google-cloud-python/issues) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/512823729 🦕
1 parent 3624f3b commit f93911c

8 files changed

Lines changed: 405 additions & 145 deletions

File tree

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Helper to turn string references into REST resources."""
16+
17+
# TODO(b/513204277): Consolidate these transformations with pandas-gbq and bigframes.
18+
19+
from __future__ import annotations
20+
21+
import re
22+
from typing import TypedDict, Union
23+
24+
25+
class ParsedDatasetReference(TypedDict):
    """Dictionary form of a parsed dataset reference.

    The keys are ``projectId`` and ``datasetId``. A plain typed dictionary is
    used (rather than a reference class) to avoid circular dependencies.
    """

    projectId: str
    datasetId: str


class ParsedTableReference(TypedDict):
    """Dictionary form of a parsed table reference.

    The keys are ``projectId``, ``datasetId`` and ``tableId``. A plain typed
    dictionary is used (rather than a reference class) to avoid circular
    dependencies.
    """

    projectId: str
    datasetId: str
    tableId: str
42+
43+
44+
# Shared pattern fragments. The compiled patterns below are assembled from
# these so that the pieces they have in common are spelled only once.

# In the past, organizations could prefix their project IDs with a domain
# name. Such projects still exist, especially at Google, so an optional
# "domain:" prefix is captured separately from the project. The project itself
# is everything up to (not including) the first "." that follows.
_PROJECT_FRAGMENT = r"^(?P<legacy_project_domain>[^:]+:)?" r"(?P<project>[^.]+)\."

# Dataset, or catalog + namespace. Namespaces could be arbitrarily deeply
# nested in Iceberg/BigLake, so this is a single greedy group that may itself
# contain "." rather than a repeated per-part group; the "." separating it
# from a table name belongs to the table fragment. That shape supports deep
# nesting without catastrophic backtracking.
_INNER_PARTS_FRAGMENT = r"(?P<inner_parts>.*)"

# Table names can't contain ".", as that's used as the separator.
_TABLE_FRAGMENT = r"\.(?P<table>[^.]+)$"


# "[domain:]project.<dataset or catalog.namespace...>"
_FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN = re.compile(
    _PROJECT_FRAGMENT + _INNER_PARTS_FRAGMENT
)


# "[domain:]project.<dataset or catalog.namespace...>.table"
_FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN = re.compile(
    _PROJECT_FRAGMENT + _INNER_PARTS_FRAGMENT + _TABLE_FRAGMENT
)


# "<dataset or catalog.namespace...>.table" (the project comes from elsewhere)
_RELATIVE_TABLE_REFERENCE_PATTERN = re.compile(
    _INNER_PARTS_FRAGMENT + _TABLE_FRAGMENT
)
84+
85+
86+
def parse_dataset_reference(
    dataset_id: str, *, default_project: Union[str, None]
) -> ParsedDatasetReference:
    """Parse a dataset ID string.

    A string that matches the fully-qualified dataset pattern (an optional
    legacy ``domain:`` prefix, a project, a ``.``, and the remainder) is split
    into project and dataset; the remainder may itself contain ``.`` and is
    returned unchanged as the dataset ID. Any other string is treated as a
    bare dataset ID belonging to ``default_project``.

    Args:
        dataset_id: The dataset ID to parse.
        default_project: Project to use when ``dataset_id`` is not fully
            qualified. Ignored when ``dataset_id`` is fully qualified.

    Returns:
        ParsedDatasetReference: A typed dictionary (to avoid circular dependencies).

    Raises:
        ValueError: When a fully-qualified dataset ID can't be determined.
    """
    qualified = _FULLY_QUALIFIED_DATASET_REFERENCE_PATTERN.match(dataset_id)

    if qualified is None:
        # Not fully qualified: the whole string is the dataset ID, which is
        # only usable if the caller supplied a project to attach it to.
        if default_project:
            return {"datasetId": dataset_id, "projectId": default_project}

        message = (
            "When default_project is not set, dataset_id must be a "
            "fully-qualified dataset ID in standard SQL format, "
            'e.g., "project.dataset_id" got {}'.format(dataset_id)
        )
        raise ValueError(message)

    # The captured legacy prefix already includes its trailing ":", so it can
    # be glued straight onto the project; it is None when absent.
    domain_prefix = qualified.group("legacy_project_domain") or ""
    return {
        "projectId": domain_prefix + qualified.group("project"),
        "datasetId": qualified.group("inner_parts"),
    }
120+
121+
122+
def parse_table_reference(
    table_id: str, *, default_project: Union[str, None]
) -> ParsedTableReference:
    """Parse a table ID string.

    The string is first tried against the fully-qualified table pattern (an
    optional legacy ``domain:`` prefix, a project, a ``.``, the dataset or
    catalog + namespace parts, a ``.``, and the table). When that matches,
    the leading part is always taken as the project, even if
    ``default_project`` is supplied. Otherwise the string is parsed as a
    relative ``<dataset parts>.<table>`` ID under ``default_project``.

    Args:
        table_id: The table ID to parse.
        default_project: Project to use when ``table_id`` is not fully
            qualified. Ignored when ``table_id`` is fully qualified.

    Returns:
        ParsedTableReference: A typed dictionary (to avoid circular dependencies).

    Raises:
        ValueError: When a fully-qualified table ID can't be determined,
            either because no project is available or because ``table_id``
            has no ``.``-separated table part.
    """
    regex_match = _FULLY_QUALIFIED_TABLE_REFERENCE_PATTERN.match(table_id)
    if regex_match:
        legacy_project_domain = regex_match.group("legacy_project_domain")
        project = regex_match.group("project")

        # The captured legacy prefix already ends with ":", so plain
        # concatenation rebuilds the "domain:project" form.
        if legacy_project_domain:
            output_project_id = f"{legacy_project_domain}{project}"
        else:
            output_project_id = project

        return {
            "projectId": output_project_id,
            "datasetId": regex_match.group("inner_parts"),
            "tableId": regex_match.group("table"),
        }

    if not default_project:
        raise ValueError(
            "Could not determine project ID. Supply a default project or a fully-qualified table ID, "
            f"such as 'project.dataset.table'. Got {table_id}."
        )

    regex_match = _RELATIVE_TABLE_REFERENCE_PATTERN.match(table_id)
    if not regex_match:
        # The adjacent literals are concatenated by the parser, so the first
        # one must end with a space to keep "table ID such as" readable.
        raise ValueError(
            "Could not parse table_id. Expected a table ID "
            f"such as 'project.dataset.table', but got {table_id}."
        )

    return {
        "projectId": default_project,
        "datasetId": regex_match.group("inner_parts"),
        "tableId": regex_match.group("table"),
    }

packages/google-cloud-bigquery/google/cloud/bigquery/dataset.py

Lines changed: 9 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from google.cloud.bigquery.table import Table, TableReference
3131
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
3232
from google.cloud.bigquery import external_config
33+
from google.cloud.bigquery import _string_references
3334

3435

3536
def _get_table_reference(self, table_id: str) -> TableReference:
@@ -123,7 +124,9 @@ def path(self):
123124
routine = _get_routine_reference
124125

125126
@classmethod
126-
def from_api_repr(cls, resource: dict) -> "DatasetReference":
127+
def from_api_repr(
128+
cls, resource: Union[dict, _string_references.ParsedDatasetReference]
129+
) -> "DatasetReference":
127130
"""Factory: construct a dataset reference given its API representation
128131
129132
Args:
@@ -166,28 +169,12 @@ def from_string(
166169
If ``dataset_id`` is not a fully-qualified dataset ID in
167170
standard SQL format.
168171
"""
169-
output_dataset_id = dataset_id
170-
parts = _helpers._split_id(dataset_id)
171-
172-
if len(parts) == 1:
173-
if default_project is not None:
174-
output_project_id = default_project
175-
else:
176-
raise ValueError(
177-
"When default_project is not set, dataset_id must be a "
178-
"fully-qualified dataset ID in standard SQL format, "
179-
'e.g., "project.dataset_id" got {}'.format(dataset_id)
180-
)
181-
elif len(parts) == 2:
182-
output_project_id, output_dataset_id = parts
183-
else:
184-
raise ValueError(
185-
"Too many parts in dataset_id. Expected a fully-qualified "
186-
"dataset ID in standard SQL format, "
187-
'e.g. "project.dataset_id", got {}'.format(dataset_id)
172+
return cls.from_api_repr(
173+
_string_references.parse_dataset_reference(
174+
dataset_id=dataset_id,
175+
default_project=default_project,
188176
)
189-
190-
return cls(output_project_id, output_dataset_id)
177+
)
191178

192179
def to_api_repr(self) -> dict:
193180
"""Construct the API resource representation of this dataset reference

packages/google-cloud-bigquery/google/cloud/bigquery/table.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
from google.cloud.bigquery.schema import _parse_schema_resource
7373
from google.cloud.bigquery.schema import _to_schema_fields
7474
from google.cloud.bigquery import external_config
75+
from google.cloud.bigquery import _string_references
7576

7677
if typing.TYPE_CHECKING: # pragma: NO COVER
7778
# Unconditionally import optional dependencies again to tell pytype that
@@ -281,22 +282,17 @@ def from_string(
281282
If ``table_id`` is not a fully-qualified table ID in
282283
standard SQL format.
283284
"""
284-
from google.cloud.bigquery.dataset import DatasetReference
285-
286-
(
287-
output_project_id,
288-
output_dataset_id,
289-
output_table_id,
290-
) = _helpers._parse_3_part_id(
291-
table_id, default_project=default_project, property_name="table_id"
292-
)
293-
294-
return cls(
295-
DatasetReference(output_project_id, output_dataset_id), output_table_id
285+
return cls.from_api_repr(
286+
_string_references.parse_table_reference(
287+
table_id=table_id,
288+
default_project=default_project,
289+
)
296290
)
297291

298292
@classmethod
299-
def from_api_repr(cls, resource: dict) -> "TableReference":
293+
def from_api_repr(
294+
cls, resource: Union[dict, _string_references.ParsedTableReference]
295+
) -> "TableReference":
300296
"""Factory: construct a table reference given its API representation
301297
302298
Args:

packages/google-cloud-bigquery/tests/system/test_client.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,18 @@ def test_get_dataset(self):
304304
self.assertEqual(got.friendly_name, "Friendly")
305305
self.assertEqual(got.description, "Description")
306306

307+
def test_get_dataset_w_public_biglake(self):
    """Fetch a public BigLake dataset whose dataset ID has multiple parts."""
    # Everything after the project is expected to come back as the dataset ID.
    biglake_dataset_id = (
        "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data"
    )
    created_after = datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc)

    fetched = Config.CLIENT.get_dataset(biglake_dataset_id)

    self.assertEqual(
        fetched.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
    )
    self.assertEqual(fetched.project, "bigquery-public-data")
    self.assertGreater(fetched.created, created_after)
318+
307319
def test_create_dataset_with_default_rounding_mode(self):
308320
DATASET_ID = _make_dataset_id("create_dataset_rounding_mode")
309321
dataset = self.temp_dataset(DATASET_ID, default_rounding_mode="ROUND_HALF_EVEN")
@@ -693,6 +705,18 @@ def test_delete_dataset_delete_contents_false(self):
693705
with self.assertRaises(exceptions.BadRequest):
694706
Config.CLIENT.delete_dataset(dataset)
695707

708+
def test_get_table_w_public_biglake(self):
    """Fetch a public BigLake table whose dataset ID has multiple parts."""
    # The last part is the table; the parts between the project and the
    # table are expected to come back together as the dataset ID.
    biglake_table_id = "bigquery-public-data.biglake-public-nyc-taxi-iceberg.public_data.nyc_taxicab"

    fetched = Config.CLIENT.get_table(biglake_table_id)

    self.assertEqual(fetched.table_id, "nyc_taxicab")
    self.assertEqual(
        fetched.dataset_id, "biglake-public-nyc-taxi-iceberg.public_data"
    )
    self.assertEqual(fetched.project, "bigquery-public-data")
    column_names = [column.name for column in fetched.schema]
    self.assertGreater(len(column_names), 0)
719+
696720
def test_get_table_w_public_dataset(self):
697721
public = "bigquery-public-data"
698722
dataset_id = "samples"

packages/google-cloud-bigquery/tests/unit/test_dataset.py

Lines changed: 0 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -820,54 +820,6 @@ def test_from_api_repr(self):
820820

821821
self.assertEqual(expected, got)
822822

823-
def test_from_string(self):
824-
cls = self._get_target_class()
825-
got = cls.from_string("string-project.string_dataset")
826-
self.assertEqual(got.project, "string-project")
827-
self.assertEqual(got.dataset_id, "string_dataset")
828-
829-
def test_from_string_w_prefix(self):
830-
cls = self._get_target_class()
831-
got = cls.from_string("google.com:string-project.string_dataset")
832-
self.assertEqual(got.project, "google.com:string-project")
833-
self.assertEqual(got.dataset_id, "string_dataset")
834-
835-
def test_from_string_legacy_string(self):
836-
cls = self._get_target_class()
837-
with self.assertRaises(ValueError):
838-
cls.from_string("string-project:string_dataset")
839-
840-
def test_from_string_w_incorrect_prefix(self):
841-
cls = self._get_target_class()
842-
with self.assertRaises(ValueError):
843-
cls.from_string("google.com.string-project.dataset_id")
844-
845-
def test_from_string_w_prefix_and_too_many_parts(self):
846-
cls = self._get_target_class()
847-
with self.assertRaises(ValueError):
848-
cls.from_string("google.com:string-project.dataset_id.table_id")
849-
850-
def test_from_string_not_fully_qualified(self):
851-
cls = self._get_target_class()
852-
with self.assertRaises(ValueError):
853-
cls.from_string("string_dataset")
854-
with self.assertRaises(ValueError):
855-
cls.from_string("a.b.c")
856-
857-
def test_from_string_with_default_project(self):
858-
cls = self._get_target_class()
859-
got = cls.from_string("string_dataset", default_project="default-project")
860-
self.assertEqual(got.project, "default-project")
861-
self.assertEqual(got.dataset_id, "string_dataset")
862-
863-
def test_from_string_ignores_default_project(self):
864-
cls = self._get_target_class()
865-
got = cls.from_string(
866-
"string-project.string_dataset", default_project="default-project"
867-
)
868-
self.assertEqual(got.project, "string-project")
869-
self.assertEqual(got.dataset_id, "string_dataset")
870-
871823
def test___eq___wrong_type(self):
872824
dataset = self._make_one("project_1", "dataset_1")
873825
other = object()

packages/google-cloud-bigquery/tests/unit/test_magics.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1337,12 +1337,13 @@ def test_context_with_no_query_cache_from_context(monkeypatch):
13371337
ip = IPython.get_ipython()
13381338
monkeypatch.setattr(bigquery, "bigquery_magics", None)
13391339
bigquery.load_ipython_extension(ip)
1340+
context = magics.Context()
13401341
conn = make_connection()
1341-
monkeypatch.setattr(magics.context, "_connection", conn)
1342-
monkeypatch.setattr(magics.context, "project", "project-from-context")
1343-
monkeypatch.setattr(
1344-
magics.context.default_query_job_config, "use_query_cache", False
1345-
)
1342+
context._connection = conn
1343+
context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
1344+
context.default_query_job_config = bigquery.QueryJobConfig(use_query_cache=False)
1345+
context.project = "project-from-context"
1346+
monkeypatch.setattr(magics, "context", context)
13461347

13471348
ip.run_cell_magic("bigquery", "", QUERY_STRING)
13481349

@@ -1415,12 +1416,17 @@ def test_bigquery_magic_with_progress_bar_type(monkeypatch):
14151416
ip = IPython.get_ipython()
14161417
monkeypatch.setattr(bigquery, "bigquery_magics", None)
14171418
bigquery.load_ipython_extension(ip)
1418-
magics.context.progress_bar_type = None
1419+
context = magics.Context()
1420+
conn = make_connection()
1421+
context._connection = conn
1422+
context.credentials = mock.create_autospec(google.auth.credentials.Credentials)
1423+
context.progress_bar_type = None
1424+
context.project = "unit-test-project"
1425+
monkeypatch.setattr(magics, "context", context)
14191426

14201427
run_query_patch = mock.patch(
14211428
"google.cloud.bigquery.magics.magics._run_query", autospec=True
14221429
)
1423-
magics.context.project = "unit-test-project"
14241430

14251431
with run_query_patch as run_query_mock:
14261432
ip.run_cell_magic(
@@ -2045,7 +2051,7 @@ def test_bigquery_magic_query_variable_not_identifier(monkeypatch):
20452051
# considered a table name, thus we expect an error that the table ID is not valid.
20462052
output = captured_io.stderr
20472053
assert "ERROR:" in output
2048-
assert "must be a fully-qualified ID" in output
2054+
assert "Could not parse table_id." in output
20492055

20502056

20512057
@pytest.mark.usefixtures("ipython_interactive")

0 commit comments

Comments
 (0)