Skip to content

Commit 91b6c24

Browse files
authored
feat: add df.bigquery pandas accessor (#2513)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
1 parent 5cf3788 commit 91b6c24

File tree

14 files changed

+430
-17
lines changed

14 files changed

+430
-17
lines changed

bigframes/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
)
3333
import bigframes.enums as enums # noqa: E402
3434
import bigframes.exceptions as exceptions # noqa: E402
35+
36+
# Register pandas extensions
37+
import bigframes.extensions.pandas.dataframe_accessor # noqa: F401, E402
3538
from bigframes.session import connect, Session # noqa: E402
3639
from bigframes.version import __version__ # noqa: E402
3740

bigframes/bigquery/_operations/sql.py

Lines changed: 57 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,31 @@
1616

1717
from __future__ import annotations
1818

19-
from typing import Sequence
19+
from typing import cast, Optional, Sequence, Union
2020

2121
import google.cloud.bigquery
2222

2323
from bigframes.core.compile.sqlglot import sql
24+
import bigframes.dataframe
2425
import bigframes.dtypes
2526
import bigframes.operations
2627
import bigframes.series
2728

2829

30+
def _format_names(sql_template: str, dataframe: bigframes.dataframe.DataFrame) -> str:
    """Rewrite a name-based SQL template into a position-based one.

    Placeholders that refer to a column of ``dataframe`` by name (``{a}``) are
    replaced by the placeholder for that column's position (``{0}``).
    Positional (``{0}``) and auto-numbered (``{}``) placeholders are kept
    pointing at the same position.

    The result is itself a format string that is formatted again later, so
    escaped braces (``{{`` and ``}}``) in the input are kept escaped in the
    output instead of being collapsed into live placeholders.

    Args:
        sql_template (str):
            A SQL format string whose placeholders reference the columns of
            ``dataframe`` by name or by position.
        dataframe:
            Object whose ``columns`` attribute lists the column labels in
            positional order.

    Returns:
        str: The template with every placeholder expressed as ``{<position>}``.

    Raises:
        KeyError: If a named placeholder does not match any string column label.
        IndexError: If a positional placeholder is out of range.
        ValueError: If ``sql_template`` is not a valid format string.
    """
    # Function-scope import keeps this block self-contained.
    import string

    labels = list(dataframe.columns)
    numbers = [f"{{{position}}}" for position in range(len(labels))]
    # Only string labels can be passed as keyword arguments to str.format();
    # other labels (e.g. the integers pandas assigns by default) remain
    # reachable through their position. For duplicate labels the last one wins.
    names_to_numbers = {
        label: numbers[position]
        for position, label in enumerate(labels)
        if isinstance(label, str)
    }

    pieces = []
    auto_position = 0
    for literal_text, field_name, format_spec, conversion in string.Formatter().parse(
        sql_template
    ):
        # parse() hands back literal text with "{{" / "}}" already collapsed;
        # escape it again so it survives the next round of formatting.
        pieces.append(literal_text.replace("{", "{{").replace("}", "}}"))
        if field_name is None:
            continue

        # Auto-numbered fields ("{}", "{.attr}", "{[0]}") get an explicit index
        # because each field is formatted on its own below.
        if field_name == "" or field_name[0] in ".[":
            field_name = f"{auto_position}{field_name}"
            auto_position += 1

        field = "{" + field_name
        if conversion:
            field += "!" + conversion
        if format_spec:
            field += ":" + format_spec
        field += "}"
        pieces.append(field.format(*numbers, **names_to_numbers))

    return "".join(pieces)
37+
38+
2939
def sql_scalar(
3040
sql_template: str,
31-
columns: Sequence[bigframes.series.Series],
41+
columns: Union[bigframes.dataframe.DataFrame, Sequence[bigframes.series.Series]],
42+
*,
43+
output_dtype: Optional[bigframes.dtypes.Dtype] = None,
3244
) -> bigframes.series.Series:
3345
"""Create a Series from a SQL template.
3446
@@ -37,6 +49,9 @@ def sql_scalar(
3749
>>> import bigframes.pandas as bpd
3850
>>> import bigframes.bigquery as bbq
3951
52+
Either pass in a sequence of series, in which case use integers in the
53+
format strings.
54+
4055
>>> s = bpd.Series(["1.5", "2.5", "3.5"])
4156
>>> s = s.astype(pd.ArrowDtype(pa.decimal128(38, 9)))
4257
>>> bbq.sql_scalar("ROUND({0}, 0, 'ROUND_HALF_EVEN')", [s])
@@ -45,13 +60,29 @@ def sql_scalar(
4560
2 4.000000000
4661
dtype: decimal128(38, 9)[pyarrow]
4762
63+
Or pass in a DataFrame, in which case use the column names in the format
64+
strings.
65+
66+
>>> df = bpd.DataFrame({"a": ["1.5", "2.5", "3.5"]})
67+
>>> df = df.astype({"a": pd.ArrowDtype(pa.decimal128(38, 9))})
68+
>>> bbq.sql_scalar("ROUND({a}, 0, 'ROUND_HALF_EVEN')", df)
69+
0 2.000000000
70+
1 2.000000000
71+
2 4.000000000
72+
dtype: decimal128(38, 9)[pyarrow]
73+
4874
Args:
4975
sql_template (str):
5076
A SQL format string with Python-style {0} placeholders for each of
5177
the Series objects in ``columns``, or {name} placeholders for each column when ``columns`` is a DataFrame.
52-
columns (Sequence[bigframes.pandas.Series]):
78+
columns (
79+
Sequence[bigframes.pandas.Series] | bigframes.pandas.DataFrame
80+
):
5381
Series objects representing the column inputs to the
5482
``sql_template``. Must contain at least one Series.
83+
output_dtype (a BigQuery DataFrames compatible dtype, optional):
84+
If provided, BigQuery DataFrames uses this as the output dtype
85+
of the returned Series. This avoids a dry run query.
5586
5687
Returns:
5788
bigframes.pandas.Series:
@@ -60,28 +91,38 @@ def sql_scalar(
6091
Raises:
6192
ValueError: If ``columns`` is empty.
6293
"""
94+
if isinstance(columns, bigframes.dataframe.DataFrame):
95+
sql_template = _format_names(sql_template, columns)
96+
columns = [
97+
cast(bigframes.series.Series, columns[column]) for column in columns.columns
98+
]
99+
63100
if len(columns) == 0:
64101
raise ValueError("Must provide at least one column in columns")
65102

103+
base_series = columns[0]
104+
66105
# To integrate this into our expression trees, we need to get the output
67106
# type, so we do some manual compilation and a dry run query to get that.
68107
# Another benefit of this is that if there is a syntax error in the SQL
69108
# template, then this will fail with an error earlier in the process,
70109
# aiding users in debugging.
71-
literals_sql = [sql.to_sql(sql.literal(None, column.dtype)) for column in columns]
72-
select_sql = sql_template.format(*literals_sql)
73-
dry_run_sql = f"SELECT {select_sql}"
74-
75-
# Use the executor directly, because we want the original column IDs, not
76-
# the user-friendly column names that block.to_sql_query() would produce.
77-
base_series = columns[0]
78-
bqclient = base_series._session.bqclient
79-
job = bqclient.query(
80-
dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)
81-
)
82-
_, output_type = bigframes.dtypes.convert_schema_field(job.schema[0])
110+
if output_dtype is None:
111+
literals_sql = [
112+
sql.to_sql(sql.literal(None, column.dtype)) for column in columns
113+
]
114+
select_sql = sql_template.format(*literals_sql)
115+
dry_run_sql = f"SELECT {select_sql}"
116+
117+
# Use the executor directly, because we want the original column IDs, not
118+
# the user-friendly column names that block.to_sql_query() would produce.
119+
bqclient = base_series._session.bqclient
120+
job = bqclient.query(
121+
dry_run_sql, job_config=google.cloud.bigquery.QueryJobConfig(dry_run=True)
122+
)
123+
_, output_dtype = bigframes.dtypes.convert_schema_field(job.schema[0])
83124

84125
op = bigframes.operations.SqlScalarOp(
85-
_output_type=output_type, sql_template=sql_template
126+
_output_type=output_dtype, sql_template=sql_template
86127
)
87128
return base_series._apply_nary_op(op, columns[1:])

bigframes/extensions/__init__.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright 2026 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import cast
16+
17+
import pandas
18+
import pandas.api.extensions
19+
20+
import bigframes.core.global_session as bf_session
21+
import bigframes.pandas as bpd
22+
23+
24+
@pandas.api.extensions.register_dataframe_accessor("bigquery")
class BigQueryDataFrameAccessor:
    """
    Accessor exposing BigQuery DataFrames functionality on pandas DataFrames.

    It is registered on pandas DataFrame objects under the ``bigquery``
    namespace, i.e. it is reached as ``df.bigquery``.
    """

    def __init__(self, pandas_obj: pandas.DataFrame):
        # The pandas DataFrame this accessor instance is attached to.
        self._obj = pandas_obj

    def sql_scalar(self, sql_template: str, *, output_dtype=None, session=None):
        """
        Apply a SQL scalar expression to the DataFrame and return a pandas Series.

        The DataFrame is loaded into BigFrames with ``read_pandas``, the SQL
        template is evaluated through ``bigframes.bigquery.sql_scalar``, and the
        resulting Series is brought back with ``to_pandas``.

        Args:
            sql_template (str):
                A SQL format string with Python-style placeholders for the
                columns of the DataFrame: either positional ({0}, {1}, etc., in
                the order the columns appear in ``df.columns``) or by column
                name, since the whole DataFrame is forwarded to
                ``bigframes.bigquery.sql_scalar``.
            output_dtype (a BigQuery DataFrames compatible dtype, optional):
                If provided, BigQuery DataFrames uses this as the output dtype
                of the returned Series. This avoids a dry run query.
            session (bigframes.session.Session, optional):
                The BigFrames session to use. If not provided, the default
                global session is used.

        Returns:
            pandas.Series:
                The result of the SQL scalar function as a pandas Series.
        """
        # Deferred to call time to avoid a circular import.
        import bigframes.bigquery

        active_session = (
            bf_session.get_global_session() if session is None else session
        )
        uploaded = cast(bpd.DataFrame, active_session.read_pandas(self._obj))
        scalar_series = bigframes.bigquery.sql_scalar(
            sql_template, uploaded, output_dtype=output_dtype
        )
        return scalar_series.to_pandas(ordered=True)

docs/conf.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,8 @@
280280
# See https://github.com/sphinx-doc/sphinx/blob
281281
# /2a65ffeef5c107c19084fabdd706cdff3f52d93c/sphinx/domains/python.py#L843
282282
"ref.python",
283+
# Allow external websites to be down occasionally.
284+
"intersphinx.external",
283285
]
284286

285287
# -- Options for LaTeX output ---------------------------------------------
@@ -388,7 +390,8 @@
388390
"grpc": ("https://grpc.github.io/grpc/python/", None),
389391
"proto-plus": ("https://proto-plus-python.readthedocs.io/en/latest/", None),
390392
"protobuf": ("https://googleapis.dev/python/protobuf/latest/", None),
391-
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
393+
# TODO(tswast): re-enable if we can get temporary failures to be ignored.
394+
# "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
392395
"pydata-google-auth": (
393396
"https://pydata-google-auth.readthedocs.io/en/latest/",
394397
None,

docs/reference/index.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ packages.
1919
bigframes.pandas.api.typing
2020
bigframes.streaming
2121

22+
Pandas Extensions
23+
~~~~~~~~~~~~~~~~~
24+
25+
BigQuery DataFrames provides extensions to pandas DataFrame objects.
26+
27+
.. autosummary::
28+
:toctree: api
29+
30+
bigframes.extensions.pandas.dataframe_accessor.BigQueryDataFrameAccessor
31+
2232
ML APIs
2333
~~~~~~~
2434

docs/user_guide/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ User Guide
1818
Getting Started <../notebooks/getting_started/getting_started_bq_dataframes.ipynb>
1919
Magics <../notebooks/getting_started/magics.ipynb>
2020
ML Fundamentals <../notebooks/getting_started/ml_fundamentals_bq_dataframes.ipynb>
21+
Pandas Extensions <../notebooks/getting_started/pandas_extensions.ipynb>
2122

2223
.. toctree::
2324
:caption: DataFrames

0 commit comments

Comments
 (0)