From 1eccb3a80e12802dc19e1f63c96e61801ec434bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 25 Sep 2025 21:04:07 +0000 Subject: [PATCH 01/16] fix: show progress even in job optional queries --- bigframes/core/events.py | 205 ++++++++++++++++++++++++++++++++++++ setup.py | 2 +- testing/constraints-3.9.txt | 2 +- 3 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 bigframes/core/events.py diff --git a/bigframes/core/events.py b/bigframes/core/events.py new file mode 100644 index 0000000000..437819de3a --- /dev/null +++ b/bigframes/core/events.py @@ -0,0 +1,205 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import dataclasses +import datetime +import threading +from typing import List, Optional +import weakref + +import google.cloud.bigquery._job_helpers +import google.cloud.bigquery.job.query +import google.cloud.bigquery.table + + +@dataclasses.dataclass(frozen=True) +class Subscriber: + callback_ref: weakref.ref + # TODO(tswast): Add block_id to allow filter in context managers. + + +class Publisher: + def __init__(self): + self._subscribers: List[Subscriber] = [] + self._subscribers_lock = threading.Lock() + + def subscribe(self, callback): + subscriber = Subscriber(callback_ref=weakref.ref(callback)) + + with self._subscribers_lock: + # TODO(tswast): Add block_id to allow filter in context managers. + self._subscribers.append(subscriber) + + def send(self, event: Event): + to_delete = [] + to_call = [] + + with self._subscribers_lock: + for sid, subscriber in enumerate(self._subscribers): + callback = subscriber.callback_ref() + + if callback is None: + to_delete.append(sid) + else: + # TODO(tswast): Add if statement for block_id to allow filter + # in context managers. 
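+                    # The weakref resolved, so the subscriber is still alive.
+                    # Collect the callback and invoke it only after the lock
+                    # is released: holding weakrefs lets garbage-collected
+                    # subscribers drop out automatically, while calling
+                    # outside the lock keeps a slow callback from stalling
+                    # other publishers.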
+ to_call.append(callback) + + for sid in reversed(to_delete): + del self._subscribers[sid] + + for callback in to_call: + callback(event) + + +publisher = Publisher() + + +class Event: + pass + + +class ExecutionStarted(Event): + pass + + +class ExecutionRunning(Event): + pass + + +class ExecutionStopped(Event): + pass + + +@dataclasses.dataclass(frozen=True) +class BigQuerySentEvent(ExecutionStarted): + """Query sent to BigQuery.""" + + query: str + billing_project: Optional[str] = None + location: Optional[str] = None + job_id: Optional[str] = None + request_id: Optional[str] = None + + @classmethod + def from_bqclient(cls, event: google.cloud.bigquery._job_helpers.QuerySentEvent): + return cls( + query=event.query, + billing_project=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryRetryEvent(ExecutionRunning): + """Query sent another time because the previous attempt failed.""" + + query: str + billing_project: Optional[str] = None + location: Optional[str] = None + job_id: Optional[str] = None + request_id: Optional[str] = None + + @classmethod + def from_bqclient(cls, event: google.cloud.bigquery._job_helpers.QueryRetryEvent): + return cls( + query=event.query, + billing_project=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryReceivedEvent(ExecutionRunning): + """Query received and acknowledged by the BigQuery API.""" + + billing_project: Optional[str] = None + location: Optional[str] = None + job_id: Optional[str] = None + statement_type: Optional[str] = None + state: Optional[str] = None + query_plan: Optional[list[google.cloud.bigquery.job.query.QueryPlanEntry]] = None + created: Optional[datetime.datetime] = None + started: Optional[datetime.datetime] = None + ended: Optional[datetime.datetime] = None + + @classmethod + def from_bqclient( + cls, event: google.cloud.bigquery._job_helpers.QueryReceivedEvent + ): + return cls( + billing_project=event.billing_project, + location=event.location, + job_id=event.job_id, + statement_type=event.statement_type, + state=event.state, + query_plan=event.query_plan, + created=event.created, + started=event.started, + ended=event.ended, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryFinishedEvent(ExecutionStopped): + """Query finished successfully.""" + + billing_project: Optional[str] = None + location: Optional[str] = None + query_id: Optional[str] = None + job_id: Optional[str] = None + destination: Optional[google.cloud.bigquery.table.TableReference] = None + total_rows: Optional[int] = None + total_bytes_processed: Optional[int] = None + slot_millis: Optional[int] = None + created: Optional[datetime.datetime] = None + started: Optional[datetime.datetime] = None + ended: Optional[datetime.datetime] = None + + @classmethod + def from_bqclient( + cls, event: google.cloud.bigquery._job_helpers.QueryFinishedEvent + ): + return cls( + billing_project=event.billing_project, + location=event.location, + query_id=event.query_id, + job_id=event.job_id, + destination=event.destination, + total_rows=event.total_rows, + total_bytes_processed=event.total_bytes_processed, + slot_millis=event.slot_millis, + created=event.created, + started=event.started, + ended=event.ended, + ) + + +@dataclasses.dataclass(frozen=True) +class BigQueryUnknownEvent(ExecutionRunning): + """Got unknown event from the BigQuery client 
library.""" + + # TODO: should we just skip sending unknown events? + + event: object + + @classmethod + def from_bqclient(cls, event): + return cls(event) diff --git a/setup.py b/setup.py index 2aef514749..a2f4399fdf 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "gcsfs >=2023.3.0, !=2025.5.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", - "google-cloud-bigquery[bqstorage,pandas] >=3.31.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.38.0", # 2.30 needed for arrow support. "google-cloud-bigquery-storage >= 2.30.0, < 3.0.0", "google-cloud-functions >=1.12.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 8df3a3a2c3..4348a6abee 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -6,7 +6,7 @@ geopandas==0.12.2 google-auth==2.15.0 google-cloud-bigtable==2.24.0 google-cloud-pubsub==2.21.4 -google-cloud-bigquery==3.31.0 +google-cloud-bigquery==3.38.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 From d03e5d13241e4c9f42dbbed5d5b19e523b3e6a17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 25 Sep 2025 21:59:12 +0000 Subject: [PATCH 02/16] first attempt at publisher --- bigframes/core/events.py | 7 +- bigframes/formatting_helpers.py | 106 ++-- bigframes/session/_io/bigquery/__init__.py | 67 +- .../getting_started_bq_dataframes.ipynb | 588 +++++------------- setup.py | 2 +- testing/constraints-3.9.txt | 2 +- 6 files changed, 245 insertions(+), 527 deletions(-) diff --git a/bigframes/core/events.py b/bigframes/core/events.py index 437819de3a..d4c176704a 100644 --- a/bigframes/core/events.py +++ b/bigframes/core/events.py @@ -24,6 +24,8 @@ import google.cloud.bigquery.job.query import google.cloud.bigquery.table +import bigframes.formatting_helpers + @dataclasses.dataclass(frozen=True) class Subscriber: @@ -66,6 +68,7 @@ def send(self, event: Event): publisher = Publisher() +publisher.subscribe(bigframes.formatting_helpers.progress_callback) class Event: @@ -85,7 +88,7 @@ class ExecutionStopped(Event): @dataclasses.dataclass(frozen=True) -class BigQuerySentEvent(ExecutionStarted): +class BigQuerySentEvent(ExecutionRunning): """Query sent to BigQuery.""" query: str @@ -158,7 +161,7 @@ def from_bqclient( @dataclasses.dataclass(frozen=True) -class BigQueryFinishedEvent(ExecutionStopped): +class BigQueryFinishedEvent(ExecutionRunning): """Query finished successfully.""" billing_project: Optional[str] = None diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 48afb4fdbd..2ec8077b26 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -13,11 +13,12 @@ # limitations under the License. 
"""Shared helper functions for formatting jobs related info.""" -# TODO(orrbradford): cleanup up typings and documenttion in this file + +from __future__ import annotations import datetime import random -from typing import Any, Optional, Type, Union +from typing import Any, Optional, Type, TYPE_CHECKING, Union import bigframes_vendored.constants as constants import google.api_core.exceptions as api_core_exceptions @@ -27,6 +28,9 @@ import IPython.display as display import ipywidgets as widgets +if TYPE_CHECKING: + import bigframes.core.events + GenericJob = Union[ bigquery.LoadJob, bigquery.ExtractJob, bigquery.QueryJob, bigquery.CopyJob ] @@ -119,71 +123,51 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): return res -def wait_for_query_job( - query_job: bigquery.QueryJob, - max_results: Optional[int] = None, - page_size: Optional[int] = None, - progress_bar: Optional[str] = None, -) -> bigquery.table.RowIterator: - """Return query results. Displays a progress bar while the query is running - Args: - query_job (bigquery.QueryJob, Optional): - The job representing the execution of the query on the server. - max_results (int, Optional): - The maximum number of rows the row iterator should return. - page_size (int, Optional): - The number of results to return on each results page. - progress_bar (str, Optional): - Which progress bar to show. - Returns: - A row iterator over the query results. - """ +current_display: Optional[display.HTML] = None +current_display_id: Optional[str] = None + + +def progress_callback( + event: bigframes.core.events.Event, +): + """Displays a progress bar while the query is running""" + global current_display, current_display_id + + import bigframes._config + import bigframes.core.events + + progress_bar = bigframes._config.options.display.progress_bar + if progress_bar == "auto": progress_bar = "notebook" if in_ipython() else "terminal" - try: - if progress_bar == "notebook": - display_id = str(random.random()) - loading_bar = display.HTML(get_query_job_loading_html(query_job)) - display.display(loading_bar, display_id=display_id) - query_result = query_job.result( - max_results=max_results, page_size=page_size - ) - query_job.reload() + if progress_bar == "notebook": + if ( + isinstance(event, bigframes.core.events.ExecutionStarted) + or current_display is None + or current_display_id is None + ): + current_display_id = str(random.random()) + current_display = display.HTML("Starting execution.") + display.display(current_display) + + if isinstance(event, bigframes.core.events.ExecutionRunning): display.update_display( - display.HTML(get_query_job_loading_html(query_job)), - display_id=display_id, + display.HTML("Execution happening."), + display_id=current_display_id, ) - elif progress_bar == "terminal": - initial_loading_bar = get_query_job_loading_string(query_job) - print(initial_loading_bar) - query_result = query_job.result( - max_results=max_results, page_size=page_size - ) - query_job.reload() - if initial_loading_bar != get_query_job_loading_string(query_job): - print(get_query_job_loading_string(query_job)) - else: - # No progress bar. 
- query_result = query_job.result( - max_results=max_results, page_size=page_size + elif isinstance(event, bigframes.core.events.ExecutionStopped): + display.update_display( + display.HTML("Execution done."), + display_id=current_display_id, ) - query_job.reload() - return query_result - except api_core_exceptions.RetryError as exc: - add_feedback_link(exc) - raise - except api_core_exceptions.GoogleAPICallError as exc: - add_feedback_link(exc) - raise - except KeyboardInterrupt: - query_job.cancel() - print( - f"Requested cancellation for {query_job.job_type.capitalize()}" - f" job {query_job.job_id} in location {query_job.location}..." - ) - # begin the cancel request before immediately rethrowing - raise + elif progress_bar == "terminal": + if isinstance(event, bigframes.core.events.ExecutionStarted): + print("Starting execution.") + elif isinstance(event, bigframes.core.events.ExecutionRunning): + print("Execution happening.") + elif isinstance(event, bigframes.core.events.ExecutionStopped): + print("Execution done.") def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 83f63e8b9a..58d0da696a 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -29,11 +29,13 @@ import google.api_core.exceptions import google.api_core.retry import google.cloud.bigquery as bigquery +import google.cloud.bigquery._job_helpers +import google.cloud.bigquery.table from bigframes.core import log_adapter import bigframes.core.compile.googlesql as googlesql +import bigframes.core.events import bigframes.core.sql -import bigframes.formatting_helpers as formatting_helpers import bigframes.session.metrics CHECK_DRIVE_PERMISSIONS = "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." @@ -238,6 +240,15 @@ def add_and_trim_labels(job_config): ) +def publish_bq_event(event): + if isinstance(event, google.cloud.bigquery._job_helpers.QuerySentEvent): + bf_event = bigframes.core.events.BigQuerySentEvent.from_bqclient(event) + else: + bf_event = bigframes.core.events.BigQueryUnknownEvent(event) + + bigframes.core.events.publisher.send(bf_event) + + @overload def start_query_with_client( bq_client: bigquery.Client, @@ -249,7 +260,7 @@ def start_query_with_client( timeout: Optional[float], metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[True], -) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: +) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: ... @@ -264,7 +275,7 @@ def start_query_with_client( timeout: Optional[float], metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[False], -) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: +) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: ... @@ -280,7 +291,7 @@ def start_query_with_client( metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[True], job_retry: google.api_core.retry.Retry, -) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: +) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: ... 
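# Sketch of how the query_with_job Literal overloads resolve for callers
# (illustrative only; arguments abbreviated):
#
#     rows, job = start_query_with_client(..., query_with_job=True)
#     # job is always a bigquery.QueryJob
#
#     rows, maybe_job = start_query_with_client(..., query_with_job=False)
#     # maybe_job is Optional[bigquery.QueryJob]; the query may run "jobless"
#     # through the query-and-wait API.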
@@ -296,7 +307,7 @@ def start_query_with_client( metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[False], job_retry: google.api_core.retry.Retry, -) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: +) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: ... @@ -315,23 +326,25 @@ def start_query_with_client( # https://github.com/googleapis/python-bigquery/pull/2256 merged, likely # version 3.36.0 or later. job_retry: google.api_core.retry.Retry = third_party_gcb_retry.DEFAULT_JOB_RETRY, -) -> Tuple[bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: +) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts query job and waits for results. """ + # Note: Ensure no additional labels are added to job_config after this + # point, as `add_and_trim_labels` ensures the label count does not + # exceed MAX_LABELS_COUNT. + add_and_trim_labels(job_config) + try: - # Note: Ensure no additional labels are added to job_config after this - # point, as `add_and_trim_labels` ensures the label count does not - # exceed MAX_LABELS_COUNT. - add_and_trim_labels(job_config) if not query_with_job: - results_iterator = bq_client.query_and_wait( + results_iterator = bq_client._query_and_wait_bigframes( sql, job_config=job_config, location=location, project=project, api_timeout=timeout, job_retry=job_retry, + callback=publish_bq_event, ) if metrics is not None: metrics.count_job_stats(row_iterator=results_iterator) @@ -350,14 +363,32 @@ def start_query_with_client( ex.message += CHECK_DRIVE_PERMISSIONS raise - opts = bigframes.options.display - if opts.progress_bar is not None and not query_job.configuration.dry_run: - results_iterator = formatting_helpers.wait_for_query_job( - query_job, - progress_bar=opts.progress_bar, + if not query_job.configuration.dry_run: + bigframes.core.events.publisher.send( + bigframes.core.events.BigQuerySentEvent( + sql, + billing_project=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + request_id=None, + ) + ) + results_iterator = query_job.result() + if not query_job.configuration.dry_run: + bigframes.core.events.publisher.send( + bigframes.core.events.BigQueryFinishedEvent( + billing_project=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + destination=query_job.destination, + total_rows=results_iterator.total_rows, + total_bytes_processed=query_job.total_bytes_processed, + slot_millis=query_job.slot_millis, + created=query_job.created, + started=query_job.started, + ended=query_job.ended, + ) ) - else: - results_iterator = query_job.result() if metrics is not None: metrics.count_job_stats(query_job=query_job) diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index 384f3b9c10..e721826a6f 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "ur8xi4C7S06n" }, @@ -143,7 +143,7 @@ }, "outputs": [], "source": [ - "!pip install bigframes" + "# %pip install bigframes" ] }, { @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "f200f10a1da3" }, @@ -230,20 +230,9 @@ "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "Updated property [core/project].\n" - ] - } - ], + "outputs": [], "source": [ - "PROJECT_ID = \"\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "! gcloud config set project {PROJECT_ID}" + "PROJECT_ID = \"\" # @param {type:\"string\"}" ] }, { @@ -259,7 +248,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": { "id": "eF-Twtc4XGem" }, @@ -303,7 +292,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "id": "254614fa0c46" }, @@ -325,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "id": "603adbbf0532" }, @@ -346,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -367,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": { "id": "NPPMuw2PXGeo" }, @@ -375,8 +364,15 @@ "source": [ "# Note: The project option is not required in all environments.\n", "# On BigQuery Studio, the project ID is automatically detected.\n", - "bpd.options.bigquery.project = PROJECT_ID\n", - "\n", + "bpd.options.bigquery.project = PROJECT_ID" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ "# Note: The location option is not required.\n", "# It defaults to the location of the first table or query\n", "# passed to read_gbq(). For APIs where a location can't be\n", @@ -432,20 +428,7 @@ "metadata": { "id": "Vyex9BQI-BNa" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job badadf0b-27c8-4dac-a468-be3c40745538 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# bq_df_sample = bpd.read_gbq(\"bigquery-samples.wikipedia_pageviews.200809h\")" ] @@ -476,121 +459,7 @@ "metadata": { "id": "XfGq5apK-D_e" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job c8669c7f-bca3-4f54-b354-8e57b3321f5a is DONE. 34.9 GB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
titleviews
21911Google1414560
27669Google_Chrome962482
28394Google_Earth383566
29184Google_Maps205089
27251Google_Android99450
33900Google_search97665
31825Google_chrome78399
30204Google_Street_View71580
40798Image:Google_Chrome.png60746
35222Googleplex53848
\n", - "

10 rows × 2 columns

\n", - "
[10 rows x 2 columns in total]" - ], - "text/plain": [ - " title views\n", - "21911 Google 1414560\n", - "27669 Google_Chrome 962482\n", - "28394 Google_Earth 383566\n", - "29184 Google_Maps 205089\n", - "27251 Google_Android 99450\n", - "33900 Google_search 97665\n", - "31825 Google_chrome 78399\n", - "30204 Google_Street_View 71580\n", - "40798 Image:Google_Chrome.png 60746\n", - "35222 Googleplex 53848\n", - "\n", - "[10 rows x 2 columns]" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# bq_df_sample[bq_df_sample.title.str.contains(r\"[Gg]oogle\")]\\\n", "# .groupby(['title'], as_index=False)['views'].sum(numeric_only=True)\\\n", @@ -660,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": { "id": "SvyXzkRl783u" }, @@ -686,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "id": "3QHQYlnoBLpt" }, @@ -712,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": { "id": "EDAaIwHpQCDZ" }, @@ -720,7 +589,7 @@ { "data": { "text/html": [ - "Load job 93903930-10b8-48b8-b41b-3da54917b281 is DONE. Open Job" + "Load job c67da84e-3cba-4a77-9a05-12ff74e65e3a is DONE. Open Job" ], "text/plain": [ "" @@ -747,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": { "id": "_gPD0Zn1Stdb" }, @@ -755,7 +624,7 @@ { "data": { "text/html": [ - "Query job 17f58b5c-88b2-4b26-8d0d-cc3d9a979a06 is DONE. 28.9 kB processed. Open Job" + "Starting execution." ], "text/plain": [ "" @@ -796,53 +665,53 @@ " \n", " \n", " \n", - " 78\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 47.0\n", - " 17.3\n", - " 185\n", - " 3700\n", - " FEMALE\n", + " 9\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 54.3\n", + " 15.7\n", + " 231\n", + " 5650\n", + " MALE\n", " \n", " \n", - " 130\n", + " 329\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Biscoe\n", - " 40.5\n", + " Dream\n", + " 39.7\n", " 17.9\n", - " 187\n", - " 3200\n", - " FEMALE\n", - " \n", - " \n", - " 84\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 49.1\n", - " 14.5\n", - " 212\n", - " 4625\n", - " FEMALE\n", + " 193\n", + " 4250\n", + " MALE\n", " \n", " \n", - " 334\n", + " 32\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Biscoe\n", - " 38.2\n", - " 20.0\n", - " 190\n", + " Dream\n", + " 37.2\n", + " 18.1\n", + " 178\n", " 3900\n", " MALE\n", " \n", " \n", - " 67\n", + " 121\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 55.8\n", - " 19.8\n", - " 207\n", - " 4000\n", + " 50.5\n", + " 19.6\n", + " 201\n", + " 4050\n", + " MALE\n", + " \n", + " \n", + " 122\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 41.1\n", + " 18.1\n", + " 205\n", + " 4300\n", " MALE\n", " \n", " \n", @@ -851,21 +720,21 @@ ], "text/plain": [ " species island culmen_length_mm \\\n", - "78 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "130 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", - "84 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", - "334 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.2 \n", - "67 Chinstrap penguin (Pygoscelis antarctica) Dream 55.8 \n", + "9 Gentoo penguin (Pygoscelis papua) Biscoe 54.3 \n", + "329 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", + "32 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", + "121 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", + "122 Adelie 
Penguin (Pygoscelis adeliae) Dream 41.1 \n", "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "78 17.3 185 3700 FEMALE \n", - "130 17.9 187 3200 FEMALE \n", - "84 14.5 212 4625 FEMALE \n", - "334 20.0 190 3900 MALE \n", - "67 19.8 207 4000 MALE " + " culmen_depth_mm flipper_length_mm body_mass_g sex \n", + "9 15.7 231 5650 MALE \n", + "329 17.9 193 4250 MALE \n", + "32 18.1 178 3900 MALE \n", + "121 19.6 201 4050 MALE \n", + "122 18.1 205 4300 MALE " ] }, - "execution_count": 15, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -896,7 +765,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "metadata": { "id": "ZSP7gt13QrQt" }, @@ -931,30 +800,18 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": { "id": "oP1NIAmUBjop" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 55aa9cc4-29b6-4052-aae4-5499dc5f1168 is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ - "'bigframes-dev.birds.penguins'" + "'swast-scratch.birds.penguins'" ] }, - "execution_count": 17, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -987,23 +844,11 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": { "id": "IBuo-d6dWfsA" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job 7b2ff811-1563-4ac4-9d21-69f87e8e85bc is DONE. 28.9 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1036,53 +881,53 @@ " \n", " \n", " \n", - " 12\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 42.7\n", - " 13.7\n", - " 208\n", - " 3950\n", - " FEMALE\n", + " 15\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 49.6\n", + " 18.2\n", + " 193\n", + " 3775\n", + " MALE\n", " \n", " \n", - " 24\n", + " 138\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 45.0\n", - " 15.4\n", - " 220\n", - " 5050\n", + " 51.3\n", + " 14.2\n", + " 218\n", + " 5300\n", " MALE\n", " \n", " \n", - " 62\n", + " 176\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 38.8\n", - " 20.0\n", - " 190\n", - " 3950\n", + " 39.0\n", + " 18.7\n", + " 185\n", + " 3650\n", " MALE\n", " \n", " \n", - " 123\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 42.5\n", - " 17.3\n", - " 187\n", - " 3350\n", + " 197\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Torgersen\n", + " 38.8\n", + " 17.6\n", + " 191\n", + " 3275\n", " FEMALE\n", " \n", " \n", - " 27\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 44.1\n", - " 19.7\n", - " 196\n", - " 4400\n", + " 249\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 52.1\n", + " 17.0\n", + " 230\n", + " 5550\n", " MALE\n", " \n", " \n", @@ -1090,22 +935,22 @@ "" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "12 Gentoo penguin (Pygoscelis papua) Biscoe 42.7 \n", - "24 Gentoo penguin (Pygoscelis papua) Biscoe 45.0 \n", - "62 Adelie Penguin (Pygoscelis adeliae) Dream 38.8 \n", - "123 Chinstrap penguin (Pygoscelis antarctica) Dream 42.5 \n", - "27 Adelie Penguin (Pygoscelis adeliae) Dream 44.1 \n", + " species island culmen_length_mm \\\n", + "15 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", + "138 Gentoo penguin (Pygoscelis papua) Biscoe 51.3 \n", + "176 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", + 
"197 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.8 \n", + "249 Gentoo penguin (Pygoscelis papua) Biscoe 52.1 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "12 13.7 208 3950 FEMALE \n", - "24 15.4 220 5050 MALE \n", - "62 20.0 190 3950 MALE \n", - "123 17.3 187 3350 FEMALE \n", - "27 19.7 196 4400 MALE " + "15 18.2 193 3775 MALE \n", + "138 14.2 218 5300 MALE \n", + "176 18.7 185 3650 MALE \n", + "197 17.6 191 3275 FEMALE \n", + "249 17.0 230 5550 MALE " ] }, - "execution_count": 18, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -1147,40 +992,28 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": { "id": "6i6HkFJZa8na" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job b396baed-6242-4478-9092-f5e86811b045 is DONE. 31.7 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/plain": [ "133 \n", "279 3150\n", "34 3400\n", - "96 3600\n", - "18 3800\n", "208 3950\n", - "310 3175\n", + "18 3800\n", + "96 3600\n", "64 2850\n", + "310 3175\n", "118 3550\n", "2 3075\n", "Name: body_mass_g, dtype: Int64" ] }, - "execution_count": 19, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1200,7 +1033,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": { "id": "YKwCW7Nsavap" }, @@ -1209,7 +1042,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "average_body_mass: 4201.754385964913\n" + "average_body_mass: 4201.754385964911\n" ] } ], @@ -1229,23 +1062,11 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 14, "metadata": { "id": "4PyKMR61-Mjy" }, "outputs": [ - { - "data": { - "text/html": [ - "Query job fef05ee2-9690-41a4-bd35-7cded77310f2 is DONE. 15.6 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -1302,7 +1123,7 @@ "[3 rows x 1 columns]" ] }, - "execution_count": 21, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1333,7 +1154,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1362,24 +1183,11 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "id": "rSWTOG-vb2Fc" }, - "outputs": [ - { - "data": { - "text/html": [ - "Query job c7b6c009-d2c4-4739-a6f8-5ef51e6b1851 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "@bpd.remote_function(cloud_function_service_account=\"default\")\n", "def get_bucket(num: float) -> str:\n", @@ -1401,20 +1209,11 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "id": "6ejPXoyEQpWE" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cloud Function Name projects/bigframes-dev/locations/us-central1/functions/bigframes-sessiondf1983-1d02aa9bc80939ba72e7ff69e37e27c8\n", - "Remote Function Name bigframes-dev._f36a8f778c434a1ec421979eaa3bf562a8561e38.bigframes_sessiondf1983_1d02aa9bc80939ba72e7ff69e37e27c8\n" - ] - } - ], + "outputs": [], "source": [ "CLOUD_FUNCTION_NAME = format(get_bucket.bigframes_cloud_function)\n", "print(\"Cloud Function Name \" + CLOUD_FUNCTION_NAME)\n", @@ -1433,110 +1232,11 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "id": "NxSd9WZFcIji" }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
body_mass_gbody_mass_bucket
133<NA>NA
2793150below_3500
343400below_3500
963600at_or_above_3500
183800at_or_above_3500
2083950at_or_above_3500
3103175below_3500
642850below_3500
1183550at_or_above_3500
23075below_3500
\n", - "
" - ], - "text/plain": [ - " body_mass_g body_mass_bucket\n", - "133 NA\n", - "279 3150 below_3500\n", - "34 3400 below_3500\n", - "96 3600 at_or_above_3500\n", - "18 3800 at_or_above_3500\n", - "208 3950 at_or_above_3500\n", - "310 3175 below_3500\n", - "64 2850 below_3500\n", - "118 3550 at_or_above_3500\n", - "2 3075 below_3500" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "bq_df = bq_df.assign(body_mass_bucket=bq_df['body_mass_g'].apply(get_bucket))\n", "bq_df[['body_mass_g', 'body_mass_bucket']].peek(10)" @@ -1571,7 +1271,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1581,7 +1281,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "id": "sx_vKniMq9ZX" }, @@ -1598,7 +1298,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": { "id": "_dTCXvCxtPw9" }, @@ -1614,7 +1314,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "id": "EDAIIfcpwNOF" }, @@ -1626,7 +1326,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "id": "QwumLUKmVpuH" }, @@ -1658,7 +1358,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.15" + "version": "3.10.16" } }, "nbformat": 4, diff --git a/setup.py b/setup.py index a2f4399fdf..abc760b691 100644 --- a/setup.py +++ b/setup.py @@ -39,7 +39,7 @@ "gcsfs >=2023.3.0, !=2025.5.0", "geopandas >=0.12.2", "google-auth >=2.15.0,<3.0", - "google-cloud-bigquery[bqstorage,pandas] >=3.38.0", + "google-cloud-bigquery[bqstorage,pandas] >=3.36.0", # 2.30 needed for arrow support. 
"google-cloud-bigquery-storage >= 2.30.0, < 3.0.0", "google-cloud-functions >=1.12.0", diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index 4348a6abee..eceec07dc4 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -6,7 +6,7 @@ geopandas==0.12.2 google-auth==2.15.0 google-cloud-bigtable==2.24.0 google-cloud-pubsub==2.21.4 -google-cloud-bigquery==3.38.0 +google-cloud-bigquery==3.36.0 google-cloud-functions==1.12.0 google-cloud-bigquery-connection==1.12.0 google-cloud-iam==2.12.1 From 70d83242edd252dd3626313a13e63fa06841a711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 16:08:58 +0000 Subject: [PATCH 03/16] report execution started/stopped in read_gbq_query --- bigframes/core/events.py | 6 +- bigframes/formatting_helpers.py | 12 +- bigframes/session/__init__.py | 3 - .../session/_io/bigquery/read_gbq_table.py | 40 +- bigframes/session/bq_caching_executor.py | 21 +- bigframes/session/loader.py | 61 +- .../getting_started_bq_dataframes.ipynb | 588 +++++++++++++----- 7 files changed, 558 insertions(+), 173 deletions(-) diff --git a/bigframes/core/events.py b/bigframes/core/events.py index d4c176704a..5a44fe3255 100644 --- a/bigframes/core/events.py +++ b/bigframes/core/events.py @@ -25,6 +25,7 @@ import google.cloud.bigquery.table import bigframes.formatting_helpers +import bigframes.session.executor @dataclasses.dataclass(frozen=True) @@ -83,8 +84,9 @@ class ExecutionRunning(Event): pass -class ExecutionStopped(Event): - pass +@dataclasses.dataclass(frozen=True) +class ExecutionFinished(Event): + result: Optional[bigframes.session.executor.ExecuteResult] = None @dataclasses.dataclass(frozen=True) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 2ec8077b26..fb37278987 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -125,6 +125,7 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): current_display: Optional[display.HTML] = None current_display_id: Optional[str] = None +previous_message: str = "" def progress_callback( @@ -149,16 +150,19 @@ def progress_callback( ): current_display_id = str(random.random()) current_display = display.HTML("Starting execution.") - display.display(current_display) + display.display( + current_display, + display_id=current_display_id, + ) if isinstance(event, bigframes.core.events.ExecutionRunning): display.update_display( display.HTML("Execution happening."), display_id=current_display_id, ) - elif isinstance(event, bigframes.core.events.ExecutionStopped): + elif isinstance(event, bigframes.core.events.ExecutionFinished): display.update_display( - display.HTML("Execution done."), + display.HTML(f"{previous_message} Execution done."), display_id=current_display_id, ) elif progress_bar == "terminal": @@ -166,7 +170,7 @@ def progress_callback( print("Starting execution.") elif isinstance(event, bigframes.core.events.ExecutionRunning): print("Execution happening.") - elif isinstance(event, bigframes.core.events.ExecutionStopped): + elif isinstance(event, bigframes.core.events.ExecutionFinished): print("Execution done.") diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index f0cec864b4..eed5f8496b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -75,10 +75,7 @@ import bigframes.functions.function as bff from bigframes.session import bigquery_session, bq_caching_executor, executor import bigframes.session._io.bigquery as bf_io_bigquery -import 
bigframes.session.anonymous_dataset import bigframes.session.clients -import bigframes.session.loader -import bigframes.session.metrics import bigframes.session.validation # Avoid circular imports. diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 00531ce25d..434196e921 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -26,8 +26,8 @@ import bigframes_vendored.constants as constants import google.api_core.exceptions import google.cloud.bigquery as bigquery +import google.cloud.bigquery.table -import bigframes.core.sql import bigframes.exceptions as bfe import bigframes.session._io.bigquery @@ -101,7 +101,7 @@ def get_table_metadata( def is_time_travel_eligible( bqclient: bigquery.Client, - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, columns: Optional[Sequence[str]], snapshot_time: datetime.datetime, filter_str: Optional[str] = None, @@ -210,10 +210,8 @@ def is_time_travel_eligible( def infer_unique_columns( - bqclient: bigquery.Client, - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, index_cols: List[str], - metadata_only: bool = False, ) -> Tuple[str, ...]: """Return a set of columns that can provide a unique row key or empty if none can be inferred. @@ -227,14 +225,34 @@ def infer_unique_columns( # Essentially, just reordering the primary key to match the index col order return tuple(index_col for index_col in index_cols if index_col in primary_keys) - if primary_keys or metadata_only or (not index_cols): - # Sometimes not worth scanning data to check uniqueness + if primary_keys: return primary_keys + + return () + + +def check_if_index_columns_are_unique( + bqclient: bigquery.Client, + table: google.cloud.bigquery.table.Table, + index_cols: List[str], +) -> Tuple[str, ...]: + import bigframes.core.sql + import bigframes.session._io.bigquery + # TODO(b/337925142): Avoid a "SELECT *" subquery here by ensuring # table_expression only selects just index_cols. 
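    # The generated statement compares COUNT(*) over the table with the count
    # of distinct index-column combinations (a sketch; the exact SQL comes
    # from bigframes.core.sql.is_distinct_sql). Equal `total_count` and
    # `distinct_count` values mean the index columns form a unique row key.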
is_unique_sql = bigframes.core.sql.is_distinct_sql(index_cols, table.reference) job_config = bigquery.QueryJobConfig() - results = bqclient.query_and_wait(is_unique_sql, job_config=job_config) + results, _ = bigframes.session._io.bigquery.start_query_with_client( + bq_client=bqclient, + sql=is_unique_sql, + job_config=job_config, + timeout=None, + location=None, + project=None, + metrics=None, + query_with_job=False, + ) row = next(iter(results)) if row["total_count"] == row["distinct_count"]: @@ -243,7 +261,7 @@ def infer_unique_columns( def _get_primary_keys( - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, ) -> List[str]: """Get primary keys from table if they are set.""" @@ -261,7 +279,7 @@ def _get_primary_keys( def _is_table_clustered_or_partitioned( - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, ) -> bool: """Returns True if the table is clustered or partitioned.""" @@ -284,7 +302,7 @@ def _is_table_clustered_or_partitioned( def get_index_cols( - table: bigquery.table.Table, + table: google.cloud.bigquery.table.Table, index_col: Iterable[str] | str | Iterable[int] diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index b7412346bd..1b8ad3eece 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -32,6 +32,7 @@ import bigframes.core from bigframes.core import compile, local_data, rewrite import bigframes.core.compile.sqlglot.sqlglot_ir as sqlglot_ir +import bigframes.core.events import bigframes.core.guid import bigframes.core.identifiers import bigframes.core.nodes as nodes @@ -187,6 +188,8 @@ def execute( array_value: bigframes.core.ArrayValue, execution_spec: ex_spec.ExecutionSpec, ) -> executor.ExecuteResult: + bigframes.core.events.publisher.send(bigframes.core.events.ExecutionStarted()) + # TODO: Support export jobs in combination with semi executors if execution_spec.destination_spec is None: plan = self.prepare_plan(array_value.node, target="simplify") @@ -195,6 +198,11 @@ def execute( plan, ordered=execution_spec.ordered, peek=execution_spec.peek ) if maybe_result: + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished( + result=maybe_result, + ) + ) return maybe_result if isinstance(execution_spec.destination_spec, ex_spec.TableOutputSpec): @@ -203,7 +211,13 @@ def execute( "Ordering and peeking not supported for gbq export" ) # separate path for export_gbq, as it has all sorts of annoying logic, such as possibly running as dml - return self._export_gbq(array_value, execution_spec.destination_spec) + result = self._export_gbq(array_value, execution_spec.destination_spec) + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished( + result=result, + ) + ) + return result result = self._execute_plan_gbq( array_value.node, @@ -218,6 +232,11 @@ def execute( if isinstance(execution_spec.destination_spec, ex_spec.GcsOutputSpec): self._export_result_gcs(result, execution_spec.destination_spec) + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished( + result=result, + ) + ) return result def _export_result_gcs( diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 94d8db6f36..d0bc5c908a 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -50,6 +50,7 @@ from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils import bigframes.core as core import bigframes.core.blocks as blocks +import 
bigframes.core.events import bigframes.core.schema as schemata import bigframes.dtypes import bigframes.formatting_helpers as formatting_helpers @@ -499,6 +500,7 @@ def read_gbq_table( # type: ignore[overload-overlap] force_total_order: Optional[bool] = ..., n_rows: Optional[int] = None, index_col_in_columns: bool = False, + publish_execution: bool = True, ) -> dataframe.DataFrame: ... @@ -522,6 +524,7 @@ def read_gbq_table( force_total_order: Optional[bool] = ..., n_rows: Optional[int] = None, index_col_in_columns: bool = False, + publish_execution: bool = True, ) -> pandas.Series: ... @@ -544,6 +547,7 @@ def read_gbq_table( force_total_order: Optional[bool] = None, n_rows: Optional[int] = None, index_col_in_columns: bool = False, + publish_execution: bool = True, ) -> dataframe.DataFrame | pandas.Series: """Read a BigQuery table into a BigQuery DataFrames DataFrame. @@ -603,8 +607,12 @@ def read_gbq_table( when the index is selected from the data columns (e.g., in a ``read_csv`` scenario). The column will be used as the DataFrame's index and removed from the list of value columns. + publish_execution (bool, optional): + If True, sends an execution started and stopped event if this + causes a query. Set to False if using read_gbq_table from + another function that is reporting execution. """ - import bigframes._tools.strings + import bigframes.core.events import bigframes.dataframe as dataframe # --------------------------------- @@ -768,12 +776,26 @@ def read_gbq_table( # TODO(b/338065601): Provide a way to assume uniqueness and avoid this # check. primary_key = bf_read_gbq_table.infer_unique_columns( - bqclient=self._bqclient, table=table, index_cols=index_cols, - # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique - metadata_only=not self._scan_index_uniqueness, ) + + # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique + if not primary_key and self._scan_index_uniqueness and index_cols: + if publish_execution: + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionStarted(), + ) + primary_key = bf_read_gbq_table.check_if_index_columns_are_unique( + self._bqclient, + table=table, + index_cols=index_cols, + ) + if publish_execution: + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished(), + ) + schema = schemata.ArraySchema.from_bq_table(table) if not include_all_columns: schema = schema.select(index_cols + columns) @@ -991,6 +1013,12 @@ def read_gbq_query( query_job, list(columns), index_cols ) + # We want to make sure we show progress when we actually do execute a + # query. Since we have got this far, we know it's not a dry run. + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionStarted(), + ) + query_job_for_metrics: Optional[bigquery.QueryJob] = None destination: Optional[bigquery.TableReference] = None @@ -1046,20 +1074,28 @@ def read_gbq_query( # makes sense to download the results beyond the first page, even if # there is a job and destination table available. 
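        # Each early-return branch below publishes ExecutionFinished before
        # returning, so the progress indicator opened by the ExecutionStarted
        # event above is closed even when no destination table or job exists.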
if query_job_for_metrics is None and rows is not None: - return bf_read_gbq_query.create_dataframe_from_row_iterator( + df = bf_read_gbq_query.create_dataframe_from_row_iterator( rows, session=self._session, index_col=index_col, columns=columns, ) + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished(), + ) + return df # We already checked rows, so if there's no destination table, then # there are no results to return. if destination is None: - return bf_read_gbq_query.create_dataframe_from_query_job_stats( + df = bf_read_gbq_query.create_dataframe_from_query_job_stats( query_job_for_metrics, session=self._session, ) + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished(), + ) + return df # If the query was DDL or DML, return some job metadata. See # https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobStatistics2.FIELDS.statement_type @@ -1070,10 +1106,14 @@ def read_gbq_query( query_job_for_metrics is not None and not bf_read_gbq_query.should_return_query_results(query_job_for_metrics) ): - return bf_read_gbq_query.create_dataframe_from_query_job_stats( + df = bf_read_gbq_query.create_dataframe_from_query_job_stats( query_job_for_metrics, session=self._session, ) + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished(), + ) + return df # Speed up counts by getting counts from result metadata. if rows is not None: @@ -1083,16 +1123,21 @@ def read_gbq_query( else: n_rows = None - return self.read_gbq_table( + df = self.read_gbq_table( f"{destination.project}.{destination.dataset_id}.{destination.table_id}", index_col=index_col, columns=columns, use_cache=configuration["query"]["useQueryCache"], force_total_order=force_total_order, n_rows=n_rows, + publish_execution=False, # max_results and filters are omitted because they are already # handled by to_query(), above. ) + bigframes.core.events.publisher.send( + bigframes.core.events.ExecutionFinished(), + ) + return df def _query_to_destination( self, diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb index e721826a6f..384f3b9c10 100644 --- a/notebooks/getting_started/getting_started_bq_dataframes.ipynb +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "ur8xi4C7S06n" }, @@ -143,7 +143,7 @@ }, "outputs": [], "source": [ - "# %pip install bigframes" + "!pip install bigframes" ] }, { @@ -159,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "f200f10a1da3" }, @@ -230,9 +230,20 @@ "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated property [core/project].\n" + ] + } + ], "source": [ - "PROJECT_ID = \"\" # @param {type:\"string\"}" + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! 
gcloud config set project {PROJECT_ID}" ] }, { @@ -248,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": { "id": "eF-Twtc4XGem" }, @@ -292,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "254614fa0c46" }, @@ -314,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "603adbbf0532" }, @@ -335,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 8, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -356,7 +367,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": { "id": "NPPMuw2PXGeo" }, @@ -364,15 +375,8 @@ "source": [ "# Note: The project option is not required in all environments.\n", "# On BigQuery Studio, the project ID is automatically detected.\n", - "bpd.options.bigquery.project = PROJECT_ID" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ + "bpd.options.bigquery.project = PROJECT_ID\n", + "\n", "# Note: The location option is not required.\n", "# It defaults to the location of the first table or query\n", "# passed to read_gbq(). For APIs where a location can't be\n", @@ -428,7 +432,20 @@ "metadata": { "id": "Vyex9BQI-BNa" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job badadf0b-27c8-4dac-a468-be3c40745538 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "# bq_df_sample = bpd.read_gbq(\"bigquery-samples.wikipedia_pageviews.200809h\")" ] @@ -459,7 +476,121 @@ "metadata": { "id": "XfGq5apK-D_e" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job c8669c7f-bca3-4f54-b354-8e57b3321f5a is DONE. 34.9 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titleviews
21911Google1414560
27669Google_Chrome962482
28394Google_Earth383566
29184Google_Maps205089
27251Google_Android99450
33900Google_search97665
31825Google_chrome78399
30204Google_Street_View71580
40798Image:Google_Chrome.png60746
35222Googleplex53848
\n", + "

10 rows × 2 columns

\n", + "
[10 rows x 2 columns in total]" + ], + "text/plain": [ + " title views\n", + "21911 Google 1414560\n", + "27669 Google_Chrome 962482\n", + "28394 Google_Earth 383566\n", + "29184 Google_Maps 205089\n", + "27251 Google_Android 99450\n", + "33900 Google_search 97665\n", + "31825 Google_chrome 78399\n", + "30204 Google_Street_View 71580\n", + "40798 Image:Google_Chrome.png 60746\n", + "35222 Googleplex 53848\n", + "\n", + "[10 rows x 2 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# bq_df_sample[bq_df_sample.title.str.contains(r\"[Gg]oogle\")]\\\n", "# .groupby(['title'], as_index=False)['views'].sum(numeric_only=True)\\\n", @@ -529,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": { "id": "SvyXzkRl783u" }, @@ -555,7 +686,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "3QHQYlnoBLpt" }, @@ -581,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 14, "metadata": { "id": "EDAaIwHpQCDZ" }, @@ -589,7 +720,7 @@ { "data": { "text/html": [ - "Load job c67da84e-3cba-4a77-9a05-12ff74e65e3a is DONE. Open Job" + "Load job 93903930-10b8-48b8-b41b-3da54917b281 is DONE. Open Job" ], "text/plain": [ "" @@ -616,7 +747,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 15, "metadata": { "id": "_gPD0Zn1Stdb" }, @@ -624,7 +755,7 @@ { "data": { "text/html": [ - "Starting execution." + "Query job 17f58b5c-88b2-4b26-8d0d-cc3d9a979a06 is DONE. 28.9 kB processed. Open Job" ], "text/plain": [ "" @@ -665,53 +796,53 @@ " \n", " \n", " \n", - " 9\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 54.3\n", - " 15.7\n", - " 231\n", - " 5650\n", - " MALE\n", + " 78\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 47.0\n", + " 17.3\n", + " 185\n", + " 3700\n", + " FEMALE\n", " \n", " \n", - " 329\n", + " 130\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 39.7\n", + " Biscoe\n", + " 40.5\n", " 17.9\n", - " 193\n", - " 4250\n", - " MALE\n", + " 187\n", + " 3200\n", + " FEMALE\n", + " \n", + " \n", + " 84\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 49.1\n", + " 14.5\n", + " 212\n", + " 4625\n", + " FEMALE\n", " \n", " \n", - " 32\n", + " 334\n", " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 37.2\n", - " 18.1\n", - " 178\n", + " Biscoe\n", + " 38.2\n", + " 20.0\n", + " 190\n", " 3900\n", " MALE\n", " \n", " \n", - " 121\n", + " 67\n", " Chinstrap penguin (Pygoscelis antarctica)\n", " Dream\n", - " 50.5\n", - " 19.6\n", - " 201\n", - " 4050\n", - " MALE\n", - " \n", - " \n", - " 122\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Dream\n", - " 41.1\n", - " 18.1\n", - " 205\n", - " 4300\n", + " 55.8\n", + " 19.8\n", + " 207\n", + " 4000\n", " MALE\n", " \n", " \n", @@ -720,21 +851,21 @@ ], "text/plain": [ " species island culmen_length_mm \\\n", - "9 Gentoo penguin (Pygoscelis papua) Biscoe 54.3 \n", - "329 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", - "32 Adelie Penguin (Pygoscelis adeliae) Dream 37.2 \n", - "121 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", - "122 Adelie Penguin (Pygoscelis adeliae) Dream 41.1 \n", + "78 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", + "130 Adelie Penguin (Pygoscelis adeliae) Biscoe 40.5 \n", + "84 Gentoo penguin (Pygoscelis papua) Biscoe 49.1 \n", + "334 Adelie Penguin (Pygoscelis adeliae) Biscoe 38.2 \n", + "67 Chinstrap penguin (Pygoscelis 
antarctica) Dream 55.8 \n", "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "9 15.7 231 5650 MALE \n", - "329 17.9 193 4250 MALE \n", - "32 18.1 178 3900 MALE \n", - "121 19.6 201 4050 MALE \n", - "122 18.1 205 4300 MALE " + " culmen_depth_mm flipper_length_mm body_mass_g sex \n", + "78 17.3 185 3700 FEMALE \n", + "130 17.9 187 3200 FEMALE \n", + "84 14.5 212 4625 FEMALE \n", + "334 20.0 190 3900 MALE \n", + "67 19.8 207 4000 MALE " ] }, - "execution_count": 8, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -765,7 +896,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 16, "metadata": { "id": "ZSP7gt13QrQt" }, @@ -800,18 +931,30 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 17, "metadata": { "id": "oP1NIAmUBjop" }, "outputs": [ + { + "data": { + "text/html": [ + "Query job 55aa9cc4-29b6-4052-aae4-5499dc5f1168 is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/plain": [ - "'swast-scratch.birds.penguins'" + "'bigframes-dev.birds.penguins'" ] }, - "execution_count": 10, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -844,11 +987,23 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 18, "metadata": { "id": "IBuo-d6dWfsA" }, "outputs": [ + { + "data": { + "text/html": [ + "Query job 7b2ff811-1563-4ac4-9d21-69f87e8e85bc is DONE. 28.9 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -881,53 +1036,53 @@ " \n", " \n", " \n", - " 15\n", - " Chinstrap penguin (Pygoscelis antarctica)\n", - " Dream\n", - " 49.6\n", - " 18.2\n", - " 193\n", - " 3775\n", - " MALE\n", + " 12\n", + " Gentoo penguin (Pygoscelis papua)\n", + " Biscoe\n", + " 42.7\n", + " 13.7\n", + " 208\n", + " 3950\n", + " FEMALE\n", " \n", " \n", - " 138\n", + " 24\n", " Gentoo penguin (Pygoscelis papua)\n", " Biscoe\n", - " 51.3\n", - " 14.2\n", - " 218\n", - " 5300\n", + " 45.0\n", + " 15.4\n", + " 220\n", + " 5050\n", " MALE\n", " \n", " \n", - " 176\n", + " 62\n", " Adelie Penguin (Pygoscelis adeliae)\n", " Dream\n", - " 39.0\n", - " 18.7\n", - " 185\n", - " 3650\n", + " 38.8\n", + " 20.0\n", + " 190\n", + " 3950\n", " MALE\n", " \n", " \n", - " 197\n", - " Adelie Penguin (Pygoscelis adeliae)\n", - " Torgersen\n", - " 38.8\n", - " 17.6\n", - " 191\n", - " 3275\n", + " 123\n", + " Chinstrap penguin (Pygoscelis antarctica)\n", + " Dream\n", + " 42.5\n", + " 17.3\n", + " 187\n", + " 3350\n", " FEMALE\n", " \n", " \n", - " 249\n", - " Gentoo penguin (Pygoscelis papua)\n", - " Biscoe\n", - " 52.1\n", - " 17.0\n", - " 230\n", - " 5550\n", + " 27\n", + " Adelie Penguin (Pygoscelis adeliae)\n", + " Dream\n", + " 44.1\n", + " 19.7\n", + " 196\n", + " 4400\n", " MALE\n", " \n", " \n", @@ -935,22 +1090,22 @@ "" ], "text/plain": [ - " species island culmen_length_mm \\\n", - "15 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", - "138 Gentoo penguin (Pygoscelis papua) Biscoe 51.3 \n", - "176 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", - "197 Adelie Penguin (Pygoscelis adeliae) Torgersen 38.8 \n", - "249 Gentoo penguin (Pygoscelis papua) Biscoe 52.1 \n", + " species island culmen_length_mm \\\n", + "12 Gentoo penguin (Pygoscelis papua) Biscoe 42.7 \n", + "24 Gentoo penguin (Pygoscelis papua) Biscoe 45.0 \n", + "62 Adelie Penguin (Pygoscelis adeliae) Dream 38.8 \n", + "123 Chinstrap 
penguin (Pygoscelis antarctica) Dream 42.5 \n", + "27 Adelie Penguin (Pygoscelis adeliae) Dream 44.1 \n", "\n", " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "15 18.2 193 3775 MALE \n", - "138 14.2 218 5300 MALE \n", - "176 18.7 185 3650 MALE \n", - "197 17.6 191 3275 FEMALE \n", - "249 17.0 230 5550 MALE " + "12 13.7 208 3950 FEMALE \n", + "24 15.4 220 5050 MALE \n", + "62 20.0 190 3950 MALE \n", + "123 17.3 187 3350 FEMALE \n", + "27 19.7 196 4400 MALE " ] }, - "execution_count": 11, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -992,28 +1147,40 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 19, "metadata": { "id": "6i6HkFJZa8na" }, "outputs": [ + { + "data": { + "text/html": [ + "Query job b396baed-6242-4478-9092-f5e86811b045 is DONE. 31.7 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/plain": [ "133 \n", "279 3150\n", "34 3400\n", - "208 3950\n", - "18 3800\n", "96 3600\n", - "64 2850\n", + "18 3800\n", + "208 3950\n", "310 3175\n", + "64 2850\n", "118 3550\n", "2 3075\n", "Name: body_mass_g, dtype: Int64" ] }, - "execution_count": 12, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1033,7 +1200,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 20, "metadata": { "id": "YKwCW7Nsavap" }, @@ -1042,7 +1209,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "average_body_mass: 4201.754385964911\n" + "average_body_mass: 4201.754385964913\n" ] } ], @@ -1062,11 +1229,23 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 21, "metadata": { "id": "4PyKMR61-Mjy" }, "outputs": [ + { + "data": { + "text/html": [ + "Query job fef05ee2-9690-41a4-bd35-7cded77310f2 is DONE. 15.6 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -1123,7 +1302,7 @@ "[3 rows x 1 columns]" ] }, - "execution_count": 14, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1154,7 +1333,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1183,11 +1362,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "id": "rSWTOG-vb2Fc" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job c7b6c009-d2c4-4739-a6f8-5ef51e6b1851 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "@bpd.remote_function(cloud_function_service_account=\"default\")\n", "def get_bucket(num: float) -> str:\n", @@ -1209,11 +1401,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "id": "6ejPXoyEQpWE" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloud Function Name projects/bigframes-dev/locations/us-central1/functions/bigframes-sessiondf1983-1d02aa9bc80939ba72e7ff69e37e27c8\n", + "Remote Function Name bigframes-dev._f36a8f778c434a1ec421979eaa3bf562a8561e38.bigframes_sessiondf1983_1d02aa9bc80939ba72e7ff69e37e27c8\n" + ] + } + ], "source": [ "CLOUD_FUNCTION_NAME = format(get_bucket.bigframes_cloud_function)\n", "print(\"Cloud Function Name \" + CLOUD_FUNCTION_NAME)\n", @@ -1232,11 +1433,110 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "id": "NxSd9WZFcIji" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
<th>body_mass_g</th>\n",
+       "      <th>body_mass_bucket</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>133</th>\n",
+       "      <td>&lt;NA&gt;</td>\n",
+       "      <td>NA</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>279</th>\n",
+       "      <td>3150</td>\n",
+       "      <td>below_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>34</th>\n",
+       "      <td>3400</td>\n",
+       "      <td>below_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>96</th>\n",
+       "      <td>3600</td>\n",
+       "      <td>at_or_above_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>18</th>\n",
+       "      <td>3800</td>\n",
+       "      <td>at_or_above_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>208</th>\n",
+       "      <td>3950</td>\n",
+       "      <td>at_or_above_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>310</th>\n",
+       "      <td>3175</td>\n",
+       "      <td>below_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>64</th>\n",
+       "      <td>2850</td>\n",
+       "      <td>below_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>118</th>\n",
+       "      <td>3550</td>\n",
+       "      <td>at_or_above_3500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3075</td>\n",
+       "      <td>below_3500</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>
" + ], + "text/plain": [ + " body_mass_g body_mass_bucket\n", + "133 NA\n", + "279 3150 below_3500\n", + "34 3400 below_3500\n", + "96 3600 at_or_above_3500\n", + "18 3800 at_or_above_3500\n", + "208 3950 at_or_above_3500\n", + "310 3175 below_3500\n", + "64 2850 below_3500\n", + "118 3550 at_or_above_3500\n", + "2 3075 below_3500" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "bq_df = bq_df.assign(body_mass_bucket=bq_df['body_mass_g'].apply(get_bucket))\n", "bq_df[['body_mass_g', 'body_mass_bucket']].peek(10)" @@ -1271,7 +1571,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -1281,7 +1581,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "id": "sx_vKniMq9ZX" }, @@ -1298,7 +1598,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "id": "_dTCXvCxtPw9" }, @@ -1314,7 +1614,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "id": "EDAIIfcpwNOF" }, @@ -1326,7 +1626,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "id": "QwumLUKmVpuH" }, @@ -1358,7 +1658,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.16" + "version": "3.10.15" } }, "nbformat": 4, From 5b4b250f3609264d77e59e7b1e99fcf27ed0e593 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 16:49:11 +0000 Subject: [PATCH 04/16] render bigquery sent events --- bigframes/formatting_helpers.py | 143 ++++++++++++++---------- tests/system/small/test_progress_bar.py | 18 --- 2 files changed, 83 insertions(+), 78 deletions(-) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index fb37278987..8ca0eaa426 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -17,6 +17,7 @@ from __future__ import annotations import datetime +import html import random from typing import Any, Optional, Type, TYPE_CHECKING, Union @@ -26,7 +27,6 @@ import humanize import IPython import IPython.display as display -import ipywidgets as widgets if TYPE_CHECKING: import bigframes.core.events @@ -62,39 +62,6 @@ def create_exception_with_feedback_link( return exception(constants.FEEDBACK_LINK) -def repr_query_job_html(query_job: Optional[bigquery.QueryJob]): - """Return query job in html format. - Args: - query_job (bigquery.QueryJob, Optional): - The job representing the execution of the query on the server. - Returns: - Pywidget html table. - """ - if query_job is None: - return display.HTML("No job information available") - if query_job.dry_run: - return display.HTML( - f"Computation deferred. Computation will process {get_formatted_bytes(query_job.total_bytes_processed)}" - ) - table_html = "" - table_html += "" - for key, value in query_job_prop_pairs.items(): - job_val = getattr(query_job, value) - if job_val is not None: - if key == "Job Id": # add link to job - table_html += f"""""" - elif key == "Slot Time": - table_html += ( - f"""""" - ) - elif key == "Bytes Processed": - table_html += f"""""" - else: - table_html += f"""""" - table_html += "
<tr><td>{key}</td><td><a target="_blank" href="{get_job_url(query_job)}">{job_val}</a></td></tr>"""
-            elif key == "Slot Time":
-                table_html += (
-                    f"""<tr><td>{key}</td><td>{get_formatted_time(job_val)}</td></tr>"""
-                )
-            elif key == "Bytes Processed":
-                table_html += f"""<tr><td>{key}</td><td>{get_formatted_bytes(job_val)}</td></tr>"""
-            else:
-                table_html += f"""<tr><td>{key}</td><td>{job_val}</td></tr>"""
-    table_html += "</table>
" - return widgets.HTML(table_html) - - def repr_query_job(query_job: Optional[bigquery.QueryJob]): """Return query job as a formatted string. Args: @@ -113,7 +80,11 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): if job_val is not None: res += "\n" if key == "Job Id": # add link to job - res += f"""Job url: {get_job_url(query_job)}""" + res += f"""Job url: {get_job_url( + project_id=query_job.project, + location=query_job.location, + job_id=query_job.job_id, + )}""" elif key == "Slot Time": res += f"""{key}: {get_formatted_time(job_val)}""" elif key == "Bytes Processed": @@ -125,14 +96,14 @@ def repr_query_job(query_job: Optional[bigquery.QueryJob]): current_display: Optional[display.HTML] = None current_display_id: Optional[str] = None -previous_message: str = "" +previous_display_html: str = "" def progress_callback( event: bigframes.core.events.Event, ): """Displays a progress bar while the query is running""" - global current_display, current_display_id + global current_display, current_display_id, previous_display_html import bigframes._config import bigframes.core.events @@ -148,6 +119,7 @@ def progress_callback( or current_display is None or current_display_id is None ): + previous_display_html = "" current_display_id = str(random.random()) current_display = display.HTML("Starting execution.") display.display( @@ -155,21 +127,23 @@ def progress_callback( display_id=current_display_id, ) - if isinstance(event, bigframes.core.events.ExecutionRunning): + if isinstance(event, bigframes.core.events.BigQuerySentEvent): + previous_display_html = render_bqquery_sent_event_html(event) display.update_display( - display.HTML("Execution happening."), + display.HTML(previous_display_html), display_id=current_display_id, ) elif isinstance(event, bigframes.core.events.ExecutionFinished): display.update_display( - display.HTML(f"{previous_message} Execution done."), + display.HTML(f"{previous_display_html} Execution done."), display_id=current_display_id, ) elif progress_bar == "terminal": if isinstance(event, bigframes.core.events.ExecutionStarted): print("Starting execution.") - elif isinstance(event, bigframes.core.events.ExecutionRunning): - print("Execution happening.") + elif isinstance(event, bigframes.core.events.BigQuerySentEvent): + message = render_bqquery_sent_event_plaintext(event) + print(message) elif isinstance(event, bigframes.core.events.ExecutionFinished): print("Execution done.") @@ -222,24 +196,25 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): raise -def get_job_url(query_job: GenericJob): +def get_job_url( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], +): """Return url to the query job in cloud console. - Args: - query_job (GenericJob): - The job representing the execution of the query on the server. + Returns: String url. 
""" - if ( - query_job.project is None - or query_job.location is None - or query_job.job_id is None - ): + if project_id is None or location is None or job_id is None: return None - return f"""https://console.cloud.google.com/bigquery?project={query_job.project}&j=bq:{query_job.location}:{query_job.job_id}&page=queryresults""" + return f"""https://console.cloud.google.com/bigquery?project={project_id}&j=bq:{location}:{job_id}&page=queryresults""" -def get_query_job_loading_html(query_job: bigquery.QueryJob): +def render_bqquery_sent_event_html( + event: bigframes.core.events.BigQuerySentEvent, +) -> str: """Return progress bar html string Args: query_job (bigquery.QueryJob): @@ -247,18 +222,58 @@ def get_query_job_loading_html(query_job: bigquery.QueryJob): Returns: Html string. """ - return f"""Query job {query_job.job_id} is {query_job.state}. {get_bytes_processed_string(query_job.total_bytes_processed)}Open Job""" + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f'Open Job' + else: + job_link = "" + + query_id = "" + if event.job_id: + query_id = f" with job ID {event.job_id}:{event.billing_project}.{event.job_id}" + elif event.request_id: + query_id = f" with request ID {event.job_id}:{event.billing_project}.{event.request_id}" + + query_text_details = f"
<details><summary>SQL</summary><pre>{html.escape(event.query)}</pre></details>
" + + return f""" + Query started{query_id}.{job_link}{query_text_details} + """ -def get_query_job_loading_string(query_job: bigquery.QueryJob): - """Return progress bar string + +def render_bqquery_sent_event_plaintext( + event: bigframes.core.events.BigQuerySentEvent, +) -> str: + """Return progress bar html string Args: query_job (bigquery.QueryJob): The job representing the execution of the query on the server. Returns: - String + Html string. """ - return f"""Query job {query_job.job_id} is {query_job.state}.{get_bytes_processed_string(query_job.total_bytes_processed)} \n{get_job_url(query_job)}""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f" Open job: {job_url}" + else: + job_link = "" + + query_id = "" + if event.job_id: + query_id = f" with job ID {event.job_id}:{event.billing_project}.{event.job_id}" + elif event.request_id: + query_id = f" with request ID {event.job_id}:{event.billing_project}.{event.request_id}" + + return f"Query started{query_id}.{job_link}" def get_base_job_loading_html(job: GenericJob): @@ -269,7 +284,11 @@ def get_base_job_loading_html(job: GenericJob): Returns: Html string. """ - return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. Open Job""" + return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. Open Job""" def get_base_job_loading_string(job: GenericJob): @@ -280,7 +299,11 @@ def get_base_job_loading_string(job: GenericJob): Returns: String """ - return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. \n{get_job_url(job)}""" + return f"""{job.job_type.capitalize()} job {job.job_id} is {job.state}. \n{get_job_url( + project_id=job.job_id, + location=job.location, + job_id=job.job_id, + )}""" def get_formatted_time(val): diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 8a323831b5..ccba186bae 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -115,24 +115,6 @@ def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): assert numLoadingMsg > 0 -def test_query_job_repr_html(penguins_df_default_index: bf.dataframe.DataFrame): - with bf.option_context("display.progress_bar", "terminal"): - penguins_df_default_index.to_pandas(allow_large_results=True) - query_job_repr = formatting_helpers.repr_query_job_html( - penguins_df_default_index.query_job - ).value - - string_checks = [ - "Job Id", - "Destination Table", - "Slot Time", - "Bytes Processed", - "Cache hit", - ] - for string in string_checks: - assert string in query_job_repr - - def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): penguins_df_default_index.to_pandas(allow_large_results=True) query_job_repr = formatting_helpers.repr_query_job( From 2370ea2646fab8aed8e31533e55cc06cd4cdb964 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Fri, 26 Sep 2025 12:42:18 -0500 Subject: [PATCH 05/16] Feat render more events (#2121) * feat: Render more BigQuery events in progress bar This change updates bigframes/formatting_helpers.py to render more event types from bigframes/core/events.py. Specifically, it adds rendering support for: - BigQueryRetryEvent - BigQueryReceivedEvent - BigQueryFinishedEvent - BigQueryUnknownEvent This provides users with more detailed feedback during query execution in both notebook (HTML) and terminal (plaintext) environments. 
* feat: Render more BigQuery events in progress bar This change updates bigframes/formatting_helpers.py to render more event types from bigframes/core/events.py. Specifically, it adds rendering support for: - BigQueryRetryEvent - BigQueryReceivedEvent - BigQueryFinishedEvent - BigQueryUnknownEvent This provides users with more detailed feedback during query execution in both notebook (HTML) and terminal (plaintext) environments. Unit tests have been added to verify the rendering of each new event type. --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> --- bigframes/formatting_helpers.py | 223 ++++++++++++++++++++++++++ tests/unit/test_formatting_helpers.py | 143 ++++++++++++++++- 2 files changed, 365 insertions(+), 1 deletion(-) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 8ca0eaa426..f78cd0ef62 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -133,6 +133,30 @@ def progress_callback( display.HTML(previous_display_html), display_id=current_display_id, ) + elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): + previous_display_html = render_bqquery_retry_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.BigQueryReceivedEvent): + previous_display_html = render_bqquery_received_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): + previous_display_html = render_bqquery_finished_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.BigQueryUnknownEvent): + previous_display_html = render_bqquery_unknown_event_html(event) + display.update_display( + display.HTML(previous_display_html), + display_id=current_display_id, + ) elif isinstance(event, bigframes.core.events.ExecutionFinished): display.update_display( display.HTML(f"{previous_display_html} Execution done."), @@ -144,6 +168,18 @@ def progress_callback( elif isinstance(event, bigframes.core.events.BigQuerySentEvent): message = render_bqquery_sent_event_plaintext(event) print(message) + elif isinstance(event, bigframes.core.events.BigQueryRetryEvent): + message = render_bqquery_retry_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.BigQueryReceivedEvent): + message = render_bqquery_received_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): + message = render_bqquery_finished_event_plaintext(event) + print(message) + elif isinstance(event, bigframes.core.events.BigQueryUnknownEvent): + message = render_bqquery_unknown_event_plaintext(event) + print(message) elif isinstance(event, bigframes.core.events.ExecutionFinished): print("Execution done.") @@ -276,6 +312,193 @@ def render_bqquery_sent_event_plaintext( return f"Query started{query_id}.{job_link}" +def render_bqquery_retry_event_html( + event: bigframes.core.events.BigQueryRetryEvent, +) -> str: + """Return progress bar html string for retry event.""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f'Open Job' + else: + job_link = "" + + query_id = "" + if event.job_id: + query_id = f" 
with job ID {event.billing_project}.{event.job_id}" + elif event.request_id: + query_id = f" with request ID {event.billing_project}.{event.request_id}" + + query_text_details = ( + f"
<details><summary>SQL</summary><pre>{html.escape(event.query)}</pre></details>
" + ) + + return f""" + Retrying query{query_id}.{job_link}{query_text_details} + """ + + +def render_bqquery_retry_event_plaintext( + event: bigframes.core.events.BigQueryRetryEvent, +) -> str: + """Return progress bar plaintext string for retry event.""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f" Open job: {job_url}" + else: + job_link = "" + + query_id = "" + if event.job_id: + query_id = f" with job ID {event.billing_project}.{event.job_id}" + elif event.request_id: + query_id = f" with request ID {event.billing_project}.{event.request_id}" + + return f"Retrying query{query_id}.{job_link}" + + +def render_bqquery_received_event_html( + event: bigframes.core.events.BigQueryReceivedEvent, +) -> str: + """Return progress bar html string for received event.""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f'Open Job' + else: + job_link = "" + + # Don't have billing project and job ID in the same string, as that's + # redundant with the job link. + job_id_str = "" + if event.job_id: + job_id_str = f" {event.job_id}" + + query_plan_details = "" + if event.query_plan: + plan_str = "\n".join([str(entry) for entry in event.query_plan]) + query_plan_details = f"
<details><summary>Query Plan</summary><pre>{html.escape(plan_str)}</pre></details>
" + + return f""" + Query job{job_id_str} is {event.state}.{job_link}{query_plan_details} + """ + + +def render_bqquery_received_event_plaintext( + event: bigframes.core.events.BigQueryReceivedEvent, +) -> str: + """Return progress bar plaintext string for received event.""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f" Open job: {job_url}" + else: + job_link = "" + + job_id_str = "" + if event.job_id: + job_id_str = f" {event.job_id}" + + return f"Query job{job_id_str} is {event.state}.{job_link}" + + +def render_bqquery_finished_event_html( + event: bigframes.core.events.BigQueryFinishedEvent, +) -> str: + """Return progress bar html string for finished event.""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f'Open Job' + else: + job_link = "" + + # Don't have billing project and job ID in the same string, as that's + # redundant with the job link. + job_id_str = "" + if event.job_id: + job_id_str = f" {event.job_id}" + + bytes_str = "" + if event.total_bytes_processed is not None: + bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)} processed." + + slot_time_str = "" + if event.slot_millis is not None: + slot_time = datetime.timedelta(milliseconds=event.slot_millis) + slot_time_str = f" Slot time: {humanize.naturaldelta(slot_time)}." + + return f""" + Query job{job_id_str} finished.{bytes_str}{slot_time_str}{job_link} + """ + + +def render_bqquery_finished_event_plaintext( + event: bigframes.core.events.BigQueryFinishedEvent, +) -> str: + """Return progress bar plaintext string for finished event.""" + + job_url = get_job_url( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + if job_url: + job_link = f" Open job: {job_url}" + else: + job_link = "" + + job_id_str = "" + if event.job_id: + job_id_str = f" {event.job_id}" + + bytes_str = "" + if event.total_bytes_processed is not None: + bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)} processed." + + slot_time_str = "" + if event.slot_millis is not None: + slot_time = datetime.timedelta(milliseconds=event.slot_millis) + slot_time_str = f" Slot time: {humanize.naturaldelta(slot_time)}." + + return f"Query job{job_id_str} finished.{bytes_str}{slot_time_str}{job_link}" + + +def render_bqquery_unknown_event_html( + event: bigframes.core.events.BigQueryUnknownEvent, +) -> str: + """Return progress bar html string for unknown event.""" + return "Received unknown event." + + +def render_bqquery_unknown_event_plaintext( + event: bigframes.core.events.BigQueryUnknownEvent, +) -> str: + """Return progress bar plaintext string for unknown event.""" + return "Received unknown event." + + def get_base_job_loading_html(job: GenericJob): """Return progress bar html string Args: diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index 588ef6e824..f50c63a049 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import datetime import unittest.mock as mock import bigframes_vendored.constants as constants @@ -19,6 +20,7 @@ import google.cloud.bigquery as bigquery import pytest +import bigframes.core.events as bfevents import bigframes.formatting_helpers as formatting_helpers import bigframes.version @@ -30,7 +32,7 @@ def test_wait_for_query_job_error_includes_feedback_link(): ) with pytest.raises(api_core_exceptions.BadRequest) as cap_exc: - formatting_helpers.wait_for_query_job(mock_query_job) + formatting_helpers.wait_for_job(mock_query_job) cap_exc.match("Test message 123.") cap_exc.match(constants.FEEDBACK_LINK) @@ -70,3 +72,142 @@ def test_get_formatted_bytes(test_input, expected): ) def test_get_formatted_time(test_input, expected): assert formatting_helpers.get_formatted_time(test_input) == expected + + +def test_render_bqquery_sent_event_html(): + event = bfevents.BigQuerySentEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + html = formatting_helpers.render_bqquery_sent_event_html(event) + assert "SELECT * FROM my_table" in html + assert "my-job-id" in html + assert "us-central1" in html + assert "my-project" in html + assert "
" in html + + +def test_render_bqquery_sent_event_plaintext(): + event = bfevents.BigQuerySentEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + text = formatting_helpers.render_bqquery_sent_event_plaintext(event) + assert "my-job-id" in text + assert "us-central1" in text + assert "my-project" in text + assert "SELECT * FROM my_table" not in text + + +def test_render_bqquery_retry_event_html(): + event = bfevents.BigQueryRetryEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + html = formatting_helpers.render_bqquery_retry_event_html(event) + assert "Retrying query" in html + assert "SELECT * FROM my_table" in html + assert "my-job-id" in html + assert "us-central1" in html + assert "my-project" in html + assert "
" in html + + +def test_render_bqquery_retry_event_plaintext(): + event = bfevents.BigQueryRetryEvent( + query="SELECT * FROM my_table", + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + ) + text = formatting_helpers.render_bqquery_retry_event_plaintext(event) + assert "Retrying query" in text + assert "my-job-id" in text + assert "us-central1" in text + assert "my-project" in text + assert "SELECT * FROM my_table" not in text + + +def test_render_bqquery_received_event_html(): + mock_plan_entry = mock.create_autospec( + bigquery.job.query.QueryPlanEntry, instance=True + ) + mock_plan_entry.__str__.return_value = "mocked plan" + event = bfevents.BigQueryReceivedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + state="RUNNING", + query_plan=[mock_plan_entry], + ) + html = formatting_helpers.render_bqquery_received_event_html(event) + assert "Query job" in html + assert "my-job-id" in html + assert "is RUNNING" in html + assert "
" in html + assert "mocked plan" in html + + +def test_render_bqquery_received_event_plaintext(): + event = bfevents.BigQueryReceivedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + state="RUNNING", + query_plan=[], + ) + text = formatting_helpers.render_bqquery_received_event_plaintext(event) + assert "Query job" in text + assert "my-job-id" in text + assert "is RUNNING" in text + assert "Query Plan" not in text + + +def test_render_bqquery_finished_event_html(): + event = bfevents.BigQueryFinishedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + total_bytes_processed=1000, + slot_millis=2000, + ) + html = formatting_helpers.render_bqquery_finished_event_html(event) + assert "Query job" in html + assert "my-job-id" in html + assert "finished" in html + assert "1.0 kB processed" in html + assert "Slot time: 2 seconds" in html + + +def test_render_bqquery_finished_event_plaintext(): + event = bfevents.BigQueryFinishedEvent( + job_id="my-job-id", + location="us-central1", + billing_project="my-project", + total_bytes_processed=1000, + slot_millis=2000, + ) + text = formatting_helpers.render_bqquery_finished_event_plaintext(event) + assert "Query job" in text + assert "my-job-id" in text + assert "finished" in text + assert "1.0 kB processed" in text + assert "Slot time: 2 seconds" in text + + +def test_render_bqquery_unknown_event_html(): + event = bfevents.BigQueryUnknownEvent(event=None) + html = formatting_helpers.render_bqquery_unknown_event_html(event) + assert "Received unknown event" in html + + +def test_render_bqquery_unknown_event_plaintext(): + event = bfevents.BigQueryUnknownEvent(event=None) + text = formatting_helpers.render_bqquery_unknown_event_plaintext(event) + assert "Received unknown event" in text From 4d9f37ada8e89e228412ab13d71b1853d2429a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 18:02:27 +0000 Subject: [PATCH 06/16] fix job links --- bigframes/formatting_helpers.py | 246 +++++++++++++------------- tests/unit/test_formatting_helpers.py | 21 +-- 2 files changed, 123 insertions(+), 144 deletions(-) diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index f78cd0ef62..63f129122f 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -151,12 +151,6 @@ def progress_callback( display.HTML(previous_display_html), display_id=current_display_id, ) - elif isinstance(event, bigframes.core.events.BigQueryUnknownEvent): - previous_display_html = render_bqquery_unknown_event_html(event) - display.update_display( - display.HTML(previous_display_html), - display_id=current_display_id, - ) elif isinstance(event, bigframes.core.events.ExecutionFinished): display.update_display( display.HTML(f"{previous_display_html} Execution done."), @@ -177,9 +171,6 @@ def progress_callback( elif isinstance(event, bigframes.core.events.BigQueryFinishedEvent): message = render_bqquery_finished_event_plaintext(event) print(message) - elif isinstance(event, bigframes.core.events.BigQueryUnknownEvent): - message = render_bqquery_unknown_event_plaintext(event) - print(message) elif isinstance(event, bigframes.core.events.ExecutionFinished): print("Execution done.") @@ -232,6 +223,57 @@ def wait_for_job(job: GenericJob, progress_bar: Optional[str] = None): raise +def render_query_references( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], + request_id: Optional[str], +) -> str: + query_id = "" 
+ if job_id: + query_id = f" with job ID {project_id}:{location}.{job_id}" + elif request_id: + query_id = f" with request ID {project_id}:{location}.{request_id}" + return query_id + + +def render_job_link_html( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], +) -> str: + job_url = get_job_url( + project_id=project_id, + location=location, + job_id=job_id, + ) + if job_url: + job_link = f' Open Job' + else: + job_link = "" + return job_link + + +def render_job_link_plaintext( + *, + project_id: Optional[str], + location: Optional[str], + job_id: Optional[str], +) -> str: + job_url = get_job_url( + project_id=project_id, + location=location, + job_id=job_id, + ) + if job_url: + job_link = f" Open Job: {job_url}" + else: + job_link = "" + return job_link + + def get_job_url( *, project_id: Optional[str], @@ -259,22 +301,17 @@ def render_bqquery_sent_event_html( Html string. """ - job_url = get_job_url( + job_link = render_job_link_html( project_id=event.billing_project, location=event.location, job_id=event.job_id, ) - if job_url: - job_link = f'Open Job' - else: - job_link = "" - - query_id = "" - if event.job_id: - query_id = f" with job ID {event.job_id}:{event.billing_project}.{event.job_id}" - elif event.request_id: - query_id = f" with request ID {event.job_id}:{event.billing_project}.{event.request_id}" - + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) query_text_details = f"
<details><summary>SQL</summary><pre>{html.escape(event.query)}</pre></details>
" return f""" @@ -293,21 +330,17 @@ def render_bqquery_sent_event_plaintext( Html string. """ - job_url = get_job_url( + job_link = render_job_link_plaintext( project_id=event.billing_project, location=event.location, job_id=event.job_id, ) - if job_url: - job_link = f" Open job: {job_url}" - else: - job_link = "" - - query_id = "" - if event.job_id: - query_id = f" with job ID {event.job_id}:{event.billing_project}.{event.job_id}" - elif event.request_id: - query_id = f" with request ID {event.job_id}:{event.billing_project}.{event.request_id}" + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) return f"Query started{query_id}.{job_link}" @@ -317,25 +350,18 @@ def render_bqquery_retry_event_html( ) -> str: """Return progress bar html string for retry event.""" - job_url = get_job_url( + job_link = render_job_link_html( project_id=event.billing_project, location=event.location, job_id=event.job_id, ) - if job_url: - job_link = f'Open Job' - else: - job_link = "" - - query_id = "" - if event.job_id: - query_id = f" with job ID {event.billing_project}.{event.job_id}" - elif event.request_id: - query_id = f" with request ID {event.billing_project}.{event.request_id}" - - query_text_details = ( - f"
<details><summary>SQL</summary><pre>{html.escape(event.query)}</pre></details>
" + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, ) + query_text_details = f"
<details><summary>SQL</summary><pre>{html.escape(event.query)}</pre></details>
" return f""" Retrying query{query_id}.{job_link}{query_text_details} @@ -347,22 +373,17 @@ def render_bqquery_retry_event_plaintext( ) -> str: """Return progress bar plaintext string for retry event.""" - job_url = get_job_url( + job_link = render_job_link_plaintext( project_id=event.billing_project, location=event.location, job_id=event.job_id, ) - if job_url: - job_link = f" Open job: {job_url}" - else: - job_link = "" - - query_id = "" - if event.job_id: - query_id = f" with job ID {event.billing_project}.{event.job_id}" - elif event.request_id: - query_id = f" with request ID {event.billing_project}.{event.request_id}" - + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=event.request_id, + ) return f"Retrying query{query_id}.{job_link}" @@ -371,21 +392,17 @@ def render_bqquery_received_event_html( ) -> str: """Return progress bar html string for received event.""" - job_url = get_job_url( + job_link = render_job_link_html( project_id=event.billing_project, location=event.location, job_id=event.job_id, ) - if job_url: - job_link = f'Open Job' - else: - job_link = "" - - # Don't have billing project and job ID in the same string, as that's - # redundant with the job link. - job_id_str = "" - if event.job_id: - job_id_str = f" {event.job_id}" + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) query_plan_details = "" if event.query_plan: @@ -393,7 +410,7 @@ def render_bqquery_received_event_html( query_plan_details = f"
<details><summary>Query Plan</summary><pre>{html.escape(plan_str)}</pre></details>
" return f""" - Query job{job_id_str} is {event.state}.{job_link}{query_plan_details} + Query{query_id} is {event.state}.{job_link}{query_plan_details} """ @@ -402,21 +419,18 @@ def render_bqquery_received_event_plaintext( ) -> str: """Return progress bar plaintext string for received event.""" - job_url = get_job_url( + job_link = render_job_link_plaintext( project_id=event.billing_project, location=event.location, job_id=event.job_id, ) - if job_url: - job_link = f" Open job: {job_url}" - else: - job_link = "" - - job_id_str = "" - if event.job_id: - job_id_str = f" {event.job_id}" - - return f"Query job{job_id_str} is {event.state}.{job_link}" + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) + return f"Query{query_id} is {event.state}.{job_link}" def render_bqquery_finished_event_html( @@ -424,22 +438,6 @@ def render_bqquery_finished_event_html( ) -> str: """Return progress bar html string for finished event.""" - job_url = get_job_url( - project_id=event.billing_project, - location=event.location, - job_id=event.job_id, - ) - if job_url: - job_link = f'Open Job' - else: - job_link = "" - - # Don't have billing project and job ID in the same string, as that's - # redundant with the job link. - job_id_str = "" - if event.job_id: - job_id_str = f" {event.job_id}" - bytes_str = "" if event.total_bytes_processed is not None: bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)} processed." @@ -449,8 +447,19 @@ def render_bqquery_finished_event_html( slot_time = datetime.timedelta(milliseconds=event.slot_millis) slot_time_str = f" Slot time: {humanize.naturaldelta(slot_time)}." + job_link = render_job_link_html( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) return f""" - Query job{job_id_str} finished.{bytes_str}{slot_time_str}{job_link} + Query{query_id} finished.{bytes_str}{slot_time_str}{job_link} """ @@ -459,20 +468,6 @@ def render_bqquery_finished_event_plaintext( ) -> str: """Return progress bar plaintext string for finished event.""" - job_url = get_job_url( - project_id=event.billing_project, - location=event.location, - job_id=event.job_id, - ) - if job_url: - job_link = f" Open job: {job_url}" - else: - job_link = "" - - job_id_str = "" - if event.job_id: - job_id_str = f" {event.job_id}" - bytes_str = "" if event.total_bytes_processed is not None: bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)} processed." @@ -482,21 +477,18 @@ def render_bqquery_finished_event_plaintext( slot_time = datetime.timedelta(milliseconds=event.slot_millis) slot_time_str = f" Slot time: {humanize.naturaldelta(slot_time)}." - return f"Query job{job_id_str} finished.{bytes_str}{slot_time_str}{job_link}" - - -def render_bqquery_unknown_event_html( - event: bigframes.core.events.BigQueryUnknownEvent, -) -> str: - """Return progress bar html string for unknown event.""" - return "Received unknown event." - - -def render_bqquery_unknown_event_plaintext( - event: bigframes.core.events.BigQueryUnknownEvent, -) -> str: - """Return progress bar plaintext string for unknown event.""" - return "Received unknown event." 
+ job_link = render_job_link_plaintext( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + ) + query_id = render_query_references( + project_id=event.billing_project, + location=event.location, + job_id=event.job_id, + request_id=None, + ) + return f"Query{query_id} finished.{bytes_str}{slot_time_str}{job_link}" def get_base_job_loading_html(job: GenericJob): diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index f50c63a049..d66867bc35 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import datetime import unittest.mock as mock import bigframes_vendored.constants as constants @@ -147,7 +146,7 @@ def test_render_bqquery_received_event_html(): query_plan=[mock_plan_entry], ) html = formatting_helpers.render_bqquery_received_event_html(event) - assert "Query job" in html + assert "Query with job" in html assert "my-job-id" in html assert "is RUNNING" in html assert "
" in html @@ -163,7 +162,7 @@ def test_render_bqquery_received_event_plaintext(): query_plan=[], ) text = formatting_helpers.render_bqquery_received_event_plaintext(event) - assert "Query job" in text + assert "Query with job" in text assert "my-job-id" in text assert "is RUNNING" in text assert "Query Plan" not in text @@ -178,7 +177,7 @@ def test_render_bqquery_finished_event_html(): slot_millis=2000, ) html = formatting_helpers.render_bqquery_finished_event_html(event) - assert "Query job" in html + assert "Query with job" in html assert "my-job-id" in html assert "finished" in html assert "1.0 kB processed" in html @@ -194,20 +193,8 @@ def test_render_bqquery_finished_event_plaintext(): slot_millis=2000, ) text = formatting_helpers.render_bqquery_finished_event_plaintext(event) - assert "Query job" in text + assert "Query with job" in text assert "my-job-id" in text assert "finished" in text assert "1.0 kB processed" in text assert "Slot time: 2 seconds" in text - - -def test_render_bqquery_unknown_event_html(): - event = bfevents.BigQueryUnknownEvent(event=None) - html = formatting_helpers.render_bqquery_unknown_event_html(event) - assert "Received unknown event" in html - - -def test_render_bqquery_unknown_event_plaintext(): - event = bfevents.BigQueryUnknownEvent(event=None) - text = formatting_helpers.render_bqquery_unknown_event_plaintext(event) - assert "Received unknown event" in text From fc1e630e855727904d1a1aa6ce58b508d1c52586 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 19:08:47 +0000 Subject: [PATCH 07/16] fix system tests --- tests/system/small/test_progress_bar.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index ccba186bae..8c2e969227 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -23,7 +23,7 @@ import bigframes.formatting_helpers as formatting_helpers from bigframes.session import MAX_INLINE_DF_BYTES -job_load_message_regex = r"\w+ job [\w-]+ is \w+\." +job_load_message_regex = r"Query.*with" EXPECTED_DRY_RUN_MESSAGE = "Computation deferred. 
Computation will process" @@ -56,7 +56,7 @@ def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, with bf.option_context("display.progress_bar", "terminal"): penguins_df_default_index["body_mass_g"].head(10).mean() - assert capsys.readouterr().out == "" + assert_loading_msg_exist(capsys.readouterr().out) def test_progress_bar_scalar_allow_large_results( @@ -100,19 +100,19 @@ def test_progress_bar_load_jobs( capsys.readouterr() # clear output session.read_csv(path) - assert_loading_msg_exist(capsys.readouterr().out) + assert_loading_msg_exist(capsys.readouterr().out, pattern="Load") -def assert_loading_msg_exist(capystOut: str, pattern=job_load_message_regex): - numLoadingMsg = 0 - lines = capystOut.split("\n") +def assert_loading_msg_exist(capstdout: str, pattern=job_load_message_regex): + num_loading_msg = 0 + lines = capstdout.split("\n") lines = [line for line in lines if len(line) > 0] assert len(lines) > 0 for line in lines: - if re.match(pattern, line) is not None: - numLoadingMsg += 1 - assert numLoadingMsg > 0 + if re.search(pattern, line) is not None: + num_loading_msg += 1 + assert num_loading_msg > 0 def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame): From d1a7f70ac311dc9acbcaf9a4c65ae28e76334ecd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 19:20:35 +0000 Subject: [PATCH 08/16] fix mypy --- tests/unit/session/test_read_gbq_table.py | 67 +++++++++++++++++++---- 1 file changed, 56 insertions(+), 11 deletions(-) diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 0c67e05813..3436239a66 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -24,13 +24,12 @@ @pytest.mark.parametrize( - ("index_cols", "primary_keys", "values_distinct", "expected"), + ("index_cols", "primary_keys", "expected"), ( - (["col1", "col2"], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")), + (["col1", "col2"], ["col1", "col2", "col3"], ("col1", "col2", "col3")), ( ["col1", "col2", "col3"], ["col1", "col2", "col3"], - True, ("col1", "col2", "col3"), ), ( @@ -39,15 +38,14 @@ "col3", "col2", ], - True, ("col2", "col3"), ), - (["col1", "col2"], [], False, ()), - ([], ["col1", "col2", "col3"], False, ("col1", "col2", "col3")), - ([], [], False, ()), + (["col1", "col2"], [], ()), + ([], ["col1", "col2", "col3"], ("col1", "col2", "col3")), + ([], [], ()), ), ) -def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expected): +def test_infer_unique_columns(index_cols, primary_keys, expected): """If a primary key is set on the table, we use that as the index column by default, no error should be raised in this case. 
@@ -79,6 +77,49 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte "columns": primary_keys, }, } + + result = bf_read_gbq_table.infer_unique_columns(table, index_cols) + + assert result == expected + + +@pytest.mark.parametrize( + ("index_cols", "values_distinct", "expected"), + ( + ( + ["col1", "col2", "col3"], + True, + ("col1", "col2", "col3"), + ), + ( + ["col2", "col3", "col1"], + True, + ("col2", "col3", "col1"), + ), + (["col1", "col2"], False, ()), + ([], False, ()), + ), +) +def test_check_if_index_columns_are_unique(index_cols, values_distinct, expected): + table = google.cloud.bigquery.Table.from_api_repr( + { + "tableReference": { + "projectId": "my-project", + "datasetId": "my_dataset", + "tableId": "my_table", + }, + "clustering": { + "fields": ["col1", "col2"], + }, + }, + ) + table.schema = ( + google.cloud.bigquery.SchemaField("col1", "INT64"), + google.cloud.bigquery.SchemaField("col2", "INT64"), + google.cloud.bigquery.SchemaField("col3", "INT64"), + google.cloud.bigquery.SchemaField("col4", "INT64"), + ) + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" session = mocks.create_bigquery_session( @@ -87,13 +128,17 @@ def test_infer_unique_columns(index_cols, primary_keys, values_distinct, expecte # Mock bqclient _after_ creating session to override its mocks. bqclient.get_table.return_value = table - bqclient.query_and_wait.side_effect = None - bqclient.query_and_wait.return_value = ( + bqclient._query_and_wait_bigframes.side_effect = None + bqclient._query_and_wait_bigframes.return_value = ( {"total_count": 3, "distinct_count": 3 if values_distinct else 2}, ) table._properties["location"] = session._location - result = bf_read_gbq_table.infer_unique_columns(bqclient, table, index_cols) + result = bf_read_gbq_table.check_if_index_columns_are_unique( + bqclient=bqclient, + table=table, + index_cols=index_cols, + ) assert result == expected From 253de65f310839538c6adc71a7232d9450b39287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 19:37:33 +0000 Subject: [PATCH 09/16] fix unit tests --- bigframes/dataframe.py | 82 +++++++++++++++--------------- bigframes/testing/mocks.py | 1 + tests/unit/session/test_session.py | 15 ++++-- 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index eb5ed997a1..8c0e8389a4 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -4630,24 +4630,24 @@ def to_string( ) -> str | None: return self.to_pandas(allow_large_results=allow_large_results).to_string( buf, - columns, # type: ignore - col_space, - header, # type: ignore - index, - na_rep, - formatters, - float_format, - sparsify, - index_names, - justify, - max_rows, - max_cols, - show_dimensions, - decimal, - line_width, - min_rows, - max_colwidth, - encoding, + columns=columns, # type: ignore + col_space=col_space, + header=header, # type: ignore + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + justify=justify, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width, + min_rows=min_rows, + max_colwidth=max_colwidth, + encoding=encoding, ) def to_html( @@ -4680,28 +4680,28 @@ def to_html( ) -> str: return self.to_pandas(allow_large_results=allow_large_results).to_html( buf, - columns, # type: ignore - col_space, - header, - index, - na_rep, - 
formatters, - float_format, - sparsify, - index_names, - justify, # type: ignore - max_rows, - max_cols, - show_dimensions, - decimal, - bold_rows, - classes, - escape, - notebook, - border, - table_id, - render_links, - encoding, + columns=columns, # type: ignore + col_space=col_space, + header=header, + index=index, + na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, + index_names=index_names, + justify=justify, # type: ignore + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + bold_rows=bold_rows, + classes=classes, + escape=escape, + notebook=notebook, + border=border, + table_id=table_id, + render_links=render_links, + encoding=encoding, ) def to_markdown( @@ -4713,7 +4713,7 @@ def to_markdown( allow_large_results: Optional[bool] = None, **kwargs, ) -> str | None: - return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode, index, **kwargs) # type: ignore + return self.to_pandas(allow_large_results=allow_large_results).to_markdown(buf, mode=mode, index=index, **kwargs) # type: ignore def to_pickle(self, path, *, allow_large_results=None, **kwargs) -> None: return self.to_pandas(allow_large_results=allow_large_results).to_pickle( diff --git a/bigframes/testing/mocks.py b/bigframes/testing/mocks.py index 8d9997b1df..ff210419fd 100644 --- a/bigframes/testing/mocks.py +++ b/bigframes/testing/mocks.py @@ -143,6 +143,7 @@ def query_and_wait_mock(query, *args, job_config=None, **kwargs): bqclient.query.side_effect = query_mock bqclient.query_and_wait.side_effect = query_and_wait_mock + bqclient._query_and_wait_bigframes.side_effect = query_and_wait_mock clients_provider = mock.create_autospec(bigframes.session.clients.ClientsProvider) type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient) diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index 63c82eb30f..d05957b941 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -247,7 +247,7 @@ def test_read_gbq_cached_table(): table, ) - session.bqclient.query_and_wait = mock.MagicMock( + session.bqclient._query_and_wait_bigframes = mock.MagicMock( return_value=({"total_count": 3, "distinct_count": 2},) ) session.bqclient.get_table.return_value = table @@ -278,7 +278,7 @@ def test_read_gbq_cached_table_doesnt_warn_for_anonymous_tables_and_doesnt_inclu table, ) - session.bqclient.query_and_wait = mock.MagicMock( + session.bqclient._query_and_wait_bigframes = mock.MagicMock( return_value=({"total_count": 3, "distinct_count": 2},) ) session.bqclient.get_table.return_value = table @@ -306,7 +306,9 @@ def test_default_index_warning_raised_by_read_gbq(table): bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - bqclient.query_and_wait.return_value = ({"total_count": 3, "distinct_count": 2},) + bqclient._query_and_wait_bigframes.return_value = ( + {"total_count": 3, "distinct_count": 2}, + ) session = mocks.create_bigquery_session( bqclient=bqclient, # DefaultIndexWarning is only relevant for strict mode. 
@@ -333,7 +335,9 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_sequential_int64 bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - bqclient.query_and_wait.return_value = ({"total_count": 4, "distinct_count": 3},) + bqclient._query_and_wait_bigframes.return_value = ( + {"total_count": 4, "distinct_count": 3}, + ) session = mocks.create_bigquery_session( bqclient=bqclient, # DefaultIndexWarning is only relevant for strict mode. @@ -382,7 +386,7 @@ def test_default_index_warning_not_raised_by_read_gbq_index_col_columns( bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) bqclient.project = "test-project" bqclient.get_table.return_value = table - bqclient.query_and_wait.return_value = ( + bqclient._query_and_wait_bigframes.return_value = ( {"total_count": total_count, "distinct_count": distinct_count}, ) session = mocks.create_bigquery_session( @@ -492,6 +496,7 @@ def query_mock(query, *args, **kwargs): return session_query_mock(query, *args, **kwargs) session.bqclient.query_and_wait = query_mock + session.bqclient._query_and_wait_bigframes = query_mock def get_table_mock(table_ref): table = google.cloud.bigquery.Table( From 5fec058288f537334c6b925537f4e361e68864b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 26 Sep 2025 19:45:42 +0000 Subject: [PATCH 10/16] support more event types --- bigframes/session/_io/bigquery/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index 58d0da696a..b3218dd122 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -241,7 +241,13 @@ def add_and_trim_labels(job_config): def publish_bq_event(event): - if isinstance(event, google.cloud.bigquery._job_helpers.QuerySentEvent): + if isinstance(event, google.cloud.bigquery._job_helpers.QueryFinishedEvent): + bf_event = bigframes.core.events.BigQueryFinishedEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QueryReceivedEvent): + bf_event = bigframes.core.events.BigQueryReceivedEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QueryRetryEvent): + bf_event = bigframes.core.events.BigQueryRetryEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QuerySentEvent): bf_event = bigframes.core.events.BigQuerySentEvent.from_bqclient(event) else: bf_event = bigframes.core.events.BigQueryUnknownEvent(event) From 0008e9917164dadfa0f15c87cf81656d139b055a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 29 Sep 2025 16:51:14 +0000 Subject: [PATCH 11/16] move publisher to session --- bigframes/blob/_functions.py | 1 + bigframes/core/events.py | 4 -- bigframes/functions/_function_client.py | 1 + bigframes/functions/function.py | 16 +++++++- bigframes/session/__init__.py | 14 +++++-- bigframes/session/_io/bigquery/__init__.py | 41 ++++++++++++------- .../session/_io/bigquery/read_gbq_table.py | 8 ++++ bigframes/session/bq_caching_executor.py | 15 +++++-- bigframes/session/direct_gbq_execution.py | 9 +++- bigframes/session/loader.py | 19 +++++---- .../small/functions/test_remote_function.py | 4 ++ tests/unit/session/test_io_bigquery.py | 2 + 12 files changed, 98 insertions(+), 36 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py 
index 8d1ca38e62..8dd9328fb8 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -99,6 +99,7 @@ def _create_udf(self): project=None, timeout=None, query_with_job=True, + publisher=self._session._publisher, ) return udf_name diff --git a/bigframes/core/events.py b/bigframes/core/events.py index 5a44fe3255..9b9132df90 100644 --- a/bigframes/core/events.py +++ b/bigframes/core/events.py @@ -68,10 +68,6 @@ def send(self, event: Event): callback(event) -publisher = Publisher() -publisher.subscribe(bigframes.formatting_helpers.progress_callback) - - class Event: pass diff --git a/bigframes/functions/_function_client.py b/bigframes/functions/_function_client.py index 641bf52dc9..8a88a14040 100644 --- a/bigframes/functions/_function_client.py +++ b/bigframes/functions/_function_client.py @@ -145,6 +145,7 @@ def _create_bq_function(self, create_function_ddl: str) -> None: timeout=None, metrics=None, query_with_job=True, + publisher=self._session._publisher, ) logger.info(f"Created bigframes function {query_job.ddl_target_routine}") diff --git a/bigframes/functions/function.py b/bigframes/functions/function.py index 99b89131e7..242daf7525 100644 --- a/bigframes/functions/function.py +++ b/bigframes/functions/function.py @@ -219,7 +219,13 @@ def __call__(self, *args, **kwargs): args_string = ", ".join(map(bf_sql.simple_literal, args)) sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" - iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + iter, job = bf_io_bigquery.start_query_with_client( + self._session.bqclient, + sql=sql, + query_with_job=True, + job_config=bigquery.QueryJobConfig(), + publisher=self._session._publisher, + ) # type: ignore return list(iter.to_arrow().to_pydict().values())[0][0] @property @@ -297,7 +303,13 @@ def __call__(self, *args, **kwargs): args_string = ", ".join(map(bf_sql.simple_literal, args)) sql = f"SELECT `{str(self._udf_def.routine_ref)}`({args_string})" - iter, job = bf_io_bigquery.start_query_with_client(self._session.bqclient, sql=sql, query_with_job=True, job_config=bigquery.QueryJobConfig()) # type: ignore + iter, job = bf_io_bigquery.start_query_with_client( + self._session.bqclient, + sql=sql, + query_with_job=True, + job_config=bigquery.QueryJobConfig(), + publisher=self._session._publisher, + ) # type: ignore return list(iter.to_arrow().to_pydict().values())[0][0] @property diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index eed5f8496b..8ae80bab2e 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -67,10 +67,9 @@ import bigframes.constants import bigframes.core from bigframes.core import blocks, log_adapter, utils +import bigframes.core.events import bigframes.core.pyformat - -# Even though the ibis.backends.bigquery import is unused, it's needed -# to register new and replacement ops with the Ibis BigQuery backend. +import bigframes.formatting_helpers import bigframes.functions._function_session as bff_session import bigframes.functions.function as bff from bigframes.session import bigquery_session, bq_caching_executor, executor @@ -137,6 +136,11 @@ def __init__( _warn_if_bf_version_is_obsolete() + # Publisher needs to be created before the other objects, especially + # the executors, because they access it. 
+ self._publisher = bigframes.core.events.Publisher() + self._publisher.subscribe(bigframes.formatting_helpers.progress_callback) + if context is None: context = bigquery_options.BigQueryOptions() @@ -251,6 +255,7 @@ def __init__( scan_index_uniqueness=self._strictly_ordered, force_total_order=self._strictly_ordered, metrics=self._metrics, + publisher=self._publisher, ) self._executor: executor.Executor = bq_caching_executor.BigQueryCachingExecutor( bqclient=self._clients_provider.bqclient, @@ -260,6 +265,7 @@ def __init__( strictly_ordered=self._strictly_ordered, metrics=self._metrics, enable_polars_execution=context.enable_polars_execution, + publisher=self._publisher, ) def __del__(self): @@ -2150,6 +2156,7 @@ def _start_query_ml_ddl( timeout=None, query_with_job=True, job_retry=third_party_gcb_retry.DEFAULT_ML_JOB_RETRY, + publisher=self._publisher, ) return iterator, query_job @@ -2177,6 +2184,7 @@ def _create_object_table(self, path: str, connection: str) -> str: project=None, timeout=None, query_with_job=True, + publisher=self._publisher, ) return table diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index b3218dd122..cdf9c78ca0 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -240,19 +240,22 @@ def add_and_trim_labels(job_config): ) -def publish_bq_event(event): - if isinstance(event, google.cloud.bigquery._job_helpers.QueryFinishedEvent): - bf_event = bigframes.core.events.BigQueryFinishedEvent.from_bqclient(event) - elif isinstance(event, google.cloud.bigquery._job_helpers.QueryReceivedEvent): - bf_event = bigframes.core.events.BigQueryReceivedEvent.from_bqclient(event) - elif isinstance(event, google.cloud.bigquery._job_helpers.QueryRetryEvent): - bf_event = bigframes.core.events.BigQueryRetryEvent.from_bqclient(event) - elif isinstance(event, google.cloud.bigquery._job_helpers.QuerySentEvent): - bf_event = bigframes.core.events.BigQuerySentEvent.from_bqclient(event) - else: - bf_event = bigframes.core.events.BigQueryUnknownEvent(event) +def create_bq_event_callback(publisher): + def publish_bq_event(event): + if isinstance(event, google.cloud.bigquery._job_helpers.QueryFinishedEvent): + bf_event = bigframes.core.events.BigQueryFinishedEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QueryReceivedEvent): + bf_event = bigframes.core.events.BigQueryReceivedEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QueryRetryEvent): + bf_event = bigframes.core.events.BigQueryRetryEvent.from_bqclient(event) + elif isinstance(event, google.cloud.bigquery._job_helpers.QuerySentEvent): + bf_event = bigframes.core.events.BigQuerySentEvent.from_bqclient(event) + else: + bf_event = bigframes.core.events.BigQueryUnknownEvent(event) - bigframes.core.events.publisher.send(bf_event) + publisher.send(bf_event) + + return publish_bq_event @overload @@ -266,6 +269,7 @@ def start_query_with_client( timeout: Optional[float], metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[True], + publisher: bigframes.core.events.Publisher, ) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: ... 
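# create_bq_event_callback above replaces the old module-level publisher with
# a closure: the returned callback captures one Publisher instance, so every
# BigQuery client callback is tied to the session that created it. A tiny
# stand-alone sketch of the same closure-factory pattern:
def make_callback(sink):
    def on_event(event):
        # `sink` is captured from the enclosing scope; each factory call
        # yields a callback bound to a different sink.
        sink.append(event)

    return on_event


session_a_events: list = []
session_b_events: list = []
make_callback(session_a_events)("sent")
make_callback(session_b_events)("finished")
assert session_a_events == ["sent"]
assert session_b_events == ["finished"]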
@@ -281,6 +285,7 @@ def start_query_with_client( timeout: Optional[float], metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[False], + publisher: bigframes.core.events.Publisher, ) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: ... @@ -297,6 +302,7 @@ def start_query_with_client( metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[True], job_retry: google.api_core.retry.Retry, + publisher: bigframes.core.events.Publisher, ) -> Tuple[google.cloud.bigquery.table.RowIterator, bigquery.QueryJob]: ... @@ -313,6 +319,7 @@ def start_query_with_client( metrics: Optional[bigframes.session.metrics.ExecutionMetrics], query_with_job: Literal[False], job_retry: google.api_core.retry.Retry, + publisher: bigframes.core.events.Publisher, ) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: ... @@ -332,6 +339,7 @@ def start_query_with_client( # https://github.com/googleapis/python-bigquery/pull/2256 merged, likely # version 3.36.0 or later. job_retry: google.api_core.retry.Retry = third_party_gcb_retry.DEFAULT_JOB_RETRY, + publisher: bigframes.core.events.Publisher, ) -> Tuple[google.cloud.bigquery.table.RowIterator, Optional[bigquery.QueryJob]]: """ Starts query job and waits for results. @@ -350,7 +358,7 @@ def start_query_with_client( project=project, api_timeout=timeout, job_retry=job_retry, - callback=publish_bq_event, + callback=create_bq_event_callback(publisher), ) if metrics is not None: metrics.count_job_stats(row_iterator=results_iterator) @@ -370,7 +378,7 @@ def start_query_with_client( raise if not query_job.configuration.dry_run: - bigframes.core.events.publisher.send( + publisher.send( bigframes.core.events.BigQuerySentEvent( sql, billing_project=query_job.project, @@ -381,7 +389,7 @@ def start_query_with_client( ) results_iterator = query_job.result() if not query_job.configuration.dry_run: - bigframes.core.events.publisher.send( + publisher.send( bigframes.core.events.BigQueryFinishedEvent( billing_project=query_job.project, location=query_job.location, @@ -436,6 +444,8 @@ def create_bq_dataset_reference( bq_client: bigquery.Client, location: Optional[str] = None, project: Optional[str] = None, + *, + publisher: bigframes.core.events.Publisher, ) -> bigquery.DatasetReference: """Create and identify dataset(s) for temporary BQ resources. @@ -467,6 +477,7 @@ def create_bq_dataset_reference( timeout=None, metrics=None, query_with_job=True, + publisher=publisher, ) # The anonymous dataset is used by BigQuery to write query results and diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index 434196e921..f8a379aee9 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -28,6 +28,7 @@ import google.cloud.bigquery as bigquery import google.cloud.bigquery.table +import bigframes.core.events import bigframes.exceptions as bfe import bigframes.session._io.bigquery @@ -43,6 +44,7 @@ def get_table_metadata( *, cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], use_cache: bool = True, + publisher: bigframes.core.events.Publisher, ) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]: """Get the table metadata, either from cache or via REST API.""" @@ -59,6 +61,7 @@ def get_table_metadata( # Don't warn, because that will already have been taken care of. 
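# The overload stack above uses typing.Literal on query_with_job so that type
# checkers can tell whether the returned job may be None: with
# query_with_job=True the QueryJob is guaranteed, with False it is Optional.
# A minimal sketch of that overload pattern:
from typing import Literal, Optional, Tuple, overload


@overload
def run(sql: str, *, query_with_job: Literal[True]) -> Tuple[list, str]:
    ...


@overload
def run(sql: str, *, query_with_job: Literal[False]) -> Tuple[list, Optional[str]]:
    ...


def run(sql: str, *, query_with_job: bool) -> Tuple[list, Optional[str]]:
    # One runtime implementation; the overloads above only narrow the types.
    return ([], "job-123" if query_with_job else None)


rows, job = run("SELECT 1", query_with_job=True)  # `job` is typed as str here
assert job == "job-123"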
should_warn=False, should_dry_run=False, + publisher=publisher, ): # This warning should only happen if the cached snapshot_time will # have any effect on bigframes (b/437090788). For example, with @@ -108,6 +111,7 @@ def is_time_travel_eligible( *, should_warn: bool, should_dry_run: bool, + publisher: bigframes.core.events.Publisher, ): """Check if a table is eligible to use time-travel. @@ -184,6 +188,7 @@ def is_time_travel_eligible( timeout=None, metrics=None, query_with_job=False, + publisher=publisher, ) return True @@ -235,6 +240,8 @@ def check_if_index_columns_are_unique( bqclient: bigquery.Client, table: google.cloud.bigquery.table.Table, index_cols: List[str], + *, + publisher: bigframes.core.events.Publisher, ) -> Tuple[str, ...]: import bigframes.core.sql import bigframes.session._io.bigquery @@ -252,6 +259,7 @@ def check_if_index_columns_are_unique( project=None, metrics=None, query_with_job=False, + publisher=publisher, ) row = next(iter(results)) diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index 1b8ad3eece..f424505de0 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -140,6 +140,7 @@ def __init__( strictly_ordered: bool = True, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, enable_polars_execution: bool = False, + publisher: bigframes.core.events.Publisher, ): self.bqclient = bqclient self.storage_manager = storage_manager @@ -149,6 +150,9 @@ def __init__( self.loader = loader self.bqstoragereadclient = bqstoragereadclient self._enable_polars_execution = enable_polars_execution + self._publisher = publisher + + # TODO(tswast): Send events from semi-executors, too. self._semi_executors: Sequence[semi_executor.SemiExecutor] = ( read_api_execution.ReadApiSemiExecutor( bqstoragereadclient=bqstoragereadclient, @@ -188,7 +192,7 @@ def execute( array_value: bigframes.core.ArrayValue, execution_spec: ex_spec.ExecutionSpec, ) -> executor.ExecuteResult: - bigframes.core.events.publisher.send(bigframes.core.events.ExecutionStarted()) + self._publisher.send(bigframes.core.events.ExecutionStarted()) # TODO: Support export jobs in combination with semi executors if execution_spec.destination_spec is None: @@ -198,7 +202,7 @@ def execute( plan, ordered=execution_spec.ordered, peek=execution_spec.peek ) if maybe_result: - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished( result=maybe_result, ) @@ -212,7 +216,7 @@ def execute( ) # separate path for export_gbq, as it has all sorts of annoying logic, such as possibly running as dml result = self._export_gbq(array_value, execution_spec.destination_spec) - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished( result=result, ) @@ -232,7 +236,7 @@ def execute( if isinstance(execution_spec.destination_spec, ex_spec.GcsOutputSpec): self._export_result_gcs(result, execution_spec.destination_spec) - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished( result=result, ) @@ -261,6 +265,7 @@ def _export_result_gcs( location=None, timeout=None, query_with_job=True, + publisher=self._publisher, ) def _maybe_find_existing_table( @@ -419,6 +424,7 @@ def _run_execute_query( location=None, timeout=None, query_with_job=True, + publisher=self._publisher, ) else: return bq_io.start_query_with_client( @@ -430,6 +436,7 @@ def _run_execute_query( location=None, timeout=None, 
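# check_if_index_columns_are_unique above decides whether the candidate index
# columns form a key by comparing the table's row count against the distinct
# count over those columns (the mocked {"total_count", "distinct_count"} rows
# in the unit tests reflect this shape). A hedged sketch of SQL with that
# shape; the real helper builds its query via bigframes.core.sql, so the
# exact text may differ:
from typing import List


def uniqueness_sql_sketch(table_id: str, index_cols: List[str]) -> str:
    cols = ", ".join(f"`{col}`" for col in index_cols)
    return (
        f"SELECT COUNT(*) AS total_count,"
        f" (SELECT COUNT(*) FROM (SELECT DISTINCT {cols} FROM `{table_id}`))"
        f" AS distinct_count"
        f" FROM `{table_id}`"
    )


sql = uniqueness_sql_sketch("proj.ds.tbl", ["rowindex"])
assert "total_count" in sql and "distinct_count" in sql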
query_with_job=False, + publisher=self._publisher, ) except google.api_core.exceptions.BadRequest as e: diff --git a/bigframes/session/direct_gbq_execution.py b/bigframes/session/direct_gbq_execution.py index 7538c9300f..9e7db87301 100644 --- a/bigframes/session/direct_gbq_execution.py +++ b/bigframes/session/direct_gbq_execution.py @@ -21,6 +21,7 @@ from bigframes.core import compile, nodes from bigframes.core.compile import sqlglot +import bigframes.core.events from bigframes.session import executor, semi_executor import bigframes.session._io.bigquery as bq_io @@ -31,7 +32,11 @@ # reference for validating more complex executors. class DirectGbqExecutor(semi_executor.SemiExecutor): def __init__( - self, bqclient: bigquery.Client, compiler: Literal["ibis", "sqlglot"] = "ibis" + self, + bqclient: bigquery.Client, + compiler: Literal["ibis", "sqlglot"] = "ibis", + *, + publisher: bigframes.core.events.Publisher, ): self.bqclient = bqclient self._compile_fn = ( @@ -39,6 +44,7 @@ def __init__( if compiler == "ibis" else sqlglot.SQLGlotCompiler()._compile_sql ) + self._publisher = publisher def execute( self, @@ -83,4 +89,5 @@ def _run_execute_query( timeout=None, metrics=None, query_with_job=False, + publisher=self._publisher, ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index d0bc5c908a..558021d9a6 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -263,6 +263,8 @@ def __init__( scan_index_uniqueness: bool, force_total_order: bool, metrics: Optional[bigframes.session.metrics.ExecutionMetrics] = None, + *, + publisher: bigframes.core.events.Publisher, ): self._bqclient = bqclient self._write_client = write_client @@ -274,6 +276,7 @@ def __init__( bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table] ] = {} self._metrics = metrics + self._publisher = publisher # Unfortunate circular reference, but need to pass reference when constructing objects self._session = session self._clock = session_time.BigQuerySyncedClock(bqclient) @@ -783,7 +786,7 @@ def read_gbq_table( # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique if not primary_key and self._scan_index_uniqueness and index_cols: if publish_execution: - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionStarted(), ) primary_key = bf_read_gbq_table.check_if_index_columns_are_unique( @@ -792,7 +795,7 @@ def read_gbq_table( index_cols=index_cols, ) if publish_execution: - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished(), ) @@ -1015,7 +1018,7 @@ def read_gbq_query( # We want to make sure we show progress when we actually do execute a # query. Since we have got this far, we know it's not a dry run. 
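# read_gbq_table above wraps its uniqueness scan in ExecutionStarted /
# ExecutionFinished only when publish_execution is set, which keeps nested
# calls from emitting duplicate progress events. One way to express that
# gating as a helper (a sketch, not how loader.py actually structures it):
import contextlib


@contextlib.contextmanager
def maybe_publish_execution(publish_event, enabled: bool):
    if enabled:
        publish_event("ExecutionStarted")
    try:
        yield
    finally:
        if enabled:
            publish_event("ExecutionFinished")


log: list = []
with maybe_publish_execution(log.append, enabled=True):
    pass  # ... scan index columns here ...
assert log == ["ExecutionStarted", "ExecutionFinished"]

log.clear()
with maybe_publish_execution(log.append, enabled=False):
    pass  # nested call: no duplicate events
assert log == []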
- bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionStarted(), ) @@ -1080,7 +1083,7 @@ def read_gbq_query( index_col=index_col, columns=columns, ) - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished(), ) return df @@ -1092,7 +1095,7 @@ def read_gbq_query( query_job_for_metrics, session=self._session, ) - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished(), ) return df @@ -1110,7 +1113,7 @@ def read_gbq_query( query_job_for_metrics, session=self._session, ) - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished(), ) return df @@ -1134,7 +1137,7 @@ def read_gbq_query( # max_results and filters are omitted because they are already # handled by to_query(), above. ) - bigframes.core.events.publisher.send( + self._publisher.send( bigframes.core.events.ExecutionFinished(), ) return df @@ -1239,6 +1242,7 @@ def _start_query_with_job_optional( project=None, metrics=None, query_with_job=False, + publisher=self._publisher, ) return rows @@ -1264,6 +1268,7 @@ def _start_query_with_job( project=None, metrics=None, query_with_job=True, + publisher=self._publisher, ) return query_job diff --git a/tests/system/small/functions/test_remote_function.py b/tests/system/small/functions/test_remote_function.py index 28fab19144..f81bdf8931 100644 --- a/tests/system/small/functions/test_remote_function.py +++ b/tests/system/small/functions/test_remote_function.py @@ -27,6 +27,7 @@ import bigframes import bigframes.clients +import bigframes.core.events import bigframes.dtypes import bigframes.exceptions from bigframes.functions import _utils as bff_utils @@ -769,6 +770,7 @@ def test_read_gbq_function_runs_existing_udf_array_output(session, routine_id_un timeout=None, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) func = session.read_gbq_function(routine_id_unique) @@ -807,6 +809,7 @@ def test_read_gbq_function_runs_existing_udf_2_params_array_output( timeout=None, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) func = session.read_gbq_function(routine_id_unique) @@ -847,6 +850,7 @@ def test_read_gbq_function_runs_existing_udf_4_params_array_output( timeout=None, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) func = session.read_gbq_function(routine_id_unique) diff --git a/tests/unit/session/test_io_bigquery.py b/tests/unit/session/test_io_bigquery.py index c451d74d0f..57ac3d88f7 100644 --- a/tests/unit/session/test_io_bigquery.py +++ b/tests/unit/session/test_io_bigquery.py @@ -22,6 +22,7 @@ import bigframes from bigframes.core import log_adapter +import bigframes.core.events import bigframes.pandas as bpd import bigframes.session._io.bigquery as io_bq from bigframes.testing import mocks @@ -236,6 +237,7 @@ def test_start_query_with_client_labels_length_limit_met( timeout=timeout, metrics=None, query_with_job=True, + publisher=bigframes.core.events.Publisher(), ) assert job_config.labels is not None From 1cf0dfde197a7a22c59af98675cac7d489843198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 29 Sep 2025 18:01:21 +0000 Subject: [PATCH 12/16] fix remaining mypy errors --- bigframes/pandas/__init__.py | 1 + bigframes/session/__init__.py | 1 + bigframes/session/anonymous_dataset.py | 6 +++++- bigframes/session/loader.py | 3 +++ tests/system/small/engines/conftest.py | 9 ++++++--- 
tests/system/small/engines/test_aggregation.py | 16 +++++++++++++--- tests/system/small/engines/test_windowing.py | 8 ++++++-- tests/unit/session/test_read_gbq_table.py | 1 + 8 files changed, 36 insertions(+), 9 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 2ea10132bc..2455637b0a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -291,6 +291,7 @@ def clean_up_by_session_id( session.bqclient, location=location, project=project, + publisher=session._publisher, ) bigframes.session._io.bigquery.delete_tables_matching_session_id( diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 8ae80bab2e..3a7d42982b 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -233,6 +233,7 @@ def __init__( location=self._location, session_id=self._session_id, kms_key=self._bq_kms_key_name, + publisher=self._publisher, ) # Session temp tables don't support specifying kms key, so use anon dataset if kms key specified self._session_resource_manager = ( diff --git a/bigframes/session/anonymous_dataset.py b/bigframes/session/anonymous_dataset.py index ec624d4eb4..3c1757806b 100644 --- a/bigframes/session/anonymous_dataset.py +++ b/bigframes/session/anonymous_dataset.py @@ -20,6 +20,7 @@ import google.cloud.bigquery as bigquery from bigframes import constants +import bigframes.core.events from bigframes.session import temporary_storage import bigframes.session._io.bigquery as bf_io_bigquery @@ -37,10 +38,12 @@ def __init__( location: str, session_id: str, *, - kms_key: Optional[str] = None + kms_key: Optional[str] = None, + publisher: bigframes.core.events.Publisher, ): self.bqclient = bqclient self._location = location + self._publisher = publisher self.session_id = session_id self._table_ids: List[bigquery.TableReference] = [] @@ -62,6 +65,7 @@ def dataset(self) -> bigquery.DatasetReference: self._datset_ref = bf_io_bigquery.create_bq_dataset_reference( self.bqclient, location=self._location, + publisher=self._publisher, ) return self._datset_ref diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 558021d9a6..e7a69b50e6 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -647,6 +647,7 @@ def read_gbq_table( bq_time=self._clock.get_time(), cache=self._df_snapshot, use_cache=use_cache, + publisher=self._publisher, ) if table.location.casefold() != self._storage_manager.location.casefold(): @@ -767,6 +768,7 @@ def read_gbq_table( filter_str, should_warn=True, should_dry_run=True, + publisher=self._publisher, ) # ---------------------------- @@ -793,6 +795,7 @@ def read_gbq_table( self._bqclient, table=table, index_cols=index_cols, + publisher=self._publisher, ) if publish_execution: self._publisher.send( diff --git a/tests/system/small/engines/conftest.py b/tests/system/small/engines/conftest.py index 9699cc6a61..a775731cde 100644 --- a/tests/system/small/engines/conftest.py +++ b/tests/system/small/engines/conftest.py @@ -19,7 +19,7 @@ import pytest import bigframes -from bigframes.core import ArrayValue, local_data +from bigframes.core import ArrayValue, events, local_data from bigframes.session import ( direct_gbq_execution, local_scan_executor, @@ -50,11 +50,14 @@ def engine(request, bigquery_client: bigquery.Client) -> semi_executor.SemiExecu return local_scan_executor.LocalScanExecutor() if request.param == "polars": return polars_executor.PolarsExecutor() + publisher = events.Publisher() if request.param == "bq": - return 
direct_gbq_execution.DirectGbqExecutor(bigquery_client) + return direct_gbq_execution.DirectGbqExecutor( + bigquery_client, publisher=publisher + ) if request.param == "bq-sqlglot": return direct_gbq_execution.DirectGbqExecutor( - bigquery_client, compiler="sqlglot" + bigquery_client, compiler="sqlglot", publisher=publisher ) raise ValueError(f"Unrecognized param: {request.param}") diff --git a/tests/system/small/engines/test_aggregation.py b/tests/system/small/engines/test_aggregation.py index 9b4efe8cbe..4225d5dff7 100644 --- a/tests/system/small/engines/test_aggregation.py +++ b/tests/system/small/engines/test_aggregation.py @@ -15,7 +15,14 @@ from google.cloud import bigquery import pytest -from bigframes.core import agg_expressions, array_value, expression, identifiers, nodes +from bigframes.core import ( + agg_expressions, + array_value, + events, + expression, + identifiers, + nodes, +) import bigframes.operations.aggregations as agg_ops from bigframes.session import direct_gbq_execution, polars_executor from bigframes.testing.engine_utils import assert_equivalence_execution @@ -93,9 +100,12 @@ def test_sql_engines_median_op_aggregates( scalars_array_value, agg_ops.MedianOp(), ).node - left_engine = direct_gbq_execution.DirectGbqExecutor(bigquery_client) + publisher = events.Publisher() + left_engine = direct_gbq_execution.DirectGbqExecutor( + bigquery_client, publisher=publisher + ) right_engine = direct_gbq_execution.DirectGbqExecutor( - bigquery_client, compiler="sqlglot" + bigquery_client, compiler="sqlglot", publisher=publisher ) assert_equivalence_execution(node, left_engine, right_engine) diff --git a/tests/system/small/engines/test_windowing.py b/tests/system/small/engines/test_windowing.py index f344a3b60a..a34d7b8f38 100644 --- a/tests/system/small/engines/test_windowing.py +++ b/tests/system/small/engines/test_windowing.py @@ -18,6 +18,7 @@ from bigframes.core import ( agg_expressions, array_value, + events, expression, identifiers, nodes, @@ -64,8 +65,11 @@ def test_engines_with_rows_window( skip_reproject_unsafe=False, ) - bq_executor = direct_gbq_execution.DirectGbqExecutor(bigquery_client) + publisher = events.Publisher() + bq_executor = direct_gbq_execution.DirectGbqExecutor( + bigquery_client, publisher=publisher + ) bq_sqlgot_executor = direct_gbq_execution.DirectGbqExecutor( - bigquery_client, compiler="sqlglot" + bigquery_client, compiler="sqlglot", publisher=publisher ) assert_equivalence_execution(window_node, bq_executor, bq_sqlgot_executor) diff --git a/tests/unit/session/test_read_gbq_table.py b/tests/unit/session/test_read_gbq_table.py index 3436239a66..d21f0000a9 100644 --- a/tests/unit/session/test_read_gbq_table.py +++ b/tests/unit/session/test_read_gbq_table.py @@ -139,6 +139,7 @@ def test_check_if_index_columns_are_unique(index_cols, values_distinct, expected bqclient=bqclient, table=table, index_cols=index_cols, + publisher=session._publisher, ) assert result == expected From a6600f8c117fb23aeb022eaa346264034d0017da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 29 Sep 2025 18:55:30 +0000 Subject: [PATCH 13/16] update text --- bigframes/core/events.py | 5 +++++ bigframes/formatting_helpers.py | 23 +++++++++++++---------- bigframes/session/__init__.py | 6 +++++- tests/unit/test_formatting_helpers.py | 13 ++++++------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/bigframes/core/events.py b/bigframes/core/events.py index 9b9132df90..b748ba6fe2 100644 --- a/bigframes/core/events.py +++ b/bigframes/core/events.py 
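# The engine tests above now construct their executors with an explicit
# publisher, sharing one Publisher between the two engines under comparison.
# As a pytest fixture, that wiring might look like the following (a sketch
# with a hypothetical fixture name; `bigquery_client` is assumed to be
# provided elsewhere in conftest):
import pytest

from bigframes.core import events
from bigframes.session import direct_gbq_execution


@pytest.fixture
def engine_pair(bigquery_client):
    publisher = events.Publisher()
    left = direct_gbq_execution.DirectGbqExecutor(
        bigquery_client, publisher=publisher
    )
    right = direct_gbq_execution.DirectGbqExecutor(
        bigquery_client, compiler="sqlglot", publisher=publisher
    )
    return left, right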
@@ -72,6 +72,11 @@ class Event: pass +@dataclasses.dataclass(frozen=True) +class SessionClosed(Event): + session_id: str + + class ExecutionStarted(Event): pass diff --git a/bigframes/formatting_helpers.py b/bigframes/formatting_helpers.py index 63f129122f..f75394c47d 100644 --- a/bigframes/formatting_helpers.py +++ b/bigframes/formatting_helpers.py @@ -121,7 +121,7 @@ def progress_callback( ): previous_display_html = "" current_display_id = str(random.random()) - current_display = display.HTML("Starting execution.") + current_display = display.HTML("Starting.") display.display( current_display, display_id=current_display_id, @@ -153,7 +153,12 @@ def progress_callback( ) elif isinstance(event, bigframes.core.events.ExecutionFinished): display.update_display( - display.HTML(f"{previous_display_html} Execution done."), + display.HTML(f"✅ Completed. {previous_display_html}"), + display_id=current_display_id, + ) + elif isinstance(event, bigframes.core.events.SessionClosed): + display.update_display( + display.HTML(f"Session {event.session_id} closed."), display_id=current_display_id, ) elif progress_bar == "terminal": @@ -231,9 +236,7 @@ def render_query_references( request_id: Optional[str], ) -> str: query_id = "" - if job_id: - query_id = f" with job ID {project_id}:{location}.{job_id}" - elif request_id: + if request_id and not job_id: query_id = f" with request ID {project_id}:{location}.{request_id}" return query_id @@ -250,7 +253,7 @@ def render_job_link_html( job_id=job_id, ) if job_url: - job_link = f' Open Job' + job_link = f' [Job {project_id}:{location}.{job_id} details]' else: job_link = "" return job_link @@ -268,7 +271,7 @@ def render_job_link_plaintext( job_id=job_id, ) if job_url: - job_link = f" Open Job: {job_url}" + job_link = f" Job {project_id}:{location}.{job_id} details: {job_url}" else: job_link = "" return job_link @@ -440,12 +443,12 @@ def render_bqquery_finished_event_html( bytes_str = "" if event.total_bytes_processed is not None: - bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)} processed." + bytes_str = f" {humanize.naturalsize(event.total_bytes_processed)}" slot_time_str = "" if event.slot_millis is not None: slot_time = datetime.timedelta(milliseconds=event.slot_millis) - slot_time_str = f" Slot time: {humanize.naturaldelta(slot_time)}." 
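# The finished-event renderer above leans on `humanize` to turn raw byte and
# slot-millisecond counts into the friendly text asserted by the unit tests
# ("1.0 kB", "2 seconds"). For example:
import datetime

import humanize

total_bytes_processed = 1000
slot_millis = 2000

bytes_str = humanize.naturalsize(total_bytes_processed)
slot_time_str = humanize.naturaldelta(datetime.timedelta(milliseconds=slot_millis))
assert bytes_str == "1.0 kB"
assert slot_time_str == "2 seconds"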
+ slot_time_str = f" in {humanize.naturaldelta(slot_time)} of slot time" job_link = render_job_link_html( project_id=event.billing_project, @@ -459,7 +462,7 @@ def render_bqquery_finished_event_html( request_id=None, ) return f""" - Query{query_id} finished.{bytes_str}{slot_time_str}{job_link} + Query processed{bytes_str}{slot_time_str}{query_id}.{job_link} """ diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 3a7d42982b..670bd16490 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -377,10 +377,14 @@ def close(self): remote_function_session = getattr(self, "_function_session", None) if remote_function_session: - self._function_session.clean_up( + remote_function_session.clean_up( self.bqclient, self.cloudfunctionsclient, self.session_id ) + publisher_session = getattr(self, "_publisher", None) + if publisher_session: + publisher_session.send(self.session_id) + @overload def read_gbq( # type: ignore[overload-overlap] self, diff --git a/tests/unit/test_formatting_helpers.py b/tests/unit/test_formatting_helpers.py index d66867bc35..9dc1379496 100644 --- a/tests/unit/test_formatting_helpers.py +++ b/tests/unit/test_formatting_helpers.py @@ -146,7 +146,7 @@ def test_render_bqquery_received_event_html(): query_plan=[mock_plan_entry], ) html = formatting_helpers.render_bqquery_received_event_html(event) - assert "Query with job" in html + assert "Query" in html assert "my-job-id" in html assert "is RUNNING" in html assert "
" in html @@ -162,7 +162,7 @@ def test_render_bqquery_received_event_plaintext(): query_plan=[], ) text = formatting_helpers.render_bqquery_received_event_plaintext(event) - assert "Query with job" in text + assert "Query" in text assert "my-job-id" in text assert "is RUNNING" in text assert "Query Plan" not in text @@ -177,11 +177,10 @@ def test_render_bqquery_finished_event_html(): slot_millis=2000, ) html = formatting_helpers.render_bqquery_finished_event_html(event) - assert "Query with job" in html + assert "Query" in html assert "my-job-id" in html - assert "finished" in html - assert "1.0 kB processed" in html - assert "Slot time: 2 seconds" in html + assert "processed 1.0 kB" in html + assert "2 seconds of slot time" in html def test_render_bqquery_finished_event_plaintext(): @@ -193,7 +192,7 @@ def test_render_bqquery_finished_event_plaintext(): slot_millis=2000, ) text = formatting_helpers.render_bqquery_finished_event_plaintext(event) - assert "Query with job" in text + assert "Query" in text assert "my-job-id" in text assert "finished" in text assert "1.0 kB processed" in text From b35015ebd1c8b2b24793b3a0ae1a350c3e217e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Thu, 2 Oct 2025 21:55:44 +0000 Subject: [PATCH 14/16] add explicit unsubscribe --- bigframes/core/events.py | 89 +++++++++++++--------- bigframes/session/__init__.py | 4 +- bigframes/session/_io/bigquery/__init__.py | 6 +- bigframes/session/bq_caching_executor.py | 8 +- bigframes/session/loader.py | 14 ++-- 5 files changed, 72 insertions(+), 49 deletions(-) diff --git a/bigframes/core/events.py b/bigframes/core/events.py index b748ba6fe2..893be59477 100644 --- a/bigframes/core/events.py +++ b/bigframes/core/events.py @@ -16,56 +16,70 @@ import dataclasses import datetime -import threading -from typing import List, Optional -import weakref +from typing import Any, Callable, Optional, Set +import uuid import google.cloud.bigquery._job_helpers import google.cloud.bigquery.job.query import google.cloud.bigquery.table -import bigframes.formatting_helpers import bigframes.session.executor -@dataclasses.dataclass(frozen=True) class Subscriber: - callback_ref: weakref.ref - # TODO(tswast): Add block_id to allow filter in context managers. + def __init__(self, callback: Callable[[Event], None], *, publisher: Publisher): + self._publisher = publisher + self._callback = callback + self._subscriber_id = str(uuid.uuid4()) + + def __call__(self, *args, **kwargs): + return self._callback(*args, **kwargs) + + def __hash__(self) -> int: + return hash(self._subscriber_id) + + def __eq__(self, value: object): + if not isinstance(value, Subscriber): + return NotImplemented + return value._subscriber_id == self._subscriber_id + + def close(self): + self._publisher.unsubscribe(self) + del self._publisher + del self._callback + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + if exc_value is not None: + self( + UnknownErrorEvent( + exc_type=exc_type, + exc_value=exc_value, + traceback=traceback, + ) + ) + self.close() class Publisher: def __init__(self): - self._subscribers: List[Subscriber] = [] - self._subscribers_lock = threading.Lock() - - def subscribe(self, callback): - subscriber = Subscriber(callback_ref=weakref.ref(callback)) - - with self._subscribers_lock: - # TODO(tswast): Add block_id to allow filter in context managers. 
- self._subscribers.append(subscriber) - - def send(self, event: Event): - to_delete = [] - to_call = [] + self._subscribers: Set[Subscriber] = set() - with self._subscribers_lock: - for sid, subscriber in enumerate(self._subscribers): - callback = subscriber.callback_ref() + def subscribe(self, callback: Callable[[Event], None]) -> Subscriber: + # TODO(b/448176657): figure out how to handle subscribers/publishers in + # a background thread. Maybe subscribers should be thread-local? + subscriber = Subscriber(callback, publisher=self) + self._subscribers.add(subscriber) + return subscriber - if callback is None: - to_delete.append(sid) - else: - # TODO(tswast): Add if statement for block_id to allow filter - # in context managers. - to_call.append(callback) + def unsubscribe(self, subscriber: Subscriber): + self._subscribers.remove(subscriber) - for sid in reversed(to_delete): - del self._subscribers[sid] - - for callback in to_call: - callback(event) + def publish(self, event: Event): + for subscriber in self._subscribers: + subscriber(event) class Event: @@ -90,6 +104,13 @@ class ExecutionFinished(Event): result: Optional[bigframes.session.executor.ExecuteResult] = None +@dataclasses.dataclass(frozen=True) +class UnknownErrorEvent(Event): + exc_type: Any + exc_value: Any + traceback: Any + + @dataclasses.dataclass(frozen=True) class BigQuerySentEvent(ExecutionRunning): """Query sent to BigQuery.""" diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 670bd16490..4c308984bf 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -383,7 +383,9 @@ def close(self): publisher_session = getattr(self, "_publisher", None) if publisher_session: - publisher_session.send(self.session_id) + publisher_session.publish( + bigframes.core.events.SessionClosed(self.session_id) + ) @overload def read_gbq( # type: ignore[overload-overlap] diff --git a/bigframes/session/_io/bigquery/__init__.py b/bigframes/session/_io/bigquery/__init__.py index cdf9c78ca0..aa56dc0040 100644 --- a/bigframes/session/_io/bigquery/__init__.py +++ b/bigframes/session/_io/bigquery/__init__.py @@ -253,7 +253,7 @@ def publish_bq_event(event): else: bf_event = bigframes.core.events.BigQueryUnknownEvent(event) - publisher.send(bf_event) + publisher.publish(bf_event) return publish_bq_event @@ -378,7 +378,7 @@ def start_query_with_client( raise if not query_job.configuration.dry_run: - publisher.send( + publisher.publish( bigframes.core.events.BigQuerySentEvent( sql, billing_project=query_job.project, @@ -389,7 +389,7 @@ def start_query_with_client( ) results_iterator = query_job.result() if not query_job.configuration.dry_run: - publisher.send( + publisher.publish( bigframes.core.events.BigQueryFinishedEvent( billing_project=query_job.project, location=query_job.location, diff --git a/bigframes/session/bq_caching_executor.py b/bigframes/session/bq_caching_executor.py index f424505de0..e98f053a90 100644 --- a/bigframes/session/bq_caching_executor.py +++ b/bigframes/session/bq_caching_executor.py @@ -192,7 +192,7 @@ def execute( array_value: bigframes.core.ArrayValue, execution_spec: ex_spec.ExecutionSpec, ) -> executor.ExecuteResult: - self._publisher.send(bigframes.core.events.ExecutionStarted()) + self._publisher.publish(bigframes.core.events.ExecutionStarted()) # TODO: Support export jobs in combination with semi executors if execution_spec.destination_spec is None: @@ -202,7 +202,7 @@ def execute( plan, ordered=execution_spec.ordered, peek=execution_spec.peek ) if maybe_result: - 
self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished( result=maybe_result, ) @@ -216,7 +216,7 @@ def execute( ) # separate path for export_gbq, as it has all sorts of annoying logic, such as possibly running as dml result = self._export_gbq(array_value, execution_spec.destination_spec) - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished( result=result, ) @@ -236,7 +236,7 @@ def execute( if isinstance(execution_spec.destination_spec, ex_spec.GcsOutputSpec): self._export_result_gcs(result, execution_spec.destination_spec) - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished( result=result, ) diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index e7a69b50e6..940fdc1352 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -788,7 +788,7 @@ def read_gbq_table( # If non in strict ordering mode, don't go through overhead of scanning index column(s) to determine if unique if not primary_key and self._scan_index_uniqueness and index_cols: if publish_execution: - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionStarted(), ) primary_key = bf_read_gbq_table.check_if_index_columns_are_unique( @@ -798,7 +798,7 @@ def read_gbq_table( publisher=self._publisher, ) if publish_execution: - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished(), ) @@ -1021,7 +1021,7 @@ def read_gbq_query( # We want to make sure we show progress when we actually do execute a # query. Since we have got this far, we know it's not a dry run. - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionStarted(), ) @@ -1086,7 +1086,7 @@ def read_gbq_query( index_col=index_col, columns=columns, ) - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished(), ) return df @@ -1098,7 +1098,7 @@ def read_gbq_query( query_job_for_metrics, session=self._session, ) - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished(), ) return df @@ -1116,7 +1116,7 @@ def read_gbq_query( query_job_for_metrics, session=self._session, ) - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished(), ) return df @@ -1140,7 +1140,7 @@ def read_gbq_query( # max_results and filters are omitted because they are already # handled by to_query(), above. ) - self._publisher.send( + self._publisher.publish( bigframes.core.events.ExecutionFinished(), ) return df From e7ca461ec01f6915e6218bd3614f68d62c9cce43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Fri, 3 Oct 2025 15:41:43 +0000 Subject: [PATCH 15/16] fix presubmits --- tests/system/small/test_progress_bar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/system/small/test_progress_bar.py b/tests/system/small/test_progress_bar.py index 8c2e969227..0c9c4070f4 100644 --- a/tests/system/small/test_progress_bar.py +++ b/tests/system/small/test_progress_bar.py @@ -23,7 +23,7 @@ import bigframes.formatting_helpers as formatting_helpers from bigframes.session import MAX_INLINE_DF_BYTES -job_load_message_regex = r"Query.*with" +job_load_message_regex = r"Query" EXPECTED_DRY_RUN_MESSAGE = "Computation deferred. 
Computation will process" From 7edbb0add7261590361de545c1d1ad88652c66be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a?= Date: Mon, 6 Oct 2025 21:38:51 +0000 Subject: [PATCH 16/16] add lock for publisher and publish temp table creations --- bigframes/core/events.py | 15 ++++--- bigframes/session/__init__.py | 1 + bigframes/session/bigquery_session.py | 61 +++++++++++++++++++++----- tests/system/small/test_bq_sessions.py | 10 +++-- 4 files changed, 68 insertions(+), 19 deletions(-) diff --git a/bigframes/core/events.py b/bigframes/core/events.py index 893be59477..d0e5f7ad69 100644 --- a/bigframes/core/events.py +++ b/bigframes/core/events.py @@ -16,6 +16,7 @@ import dataclasses import datetime +import threading from typing import Any, Callable, Optional, Set import uuid @@ -30,7 +31,7 @@ class Subscriber: def __init__(self, callback: Callable[[Event], None], *, publisher: Publisher): self._publisher = publisher self._callback = callback - self._subscriber_id = str(uuid.uuid4()) + self._subscriber_id = uuid.uuid4() def __call__(self, *args, **kwargs): return self._callback(*args, **kwargs) @@ -65,21 +66,25 @@ def __exit__(self, exc_type, exc_value, traceback): class Publisher: def __init__(self): + self._subscribers_lock = threading.Lock() self._subscribers: Set[Subscriber] = set() def subscribe(self, callback: Callable[[Event], None]) -> Subscriber: # TODO(b/448176657): figure out how to handle subscribers/publishers in # a background thread. Maybe subscribers should be thread-local? subscriber = Subscriber(callback, publisher=self) - self._subscribers.add(subscriber) + with self._subscribers_lock: + self._subscribers.add(subscriber) return subscriber def unsubscribe(self, subscriber: Subscriber): - self._subscribers.remove(subscriber) + with self._subscribers_lock: + self._subscribers.remove(subscriber) def publish(self, event: Event): - for subscriber in self._subscribers: - subscriber(event) + with self._subscribers_lock: + for subscriber in self._subscribers: + subscriber(event) class Event: diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4c308984bf..6c90838b17 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -240,6 +240,7 @@ def __init__( bigquery_session.SessionResourceManager( self.bqclient, self._location, + publisher=self._publisher, ) if (self._bq_kms_key_name is None) else None diff --git a/bigframes/session/bigquery_session.py b/bigframes/session/bigquery_session.py index 883087df07..99c13007d8 100644 --- a/bigframes/session/bigquery_session.py +++ b/bigframes/session/bigquery_session.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import annotations + import datetime import logging import threading @@ -23,7 +25,9 @@ import google.cloud.bigquery as bigquery from bigframes.core.compile import googlesql +import bigframes.core.events from bigframes.session import temporary_storage +import bigframes.session._io.bigquery as bfbqio KEEPALIVE_QUERY_TIMEOUT_SECONDS = 5.0 @@ -38,12 +42,19 @@ class SessionResourceManager(temporary_storage.TemporaryStorageManager): Responsible for allocating and cleaning up temporary gbq tables used by a BigFrames session. 
""" - def __init__(self, bqclient: bigquery.Client, location: str): + def __init__( + self, + bqclient: bigquery.Client, + location: str, + *, + publisher: bigframes.core.events.Publisher, + ): self.bqclient = bqclient self._location = location self._session_id: Optional[str] = None self._sessiondaemon: Optional[RecurringTaskDaemon] = None self._session_lock = threading.RLock() + self._publisher = publisher @property def location(self): @@ -84,21 +95,38 @@ def create_temp_table( ddl = f"CREATE TEMP TABLE `_SESSION`.{googlesql.identifier(table_ref.table_id)} ({fields_string}){cluster_string}" - job = self.bqclient.query( - ddl, job_config=job_config, location=self.location + _, job = bfbqio.start_query_with_client( + self.bqclient, + ddl, + job_config=job_config, + location=self.location, + project=None, + timeout=None, + metrics=None, + query_with_job=True, + publisher=self._publisher, ) job.result() # return the fully qualified table, so it can be used outside of the session - return job.destination + destination = job.destination + assert destination is not None, "Failure to create temp table." + return destination def close(self): if self._sessiondaemon is not None: self._sessiondaemon.stop() if self._session_id is not None and self.bqclient is not None: - self.bqclient.query_and_wait( + bfbqio.start_query_with_client( + self.bqclient, f"CALL BQ.ABORT_SESSION('{self._session_id}')", + job_config=bigquery.QueryJobConfig(), location=self.location, + project=None, + timeout=None, + metrics=None, + query_with_job=False, + publisher=self._publisher, ) def _get_session_id(self) -> str: @@ -109,8 +137,16 @@ def _get_session_id(self) -> str: job_config = bigquery.QueryJobConfig(create_session=True) # Make sure the session is a new one, not one associated with another query. 
job_config.use_query_cache = False - query_job = self.bqclient.query( - "SELECT 1", job_config=job_config, location=self.location + _, query_job = bfbqio.start_query_with_client( + self.bqclient, + "SELECT 1", + job_config=job_config, + location=self.location, + project=None, + timeout=None, + metrics=None, + query_with_job=True, + publisher=self._publisher, ) query_job.result() # blocks until finished assert query_job.session_info is not None @@ -133,11 +169,16 @@ def _keep_session_alive(self): ] ) try: - self.bqclient.query_and_wait( + bfbqio.start_query_with_client( + self.bqclient, "SELECT 1", - location=self.location, job_config=job_config, - wait_timeout=KEEPALIVE_QUERY_TIMEOUT_SECONDS, + location=self.location, + project=None, + timeout=KEEPALIVE_QUERY_TIMEOUT_SECONDS, + metrics=None, + query_with_job=False, + publisher=self._publisher, ) except Exception as e: logging.warning("BigQuery session keep-alive query errored : %s", e) diff --git a/tests/system/small/test_bq_sessions.py b/tests/system/small/test_bq_sessions.py index 7aad19bd8f..801346600d 100644 --- a/tests/system/small/test_bq_sessions.py +++ b/tests/system/small/test_bq_sessions.py @@ -17,10 +17,10 @@ import google import google.api_core.exceptions -import google.cloud from google.cloud import bigquery import pytest +import bigframes.core.events from bigframes.session import bigquery_session TEST_SCHEMA = [ @@ -39,12 +39,14 @@ def session_resource_manager( bigquery_client, ) -> bigquery_session.SessionResourceManager: - return bigquery_session.SessionResourceManager(bigquery_client, "US") + return bigquery_session.SessionResourceManager( + bigquery_client, "US", publisher=bigframes.core.events.Publisher() + ) def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client): session_resource_manager = bigquery_session.SessionResourceManager( - bigquery_client, "US" + bigquery_client, "US", publisher=bigframes.core.events.Publisher() ) cluster_cols = ["string field", "bool field"] @@ -68,7 +70,7 @@ def test_bq_session_create_temp_table_clustered(bigquery_client: bigquery.Client def test_bq_session_create_multi_temp_tables(bigquery_client: bigquery.Client): session_resource_manager = bigquery_session.SessionResourceManager( - bigquery_client, "US" + bigquery_client, "US", publisher=bigframes.core.events.Publisher() ) def create_table():