From 992dc210d8a4ed3d1bf65e76297f0afe9c60bd89 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 7 Oct 2025 04:17:32 +0000
Subject: [PATCH 01/53] change to ai.generate

---
 bigframes/operations/blob.py              | 12 +++++++-----
 tests/system/large/blob/test_function.py  |  1 +
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py
index 1f6b75a8f5..038cc1d891 100644
--- a/bigframes/operations/blob.py
+++ b/bigframes/operations/blob.py
@@ -974,21 +974,23 @@ def audio_transcribe(
     prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio."
 
-    # Convert the audio series to the runtime representation required by the model.
-    audio_runtime = audio_series.blob._get_runtime("R", with_metadata=True)
-
+    # Use bbq.ai.generate() to transcribe audio
     transcribed_results = bbq.ai.generate(
-        prompt=(prompt_text, audio_runtime),
+        prompt=(prompt_text, audio_series),
         connection_id=connection,
         endpoint=model_name,
-        model_params={"generationConfig": {"temperature": 0.0}},
+        request_type="unspecified",
     )
 
+    transcribed_content_series = transcribed_results.struct.field("result").rename(
+        "transcribed_content"
+    )
     transcribed_content_series = transcribed_results.struct.field("result").rename(
         "transcribed_content"
     )
 
     if verbose:
+        transcribed_status_series = transcribed_results.struct.field("status")
         transcribed_status_series = transcribed_results.struct.field("status")
         results_df = bpd.DataFrame(
             {
diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py
index 7963fabd0b..2124234649 100644
--- a/tests/system/large/blob/test_function.py
+++ b/tests/system/large/blob/test_function.py
@@ -768,6 +768,7 @@ def test_blob_transcribe(
         )
         .to_pandas()
     )
+    print(actual)
 
     # check relative length
     expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress"

From 74e042a9ae286368a94840dbaee1d33dcafe673a Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Sat, 4 Oct 2025 07:19:28 +0000
Subject: [PATCH 02/53] perf: Default to interactive display for SQL in
 anywidget mode

Previously, SQL queries in anywidget mode would fall back to deferred
execution, showing a dry run instead of an interactive table.

This change modifies the display logic to directly use the anywidget
interactive display for SQL queries, providing a more consistent and
responsive user experience. A test case has been added to verify this
behavior.
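
For illustration, the intended behavior looks roughly like this (a sketch
only; it assumes a configured BigQuery session, and the query is the one
used in the new test):

    import bigframes.pandas as bpd

    bpd.options.display.repr_mode = "anywidget"
    df = bpd.read_gbq(
        "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_current` LIMIT 5"
    )
    df  # renders an interactive TableWidget instead of a dry-run summary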
---
 bigframes/dataframe.py               | 21 ++++++++++++++++++---
 tests/system/small/test_anywidget.py | 15 +++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index f016fddd83..ae284fef0e 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -783,11 +783,26 @@ def __repr__(self) -> str:
         opts = bigframes.options.display
         max_results = opts.max_rows
-        # anywdiget mode uses the same display logic as the "deferred" mode
-        # for faster execution
-        if opts.repr_mode in ("deferred", "anywidget"):
+
+        # Only deferred mode shows dry run
+        if opts.repr_mode in ("deferred"):
             return formatter.repr_query_job(self._compute_dry_run())
+        # Anywidget mode uses interactive display
+        if opts.repr_mode == "anywidget":
+            # Try to display with anywidget, fall back to deferred if not in IPython
+            try:
+                from IPython.display import display as ipython_display
+
+                from bigframes import display
+
+                widget = display.TableWidget(self.copy())
+                ipython_display(widget)
+                return ""  # Return empty string since we used display()
+            except (AttributeError, ValueError, ImportError):
+                # Not in IPython environment, fall back to deferred mode
+                return formatter.repr_query_job(self._compute_dry_run())
+
         # TODO(swast): pass max_columns and get the true column count back. Maybe
         # get 1 more column than we have requested so that pandas can add the
         # ... for us?
diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index 8944ee5365..ad16888b44 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -455,6 +455,21 @@ def test_widget_creation_should_load_css_for_rendering(table_widget):
     assert ".bigframes-widget .footer" in css_content
 
 
+def test_sql_anywidget_mode(session: bf.Session):
+    """
+    Test that a SQL query runs in anywidget mode.
+    """
+    sql = "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_current` LIMIT 5"
+
+    with bf.option_context("display.repr_mode", "anywidget"):
+        df = session.read_gbq(sql)
+        # In a real environment, this would display a widget.
+        # For testing, we just want to make sure we're in the anywidget code path.
+        # The `_repr_html_` method in anywidget mode will return an empty string
+        # and display the widget via IPython's display mechanism.
+ assert df._repr_html_() == "" + + def test_widget_row_count_should_be_immutable_after_creation( paginated_bf_df: bf.dataframe.DataFrame, ): From 074d4c20f172c1ac2f0ed76bee38bb7d02b5acf3 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 4 Oct 2025 08:44:21 +0000 Subject: [PATCH 03/53] fix: resolve double printing issue in anywidget mode --- bigframes/dataframe.py | 5 +- notebooks/dataframes/anywidget_mode.ipynb | 38 ++++++-- tests/system/small/test_anywidget.py | 105 ++-------------------- 3 files changed, 41 insertions(+), 107 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ae284fef0e..0eb53ddc03 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -792,13 +792,10 @@ def __repr__(self) -> str: if opts.repr_mode == "anywidget": # Try to display with anywidget, fall back to deferred if not in IPython try: - from IPython.display import display as ipython_display - from bigframes import display widget = display.TableWidget(self.copy()) - ipython_display(widget) - return "" # Return empty string since we used display() + return widget._repr_html_() # Return widget's HTML representation except (AttributeError, ValueError, ImportError): # Not in IPython environment, fall back to deferred mode return formatter.repr_query_job(self._compute_dry_run()) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c2af915721..88f9658009 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -73,11 +73,25 @@ "id": "f289d250", "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "071c0a905297406ba6c990cbbb8fc28d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5552452, table_html=' 0 assert ".bigframes-widget .footer" in css_content -def test_sql_anywidget_mode(session: bf.Session): +@mock.patch("bigframes.display.TableWidget") +def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): """ Test that a SQL query runs in anywidget mode. """ @@ -465,88 +456,8 @@ def test_sql_anywidget_mode(session: bf.Session): df = session.read_gbq(sql) # In a real environment, this would display a widget. # For testing, we just want to make sure we're in the anywidget code path. - # The `_repr_html_` method in anywidget mode will return an empty string - # and display the widget via IPython's display mechanism. - assert df._repr_html_() == "" - - -def test_widget_row_count_should_be_immutable_after_creation( - paginated_bf_df: bf.dataframe.DataFrame, -): - """ - Given a widget created with a specific configuration when global display - options are changed later, the widget's original row_count should remain - unchanged. - """ - from bigframes.display import TableWidget - - # Use a context manager to ensure the option is reset - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(paginated_bf_df) - initial_row_count = widget.row_count - - # Change a global option that could influence row count - bf.options.display.max_rows = 10 - - # Verify the row count remains immutable. 
- assert widget.row_count == initial_row_count - - -class FaultyIterator: - def __iter__(self): - return self - - def __next__(self): - raise ValueError("Simulated read error") - - -def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows( - paginated_bf_df: bf.dataframe.DataFrame, - monkeypatch: pytest.MonkeyPatch, -): - """ - Given an internal component fails to return valid execution data, - when the TableWidget is created, its error_message should be set and displayed. - """ - # Patch the executor's 'execute' method to simulate an error. - monkeypatch.setattr( - "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", - lambda self, *args, **kwargs: mock_execute_result_with_params( - self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs - ), - ) - - # Create the TableWidget under the error condition. - with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display import TableWidget - - # The widget should handle the faulty data from the mock without crashing. - widget = TableWidget(paginated_bf_df) - - # The widget should have an error message and display it in the HTML. - assert widget.row_count == 0 - assert widget._error_message is not None - assert "Could not determine total row count" in widget._error_message - assert widget._error_message in widget.table_html - - -def test_widget_row_count_reflects_actual_data_available( - paginated_bf_df: bf.dataframe.DataFrame, -): - """ - Test that widget row_count reflects the actual data available, - regardless of theoretical limits. - """ - from bigframes.display import TableWidget - - # Set up display options that define a page size. - with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - widget = TableWidget(paginated_bf_df) - - # The widget should report the total rows in the DataFrame, - # not limited by page_size (which only affects pagination) - assert widget.row_count == EXPECTED_ROW_COUNT - assert widget.page_size == 2 # Respects the display option + df._repr_html_() + mock_table_widget.assert_called_once() # TODO(shuowei): Add tests for custom index and multiindex From 982ea9781af00c88b19b84bc16e0de3a78dea5ef Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 7 Oct 2025 05:42:54 +0000 Subject: [PATCH 04/53] feat: Add test case for STRUCT column in anywidget Adds a test case to verify that a DataFrame with a STRUCT column is correctly displayed in anywidget mode. This test confirms that displaying a STRUCT column does not raise an exception that would trigger the fallback to the deferred representation. It mocks `IPython.display.display` to capture the `TableWidget` instance and asserts that the rendered HTML contains the expected string representation of the STRUCT data. --- tests/system/small/test_anywidget.py | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 4f82f7d81d..15e902ee16 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -460,6 +460,47 @@ def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): mock_table_widget.assert_called_once() +@mock.patch("IPython.display.display") +def test_struct_column_anywidget_mode(mock_display, session: bf.Session): + """ + Test that a DataFrame with a STRUCT column is displayed in anywidget mode + and does not fall back to the deferred representation. 
This confirms that + anywidget can handle complex types without raising an exception that would + trigger the fallback mechanism. + """ + pandas_df = pd.DataFrame( + { + "a": [1], + "b": [{"c": 2, "d": 3}], + } + ) + bf_df = session.read_pandas(pandas_df) + + with bf.option_context("display.repr_mode", "anywidget"): + with mock.patch( + "bigframes.dataframe.formatter.repr_query_job" + ) as mock_repr_query_job: + # Trigger the display logic. + result = bf_df._repr_html_() + + # Assert that we did NOT fall back to the deferred representation. + mock_repr_query_job.assert_not_called() + + # Assert that display was called with a TableWidget + mock_display.assert_called_once() + widget = mock_display.call_args[0][0] + from bigframes.display import TableWidget + + assert isinstance(widget, TableWidget) + + # Assert that the widget's html contains the struct + html = widget.table_html + assert "{'c': 2, 'd': 3}" in html + + # Assert that _repr_html_ returns an empty string + assert result == "" + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. From a9116c71f964cf5c8cec16b3249ded6faffb30ec Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 9 Oct 2025 08:25:28 +0000 Subject: [PATCH 05/53] fix presubmit --- bigframes/display/anywidget.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index a0b4f809d8..15a022a1f5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -218,16 +218,14 @@ def _set_table_html(self) -> None: start = self.page * self.page_size end = start + self.page_size - # fetch more data if the requested page is outside our cache - cached_data = self._cached_data - while len(cached_data) < end and not self._all_data_loaded: - if self._get_next_batch(): - cached_data = self._cached_data - else: - break - - # Get the data for the current page - page_data = cached_data.iloc[start:end] + # fetch more data if the requested page is outside our cache + cached_data = self._cached_data + while len(cached_data) < end and not self._all_data_loaded: + if self._get_next_batch(): + cached_data = self._cached_data + else: + break + page_data = cached_data.iloc[start:end] # Generate HTML table self.table_html = bigframes.display.html.render_html( @@ -250,8 +248,5 @@ def _page_size_changed(self, _change: Dict[str, Any]) -> None: # Reset the page to 0 when page size changes to avoid invalid page states self.page = 0 - # Reset batches to use new page size for future data fetching - self._reset_batches_for_new_page_size() - # Update the table display self._set_table_html() From f0992c693221965fe57b8ab0edba322a4ac0b303 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 19:44:37 +0000 Subject: [PATCH 06/53] Revert accidental changes to test_function.py --- tests/system/large/blob/test_function.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/system/large/blob/test_function.py b/tests/system/large/blob/test_function.py index 2124234649..7963fabd0b 100644 --- a/tests/system/large/blob/test_function.py +++ b/tests/system/large/blob/test_function.py @@ -768,7 +768,6 @@ def test_blob_transcribe( ) .to_pandas() ) - print(actual) # check relative length expected_text = "Now, as all books not primarily intended as picture-books consist principally of types composed to form letterpress" From 3aefdbfe73e3ec6bfbc611c185aafc94de8e1538 Mon Sep 17 
00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 19:46:37 +0000 Subject: [PATCH 07/53] revert accidental change to blob.py --- bigframes/operations/blob.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 038cc1d891..1f6b75a8f5 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -974,23 +974,21 @@ def audio_transcribe( prompt_text = "**Task:** Transcribe the provided audio. **Instructions:** - Your response must contain only the verbatim transcription of the audio. - Do not include any introductory text, summaries, or conversational filler in your response. The output should begin directly with the first word of the audio." - # Use bbq.ai.generate() to transcribe audio + # Convert the audio series to the runtime representation required by the model. + audio_runtime = audio_series.blob._get_runtime("R", with_metadata=True) + transcribed_results = bbq.ai.generate( - prompt=(prompt_text, audio_series), + prompt=(prompt_text, audio_runtime), connection_id=connection, endpoint=model_name, - request_type="unspecified", + model_params={"generationConfig": {"temperature": 0.0}}, ) - transcribed_content_series = transcribed_results.struct.field("result").rename( - "transcribed_content" - ) transcribed_content_series = transcribed_results.struct.field("result").rename( "transcribed_content" ) if verbose: - transcribed_status_series = transcribed_results.struct.field("status") transcribed_status_series = transcribed_results.struct.field("status") results_df = bpd.DataFrame( { From 7d4cfdfb6d677ad31245dfd4dda56ef8ff9a3fe6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 15 Oct 2025 19:54:54 +0000 Subject: [PATCH 08/53] change return type --- bigframes/dataframe.py | 39 +++++++++++++-------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0eb53ddc03..0259e94132 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,7 +23,6 @@ import re import sys import textwrap -import traceback import typing from typing import ( Any, @@ -788,18 +787,6 @@ def __repr__(self) -> str: if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) - # Anywidget mode uses interative display - if opts.repr_mode == "anywidget": - # Try to display with anywidget, fall back to deferred if not in IPython - try: - from bigframes import display - - widget = display.TableWidget(self.copy()) - return widget._repr_html_() # Return widget's HTML representation - except (AttributeError, ValueError, ImportError): - # Not in IPython environment, fall back to deferred mode - return formatter.repr_query_job(self._compute_dry_run()) - # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the # ... for us? 
@@ -863,27 +850,27 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: + import anywidget # noqa: F401 from IPython.display import display as ipython_display + import traitlets # noqa: F401 from bigframes import display - - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - - except (AttributeError, ValueError, ImportError): - # Fallback if anywidget is not available + except ImportError: warnings.warn( - "Anywidget mode is not available. " + "anywidget or its dependencies are not installed. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - f"Falling back to deferred mode. Error: {traceback.format_exc()}" + "Falling back to deferred mode." ) return formatter.repr_query_job(self._compute_dry_run()) + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() + # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the From a951810f11b3872d6b5868e37b5a56de08ff9655 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 20 Oct 2025 08:26:16 +0000 Subject: [PATCH 09/53] add todo and revert change --- bigframes/dataframe.py | 65 +++++++++++++++------------------- bigframes/display/anywidget.py | 25 ++++++++----- 2 files changed, 46 insertions(+), 44 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0259e94132..b7d1268b61 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,6 +23,7 @@ import re import sys import textwrap +import traceback import typing from typing import ( Any, @@ -782,9 +783,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - - # Only deferred mode shows dry run - if opts.repr_mode in ("deferred"): + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe @@ -850,27 +851,27 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: - import anywidget # noqa: F401 from IPython.display import display as ipython_display - import traitlets # noqa: F401 from bigframes import display - except ImportError: + + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() + + except (AttributeError, ValueError, ImportError): + # Fallback if anywidget is not available warnings.warn( - "anywidget or its dependencies are not installed. " + "Anywidget mode is not available. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - "Falling back to deferred mode." + f"Falling back to deferred mode. 
Error: {traceback.format_exc()}" ) return formatter.repr_query_job(self._compute_dry_run()) - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the @@ -2568,33 +2569,25 @@ def sort_index( ) -> None: ... + @validations.requires_index def sort_index( self, *, - axis: Union[int, str] = 0, ascending: bool = True, inplace: bool = False, na_position: Literal["first", "last"] = "last", ) -> Optional[DataFrame]: - if utils.get_axis_number(axis) == 0: - if na_position not in ["first", "last"]: - raise ValueError("Param na_position must be one of 'first' or 'last'") - na_last = na_position == "last" - index_columns = self._block.index_columns - ordering = [ - order.ascending_over(column, na_last) - if ascending - else order.descending_over(column, na_last) - for column in index_columns - ] - block = self._block.order_by(ordering) - else: # axis=1 - _, indexer = self.columns.sort_values( - return_indexer=True, ascending=ascending, na_position=na_position # type: ignore - ) - block = self._block.select_columns( - [self._block.value_columns[i] for i in indexer] - ) + if na_position not in ["first", "last"]: + raise ValueError("Param na_position must be one of 'first' or 'last'") + na_last = na_position == "last" + index_columns = self._block.index_columns + ordering = [ + order.ascending_over(column, na_last) + if ascending + else order.descending_over(column, na_last) + for column in index_columns + ] + block = self._block.order_by(ordering) if inplace: self._set_block(block) return None diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 15a022a1f5..1ed6eeb8a5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,6 +209,15 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" + # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column + # fails to convert to pandas DataFrame in anywidget environment due to + # missing handling in to_pandas_batches(). b/453561268 + # For empty dataframe, render empty table with headers. + if self.row_count == 0: + page_data = self._cached_data + else: + start = self.page * self.page_size + end = start + self.page_size if self._error_message: self.table_html = ( f"
{self._error_message}
" @@ -218,14 +227,14 @@ def _set_table_html(self) -> None: start = self.page * self.page_size end = start + self.page_size - # fetch more data if the requested page is outside our cache - cached_data = self._cached_data - while len(cached_data) < end and not self._all_data_loaded: - if self._get_next_batch(): - cached_data = self._cached_data - else: - break - page_data = cached_data.iloc[start:end] + # fetch more data if the requested page is outside our cache + cached_data = self._cached_data + while len(cached_data) < end and not self._all_data_loaded: + if self._get_next_batch(): + cached_data = self._cached_data + else: + break + page_data = cached_data.iloc[start:end] # Generate HTML table self.table_html = bigframes.display.html.render_html( From 89521d2b108492f7b7fed2c29a00b729228a6d1e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 20 Oct 2025 08:28:56 +0000 Subject: [PATCH 10/53] Revert "add todo and revert change" This reverts commit 153e1d203c273d6755623b3db30bd2256a240cc1. --- bigframes/dataframe.py | 65 +++++++++++++++++++--------------- bigframes/display/anywidget.py | 3 -- 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index b7d1268b61..0259e94132 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,7 +23,6 @@ import re import sys import textwrap -import traceback import typing from typing import ( Any, @@ -783,9 +782,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution - if opts.repr_mode in ("deferred", "anywidget"): + + # Only deferred mode shows dry run + if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe @@ -851,27 +850,27 @@ def _repr_html_(self) -> str: if opts.repr_mode == "anywidget": try: + import anywidget # noqa: F401 from IPython.display import display as ipython_display + import traitlets # noqa: F401 from bigframes import display - - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - - except (AttributeError, ValueError, ImportError): - # Fallback if anywidget is not available + except ImportError: warnings.warn( - "Anywidget mode is not available. " + "anywidget or its dependencies are not installed. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - f"Falling back to deferred mode. Error: {traceback.format_exc()}" + "Falling back to deferred mode." ) return formatter.repr_query_job(self._compute_dry_run()) + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() + # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the @@ -2569,25 +2568,33 @@ def sort_index( ) -> None: ... 
- @validations.requires_index def sort_index( self, *, + axis: Union[int, str] = 0, ascending: bool = True, inplace: bool = False, na_position: Literal["first", "last"] = "last", ) -> Optional[DataFrame]: - if na_position not in ["first", "last"]: - raise ValueError("Param na_position must be one of 'first' or 'last'") - na_last = na_position == "last" - index_columns = self._block.index_columns - ordering = [ - order.ascending_over(column, na_last) - if ascending - else order.descending_over(column, na_last) - for column in index_columns - ] - block = self._block.order_by(ordering) + if utils.get_axis_number(axis) == 0: + if na_position not in ["first", "last"]: + raise ValueError("Param na_position must be one of 'first' or 'last'") + na_last = na_position == "last" + index_columns = self._block.index_columns + ordering = [ + order.ascending_over(column, na_last) + if ascending + else order.descending_over(column, na_last) + for column in index_columns + ] + block = self._block.order_by(ordering) + else: # axis=1 + _, indexer = self.columns.sort_values( + return_indexer=True, ascending=ascending, na_position=na_position # type: ignore + ) + block = self._block.select_columns( + [self._block.value_columns[i] for i in indexer] + ) if inplace: self._set_block(block) return None diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 1ed6eeb8a5..cf5d4e6310 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,9 +209,6 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" - # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column - # fails to convert to pandas DataFrame in anywidget environment due to - # missing handling in to_pandas_batches(). b/453561268 # For empty dataframe, render empty table with headers. if self.row_count == 0: page_data = self._cached_data From 1c155d04b2fd9d0ec286e4458cb5ae758e201c1a Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 20 Oct 2025 17:12:13 +0000 Subject: [PATCH 11/53] Add todo --- bigframes/display/anywidget.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index cf5d4e6310..1ed6eeb8a5 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,6 +209,9 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" + # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column + # fails to convert to pandas DataFrame in anywidget environment due to + # missing handling in to_pandas_batches(). b/453561268 # For empty dataframe, render empty table with headers. if self.row_count == 0: page_data = self._cached_data From 86cb692d9ad30ca1cf36f3490ce5fb4c5ac8a0ec Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 01:54:37 +0000 Subject: [PATCH 12/53] Fix: Handle JSON dtype in anywidget display This commit fixes an AttributeError that occurred when displaying a DataFrame with a JSON column in anywidget mode. The dtype check was incorrect and has been updated. Additionally, the SQL compilation for casting JSON to string has been corrected to use TO_JSON_STRING. 
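
A minimal reproduction of the user-visible issue (a sketch; the JSON
literal and column name are illustrative, not taken from this change):

    import bigframes.pandas as bpd

    bpd.options.display.repr_mode = "anywidget"
    df = bpd.read_gbq("""SELECT JSON '{"a": 1}' AS json_col""")
    df  # previously raised AttributeError; now the JSON column displays as a string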
--- .../ibis_compiler/scalar_op_registry.py | 2 +- bigframes/display/anywidget.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index e983fc7e21..7b17aac61a 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1036,7 +1036,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp): if to_type == ibis_dtypes.bool: return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x) if to_type == ibis_dtypes.string: - return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x) + return to_json_string(x) # TODO: either inline this function, or push rest of this op into the function return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe) diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 1ed6eeb8a5..ff5a51f312 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -74,7 +74,21 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame): "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget." ) - self._dataframe = dataframe + super().__init__() + # Workaround for Arrow bug https://github.com/apache/arrow/issues/45262 + # JSON columns are not supported in `to_pandas_batches` and will be converted to string. + json_cols = [ + col + for col, dtype in dataframe.dtypes.items() + if dtype == bigframes.dtypes.JSON_DTYPE + ] + if json_cols: + df_copy = dataframe.copy() + for col in json_cols: + df_copy[str(col)] = df_copy[str(col)].astype("string") + self._dataframe = df_copy + else: + self._dataframe = dataframe super().__init__() @@ -209,9 +223,6 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" - # TODO (shuowei): BigFrames Series with db_dtypes.JSONArrowType column - # fails to convert to pandas DataFrame in anywidget environment due to - # missing handling in to_pandas_batches(). b/453561268 # For empty dataframe, render empty table with headers. if self.row_count == 0: page_data = self._cached_data From 81013c6133fe3beeaec2dce300b03b2165ca2d79 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:40:03 +0000 Subject: [PATCH 13/53] revert a change --- bigframes/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0259e94132..41bc4db03c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -784,7 +784,7 @@ def __repr__(self) -> str: max_results = opts.max_rows # Only deferred mode shows dry run - if opts.repr_mode in ("deferred"): + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. 
Maybe From 6ea72810f26c13e76b0a92ed1333ba1b91d6edbf Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:41:42 +0000 Subject: [PATCH 14/53] revert a change --- bigframes/dataframe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 41bc4db03c..fc60e47f7a 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,7 +783,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # Only deferred mode shows dry run + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) From 63b7918bba81abdef65a13cbd486b5f1bd5b364b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:49:38 +0000 Subject: [PATCH 15/53] Revert: Restore bigframes/dataframe.py to state from 42da847 --- bigframes/dataframe.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index fc60e47f7a..0259e94132 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,9 +783,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution - if opts.repr_mode in ("deferred", "anywidget"): + # Only deferred mode shows dry run + if opts.repr_mode in ("deferred"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe From 4aa98797c42a93da2c3d1fb89d4293886d01d120 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 02:57:02 +0000 Subject: [PATCH 16/53] remove anywidget from early return, allow execution proceeds to _repr_html_() --- bigframes/dataframe.py | 15 +++++++++++++++ bigframes/operations/output_schemas.py | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0259e94132..5ecc123417 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -863,6 +863,21 @@ def _repr_html_(self) -> str: ) return formatter.repr_query_job(self._compute_dry_run()) + # The anywidget frontend doesn't support the db_dtypes JSON type, so + # convert to strings for display. + json_cols = [ + series_name + for series_name, series in df.items() + if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) + ] + if json_cols: + warnings.warn( + "Converting JSON columns to strings for display. " + "This is temporary and will be removed when the frontend supports JSON types." 
+ ) + for col in json_cols: + df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) + # Always create a new widget instance for each display call # This ensures that each cell gets its own widget and prevents # unintended sharing between cells diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py index ff9c9883dc..2a72d4f48f 100644 --- a/bigframes/operations/output_schemas.py +++ b/bigframes/operations/output_schemas.py @@ -14,6 +14,8 @@ import pyarrow as pa +from bigframes import dtypes + def parse_sql_type(sql: str) -> pa.DataType: """ @@ -43,6 +45,9 @@ def parse_sql_type(sql: str) -> pa.DataType: if sql.upper() == "BOOL": return pa.bool_() + if sql.upper() == "JSON": + return dtypes.JSON_ARROW_TYPE + if sql.upper().startswith("ARRAY<") and sql.endswith(">"): inner_type = sql[len("ARRAY<") : -1] return pa.list_(parse_sql_type(inner_type)) From 62d8608418bdd30e931a6ccd72e24be9ce591de5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 03:13:23 +0000 Subject: [PATCH 17/53] remove unnecessary changes --- bigframes/dataframe.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5ecc123417..0dc8bc3d5f 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,8 +783,9 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # Only deferred mode shows dry run - if opts.repr_mode in ("deferred"): + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution + if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) # TODO(swast): pass max_columns and get the true column count back. Maybe From 24d766d18fdd7fc8275bed76000219486bdeb828 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 03:21:04 +0000 Subject: [PATCH 18/53] remove redundant code change --- bigframes/dataframe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 0dc8bc3d5f..4fe259639e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -23,6 +23,7 @@ import re import sys import textwrap +import traceback import typing from typing import ( Any, @@ -856,11 +857,12 @@ def _repr_html_(self) -> str: import traitlets # noqa: F401 from bigframes import display - except ImportError: + except (AttributeError, ValueError, ImportError): + # Fallback if anywidget is not available warnings.warn( - "anywidget or its dependencies are not installed. " + "Anywidget mode is not available. " "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use interactive tables. " - "Falling back to deferred mode." + f"Falling back to deferred mode. 
Error: {traceback.format_exc()}" ) return formatter.repr_query_job(self._compute_dry_run()) From 9239f20cdff25191082bdd789eccdc4ff6d6b584 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 03:54:11 +0000 Subject: [PATCH 19/53] code style change --- .../ibis_compiler/scalar_op_registry.py | 2 +- bigframes/dataframe.py | 48 +++--- notebooks/dataframes/anywidget_mode.ipynb | 160 ++++++++++++++---- 3 files changed, 153 insertions(+), 57 deletions(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 7b17aac61a..74314cd882 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -30,7 +30,7 @@ from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.ibis_compiler.default_ordering from bigframes.core.compile.ibis_compiler.scalar_op_compiler import ( - scalar_op_compiler, # TODO(tswast): avoid import of variables + scalar_op_compiler, # TODO(b/428238610): avoid import of variables ) import bigframes.core.compile.ibis_types import bigframes.operations as ops diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 4fe259639e..38500b8fb3 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -784,7 +784,7 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode + # anywidget mode uses the same display logic as the "deferred" mode # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) @@ -857,6 +857,29 @@ def _repr_html_(self) -> str: import traitlets # noqa: F401 from bigframes import display + + # The anywidget frontend doesn't support the db_dtypes JSON type, so + # convert to strings for display. + json_cols = [ + series_name + for series_name, series in df.items() + if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) + ] + if json_cols: + warnings.warn( + "Converting JSON columns to strings for display. " + "This is temporary and will be removed when the frontend supports JSON types." + ) + for col in json_cols: + df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) + + # Always create a new widget instance for each display call + # This ensures that each cell gets its own widget and prevents + # unintended sharing between cells + widget = display.TableWidget(df.copy()) + + ipython_display(widget) + return "" # Return empty string since we used display() except (AttributeError, ValueError, ImportError): # Fallback if anywidget is not available warnings.warn( @@ -866,29 +889,6 @@ def _repr_html_(self) -> str: ) return formatter.repr_query_job(self._compute_dry_run()) - # The anywidget frontend doesn't support the db_dtypes JSON type, so - # convert to strings for display. - json_cols = [ - series_name - for series_name, series in df.items() - if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) - ] - if json_cols: - warnings.warn( - "Converting JSON columns to strings for display. " - "This is temporary and will be removed when the frontend supports JSON types." 
- ) - for col in json_cols: - df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) - - # Always create a new widget instance for each display call - # This ensures that each cell gets its own widget and prevents - # unintended sharing between cells - widget = display.TableWidget(df.copy()) - - ipython_display(widget) - return "" # Return empty string since we used display() - # Continue with regular HTML rendering for non-anywidget modes # TODO(swast): pass max_columns and get the true column count back. Maybe # get 1 more column than we have requested so that pandas can add the diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 88f9658009..903d88b210 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -73,25 +73,11 @@ "id": "f289d250", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "071c0a905297406ba6c990cbbb8fc28d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5552452, table_html='
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:869: UserWarning: Converting JSON columns to strings for display. This is temporary and will be removed when the frontend supports JSON types.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "✅ Completed. \n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ab607cc7263f4a159ecfe63682c5e651", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\\n\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\\n\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\\n\n", + " *\\n\n", + "FROM `bigquery-public-data.labeled_patents.extracted_data`\\n\n", + "LIMIT 5;\\n\n", + "\"\"\")" + ] } ], "metadata": { "kernelspec": { + "display_name": "3.10.18", "display_name": "3.10.18", "language": "python", "name": "python3" @@ -368,6 +463,7 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" + "version": "3.10.18" } }, "nbformat": 4, From 48d6c665c072237bc61aa7d705663bfe0aa4ddb8 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 04:07:52 +0000 Subject: [PATCH 20/53] tescase update --- tests/system/small/test_anywidget.py | 68 ++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 15e902ee16..40804e1853 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -24,7 +24,6 @@ # Test constants to avoid change detector tests EXPECTED_ROW_COUNT = 6 EXPECTED_PAGE_SIZE = 2 -EXPECTED_TOTAL_PAGES = 3 @pytest.fixture(scope="module") @@ -112,21 +111,19 @@ def empty_bf_df( return session.read_pandas(empty_pandas_df) -def mock_execute_result_with_params( - self, schema, total_rows_val, arrow_batches_val, *args, **kwargs -): - """ - Mocks an execution result with configurable total_rows and arrow_batches. - """ - from bigframes.session.executor import ExecuteResult +@pytest.fixture(scope="module") +def json_df(session: bf.Session) -> bf.dataframe.DataFrame: + """Create a DataFrame with a JSON column for testing.""" + import bigframes.dtypes - return ExecuteResult( - iter(arrow_batches_val), - schema=schema, - query_job=None, - total_bytes=None, - total_rows=total_rows_val, + pandas_df = pd.DataFrame( + { + "a": [1], + "b": ['{"c": 2, "d": 3}'], + } ) + pandas_df["b"] = pandas_df["b"].astype(bigframes.dtypes.JSON_DTYPE) + return session.read_pandas(pandas_df) def _assert_html_matches_pandas_slice( @@ -438,12 +435,6 @@ def test_setting_page_size_above_max_should_be_clamped(table_widget): # The page size is clamped to the maximum. assert table_widget.page_size == expected_clamped_size - """ - Test that the widget's CSS is loaded correctly. - """ - css_content = table_widget._css - assert ".bigframes-widget .footer" in css_content - @mock.patch("bigframes.display.TableWidget") def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): @@ -501,6 +492,43 @@ def test_struct_column_anywidget_mode(mock_display, session: bf.Session): assert result == "" +def test_widget_creation_should_load_css_for_rendering(table_widget): + """ + Test that the widget's CSS is loaded correctly. + """ + css_content = table_widget._css + assert ".bigframes-widget .footer" in css_content + + +@mock.patch("IPython.display.display") +def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFrame): + """ + Test that a DataFrame with a JSON column is displayed in anywidget mode + by converting JSON to string, and does not fall back to deferred representation. 
+ """ + with bf.option_context("display.repr_mode", "anywidget"): + with mock.patch( + "bigframes.dataframe.formatter.repr_query_job" + ) as mock_repr_query_job: + result = json_df._repr_html_() + + # Assert no fallback + mock_repr_query_job.assert_not_called() + + # Assert TableWidget was created and displayed + mock_display.assert_called_once() + widget = mock_display.call_args[0][0] + from bigframes.display import TableWidget + + assert isinstance(widget, TableWidget) + + # Assert JSON was converted to string in the HTML + html = widget.table_html + assert "{"c":2,"d":3}" in html + + assert result == "" + + # TODO(shuowei): Add tests for custom index and multiindex # This may not be necessary for the SQL Cell use case but should be # considered for completeness. From 4cb8cd22a6c93342d599f2e976ec05aa92b42302 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 04:15:35 +0000 Subject: [PATCH 21/53] revert a change --- bigframes/core/compile/ibis_compiler/scalar_op_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index 74314cd882..7b17aac61a 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -30,7 +30,7 @@ from bigframes.core.compile.constants import UNIT_TO_US_CONVERSION_FACTORS import bigframes.core.compile.ibis_compiler.default_ordering from bigframes.core.compile.ibis_compiler.scalar_op_compiler import ( - scalar_op_compiler, # TODO(b/428238610): avoid import of variables + scalar_op_compiler, # TODO(tswast): avoid import of variables ) import bigframes.core.compile.ibis_types import bigframes.operations as ops From 75a6d68e3e4c4c6474f1aaef2e257b6a0e0d1cf3 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 04:24:10 +0000 Subject: [PATCH 22/53] final touch of notebook --- notebooks/dataframes/anywidget_mode.ipynb | 28 +++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 903d88b210..23be36701d 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "25b38c1408434091865f4bf9525dd069", + "model_id": "f0ed74d739b64a56a6e3750968b155e1", "version_major": 2, "version_minor": 0 }, @@ -217,7 +217,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "cb4f246802a1407cb966321d8724ea27", + "model_id": "fd00566103744c189a52033df9c9db7a", "version_major": 2, "version_minor": 0 }, @@ -330,7 +330,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "5b48c05254224c4dbce56f2793d6b661", + "model_id": "2233934e95b84a87b01b9352ca36346d", "version_major": 2, "version_minor": 0 }, @@ -369,7 +369,7 @@ "data": { "text/html": [ "✅ Completed. 
\n", - " Query processed 85.9 kB in 14 seconds of slot time.\n", + " Query processed 85.9 kB in 11 seconds of slot time.\n", " " ], "text/plain": [ @@ -408,7 +408,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "ab607cc7263f4a159ecfe63682c5e651", + "model_id": "3e3e09d7adee4bcaa5b3540603c2418a", "version_major": 2, "version_minor": 0 }, @@ -432,15 +432,15 @@ } ], "source": [ - "bpd._read_gbq_colab(\"\"\"\\n\n", - "SELECT\\n\n", - " AI.GENERATE(\\n\n", - " prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\\n\n", - " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\\n\n", - " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\\n\n", - " *\\n\n", - "FROM `bigquery-public-data.labeled_patents.extracted_data`\\n\n", - "LIMIT 5;\\n\n", + "bpd._read_gbq_colab(\"\"\"\n", + " SELECT\n", + " AI.GENERATE(\n", + " prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", + " *\n", + " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + " LIMIT 5;\n", "\"\"\")" ] } From 8dc2171ee13b43b7d9a776fae960f0c27e3b03dd Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 21 Oct 2025 17:21:16 +0000 Subject: [PATCH 23/53] fix presumbit error --- bigframes/operations/output_schemas.py | 5 ---- notebooks/dataframes/anywidget_mode.ipynb | 34 +++++++++++------------ tests/system/small/test_anywidget.py | 22 +++++++-------- 3 files changed, 27 insertions(+), 34 deletions(-) diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py index 2a72d4f48f..ff9c9883dc 100644 --- a/bigframes/operations/output_schemas.py +++ b/bigframes/operations/output_schemas.py @@ -14,8 +14,6 @@ import pyarrow as pa -from bigframes import dtypes - def parse_sql_type(sql: str) -> pa.DataType: """ @@ -45,9 +43,6 @@ def parse_sql_type(sql: str) -> pa.DataType: if sql.upper() == "BOOL": return pa.bool_() - if sql.upper() == "JSON": - return dtypes.JSON_ARROW_TYPE - if sql.upper().startswith("ARRAY<") and sql.endswith(">"): inner_type = sql[len("ARRAY<") : -1] return pa.list_(parse_sql_type(inner_type)) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 23be36701d..154afea7e1 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 6, + 
"execution_count": 7, "id": "ce250157", "metadata": {}, "outputs": [ @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f0ed74d739b64a56a6e3750968b155e1", + "model_id": "6e46f6d1352043a4baee57fa089f2b0c", "version_major": 2, "version_minor": 0 }, @@ -160,7 +160,7 @@ "Computation deferred. Computation will process 171.4 MB" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "6920d49b", "metadata": {}, "outputs": [ @@ -217,7 +217,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "fd00566103744c189a52033df9c9db7a", + "model_id": "88d370b617b545809eb7bb8e5c66ea0e", "version_major": 2, "version_minor": 0 }, @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "12b68f15", "metadata": {}, "outputs": [ @@ -288,7 +288,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "a9d5d13a", "metadata": {}, "outputs": [ @@ -330,7 +330,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "2233934e95b84a87b01b9352ca36346d", + "model_id": "dec19e8788b74219b88bccfc65e3b9c0", "version_major": 2, "version_minor": 0 }, @@ -361,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "added-cell-1", "metadata": {}, "outputs": [ @@ -369,7 +369,7 @@ "data": { "text/html": [ "✅ Completed. \n", - " Query processed 85.9 kB in 11 seconds of slot time.\n", + " Query processed 85.9 kB in 21 seconds of slot time.\n", " " ], "text/plain": [ @@ -408,7 +408,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3e3e09d7adee4bcaa5b3540603c2418a", + "model_id": "774357b4083c47c8a5e1fd33bb6af188", "version_major": 2, "version_minor": 0 }, @@ -426,7 +426,7 @@ "Computation deferred. Computation will process 0 Bytes" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 40804e1853..890d591de5 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -62,8 +62,7 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame): Helper fixture to create a TableWidget instance with a fixed page size. This reduces duplication across tests that use the same widget configuration. """ - - from bigframes.display import TableWidget + from bigframes.display.anywidget import TableWidget with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): # Delay context manager cleanup of `max_rows` until after tests finish. 
@@ -92,7 +91,7 @@ def small_bf_df(
 
 @pytest.fixture
 def small_widget(small_bf_df):
     """Helper fixture for tests using a DataFrame smaller than the page size."""
-    from bigframes.display import TableWidget
+    from bigframes.display.anywidget import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5):
         yield TableWidget(small_bf_df)
 
@@ -152,10 +151,10 @@ def test_widget_initialization_should_calculate_total_row_count(
     paginated_bf_df: bf.dataframe.DataFrame,
 ):
     """A TableWidget should correctly calculate the total row count on creation."""
-    from bigframes.display import TableWidget
+    from bigframes.display.anywidget import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
         widget = TableWidget(paginated_bf_df)
 
     assert widget.row_count == EXPECTED_ROW_COUNT
 
@@ -266,7 +266,7 @@ def test_widget_pagination_should_work_with_custom_page_size(
     A widget should paginate correctly with a custom page size of 3.
     """
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 3):
-        from bigframes.display import TableWidget
+        from bigframes.display.anywidget import TableWidget
 
         widget = TableWidget(paginated_bf_df)
         assert widget.page_size == 3
@@ -312,7 +312,7 @@ def test_widget_page_size_should_be_immutable_after_creation(
     by subsequent changes to global options.
     """
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
-        from bigframes.display import TableWidget
+        from bigframes.display.anywidget import TableWidget
 
         widget = TableWidget(paginated_bf_df)
         assert widget.page_size == 2
@@ -331,7 +331,7 @@ def test_widget_page_size_should_be_immutable_after_creation(
 def test_empty_widget_should_have_zero_row_count(empty_bf_df: bf.dataframe.DataFrame):
     """Given an empty DataFrame, the widget's row count should be 0."""
     with bf.option_context("display.repr_mode", "anywidget"):
-        from bigframes.display import TableWidget
+        from bigframes.display.anywidget import TableWidget
 
         widget = TableWidget(empty_bf_df)
 
@@ -341,7 +341,7 @@ def test_empty_widget_should_have_zero_row_count(empty_bf_df: bf.dataframe.DataF
 def test_empty_widget_should_render_table_headers(empty_bf_df: bf.dataframe.DataFrame):
     """Given an empty DataFrame, the widget should still render table headers."""
     with bf.option_context("display.repr_mode", "anywidget"):
-        from bigframes.display import TableWidget
+        from bigframes.display.anywidget import TableWidget
 
         widget = TableWidget(empty_bf_df)
 
@@ -477,10 +477,8 @@ def test_struct_column_anywidget_mode(mock_display, session: bf.Session):
 
     # Assert that we did NOT fall back to the deferred representation.
mock_repr_query_job.assert_not_called()
 
-    # Assert that display was called with a TableWidget
-    mock_display.assert_called_once()
     widget = mock_display.call_args[0][0]
-    from bigframes.display import TableWidget
+    from bigframes.display.anywidget import TableWidget
 
     assert isinstance(widget, TableWidget)
 
@@ -518,7 +516,7 @@ def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFram
     # Assert TableWidget was created and displayed
     mock_display.assert_called_once()
     widget = mock_display.call_args[0][0]
-    from bigframes.display import TableWidget
+    from bigframes.display.anywidget import TableWidget
 
     assert isinstance(widget, TableWidget)
 

From 2adc426e9b97ea49397a1ce19ec30ca304af4410 Mon Sep 17 00:00:00 2001
From: Shuowei Li 
Date: Tue, 21 Oct 2025 17:43:03 +0000
Subject: [PATCH 24/53] remove invalid test with anywidget bug fix

---
 tests/system/small/test_series.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py
index 5ace3f54d8..63c2f6c498 100644
--- a/tests/system/small/test_series.py
+++ b/tests/system/small/test_series.py
@@ -4077,7 +4077,6 @@ def test_json_astype_others(data, to_type, errors):
         pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"),
         pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"),
         pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"),
-        pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"),
     ],
 )
 def test_json_astype_others_raise_error(data, to_type):

From faf1bb2d4f7123084a0ba0d09d5414c26fa02a11 Mon Sep 17 00:00:00 2001
From: Shuowei Li 
Date: Tue, 21 Oct 2025 22:33:00 +0000
Subject: [PATCH 25/53] fix presubmit

---
 bigframes/series.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/bigframes/series.py b/bigframes/series.py
index ad1f091803..e90a360418 100644
--- a/bigframes/series.py
+++ b/bigframes/series.py
@@ -609,6 +609,18 @@ def astype(
         if errors not in ["raise", "null"]:
             raise ValueError("Argument 'errors' must be one of 'raise' or 'null'")
         dtype = bigframes.dtypes.bigframes_type(dtype)
+
+        # BigQuery doesn't support CAST(json_col AS STRING), but it does support
+        # TO_JSON_STRING(json_col).
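+        # Illustrative example (assumed usage):
+        #   bpd.Series(['{"a": 1}'], dtype="json").astype("string")
+        # compiles to TO_JSON_STRING(col) rather than CAST(col AS STRING).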
+        if (
+            self.dtype == bigframes.dtypes.JSON_DTYPE
+            and dtype == bigframes.dtypes.STRING_DTYPE
+        ):
+            return self._apply_unary_op(ops.json_ops.ToJSONString())
+
         return self._apply_unary_op(
             bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null"))
         )

From 7a83b804f27dd4216f90c21bf13885958beec924 Mon Sep 17 00:00:00 2001
From: Shuowei Li 
Date: Tue, 21 Oct 2025 23:42:19 +0000
Subject: [PATCH 26/53] fix polars compiler

---
 bigframes/core/compile/polars/compiler.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py
index acaf1b8f22..1a55cef63a 100644
--- a/bigframes/core/compile/polars/compiler.py
+++ b/bigframes/core/compile/polars/compiler.py
@@ -407,6 +407,19 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
         assert isinstance(op, json_ops.JSONDecode)
         return input.str.json_decode(_DTYPE_MAPPING[op.to_type])
 
+    @compile_op.register(json_ops.ToJSONString)
+    def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
+        return input.str.json_decode(pl.String())
+
+    @compile_op.register(json_ops.ParseJSON)
+    def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
+        return input.str.json_decode(pl.String())
+
+    @compile_op.register(json_ops.JSONExtract)
+    def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr:
+        assert isinstance(op, json_ops.JSONExtract)
+        return input.str.json_extract(json_path=op.json_path)
+
     @compile_op.register(arr_ops.ToArrayOp)
     def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr:
         return pl.concat_list(*inputs)

From 233e857acfeb1d8fdfc47e90391ccc555054272e Mon Sep 17 00:00:00 2001
From: Shuowei Li 
Date: Tue, 21 Oct 2025 23:49:03 +0000
Subject: [PATCH 27/53] Revert an unnecessary change

---
 bigframes/operations/output_schemas.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py
index ff9c9883dc..2a72d4f48f 100644
--- a/bigframes/operations/output_schemas.py
+++ b/bigframes/operations/output_schemas.py
@@ -14,6 +14,8 @@
 
 import pyarrow as pa
 
+from bigframes import dtypes
+
 
 def parse_sql_type(sql: str) -> pa.DataType:
     """
@@ -43,6 +45,9 @@ def parse_sql_type(sql: str) -> pa.DataType:
     if sql.upper() == "BOOL":
         return pa.bool_()
 
+    if sql.upper() == "JSON":
+        return dtypes.JSON_ARROW_TYPE
+
     if sql.upper().startswith("ARRAY<") and sql.endswith(">"):
         inner_type = sql[len("ARRAY<") : -1]
         return pa.list_(parse_sql_type(inner_type))

From 11daddb7ebb22e6544dfd4fb2572b4c7b630ff00 Mon Sep 17 00:00:00 2001
From: Shuowei Li 
Date: Mon, 27 Oct 2025 22:27:23 +0000
Subject: [PATCH 28/53] apply the workaround to the I/O layer

---
 bigframes/core/compile/polars/compiler.py | 44 +-
 bigframes/dataframe.py | 17 +-
 tests/system/small/test_dataframe.py | 6151 +--------------------
 3 files changed, 62 insertions(+), 6150 deletions(-)

diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py
index 1a55cef63a..681ca37da7 100644
--- a/bigframes/core/compile/polars/compiler.py
+++ b/bigframes/core/compile/polars/compiler.py
@@ -45,13 +45,15 @@
 polars_installed = True
 if TYPE_CHECKING:
     import polars as pl
+    import pyarrow as pa
 else:
     try:
         import bigframes._importing
 
-        # Use import_polars() instead of importing directly so that we check
-        # the version numbers.
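+        # Use import_polars() so the version check still runs; pyarrow is
+        # needed at runtime by the JSON workaround in compile_readlocal below.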
pl = bigframes._importing.import_polars() + import pyarrow as pa except Exception: polars_installed = False @@ -409,11 +409,13 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.ToJSONString) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.str.json_decode(pl.String()) + # Convert JSON to string representation + return input.cast(pl.String()) @compile_op.register(json_ops.ParseJSON) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - return input.str.json_decode(pl.String()) + # Parse string as JSON - this should decode, not encode + return input.str.json_decode() @compile_op.register(json_ops.JSONExtract) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @@ -599,9 +601,35 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items } - lazy_frame = cast( - pl.DataFrame, pl.from_arrow(node.local_data_source.data) - ).lazy() + + # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 + # Convert JSON columns to strings before Polars processing + arrow_data = node.local_data_source.data + schema = arrow_data.schema + + # Check if any columns are JSON type + json_field_indices = [ + i + for i, field in enumerate(schema) + if pa.types.is_extension_type(field.type) + and field.type.extension_name == "google:sqlType:json" + ] + + if json_field_indices: + # Convert JSON columns to string columns + new_arrays = [] + new_fields = [] + for i, field in enumerate(schema): + if i in json_field_indices: + # Cast JSON to string + new_arrays.append(arrow_data.column(i).cast(pa.string())) + new_fields.append(pa.field(field.name, pa.string())) + else: + new_arrays.append(arrow_data.column(i)) + new_fields.append(field) + arrow_data = pa.table(new_arrays, schema=pa.schema(new_fields)) + + lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_data)).lazy() lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) if node.offsets_col: lazy_frame = lazy_frame.with_columns( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 38500b8fb3..788a47f38b 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1965,7 +1965,22 @@ def _to_pandas_batches( *, allow_large_results: Optional[bool] = None, ) -> blocks.PandasBatches: - return self._block.to_pandas_batches( + # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 + # JSON columns are not supported in to_pandas_batches + json_cols = [ + str(col_name) # Cast to string + for col_name, dtype in self.dtypes.items() + if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype) + ] + + df = self + if json_cols: + # Convert JSON columns to strings before materialization + df = df.copy() + for col in json_cols: + df[col] = df[col].astype("string") + + return df._block.to_pandas_batches( page_size=page_size, max_results=max_results, allow_large_results=allow_large_results, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 79f8efd00f..ffd9bc512b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1,6144 +1,11 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +def test_to_pandas_batches_with_json_columns(session): + """Test that JSON columns are properly handled in to_pandas_batches.""" + # Create a DataFrame with JSON column + df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') -import io -import operator -import sys -import tempfile -import typing -from typing import Dict, List, Tuple + # This should not raise an error + batches = df._to_pandas_batches(page_size=10) + result = next(batches) -import geopandas as gpd # type: ignore -import numpy as np -import pandas as pd -import pandas.testing -import pyarrow as pa # type: ignore -import pytest - -import bigframes -import bigframes._config.display_options as display_options -import bigframes.core.indexes as bf_indexes -import bigframes.dataframe as dataframe -import bigframes.dtypes as dtypes -import bigframes.pandas as bpd -import bigframes.series as series -from bigframes.testing.utils import ( - assert_dfs_equivalent, - assert_pandas_df_equal, - assert_series_equal, - assert_series_equivalent, -) - - -def test_df_construct_copy(scalars_dfs): - columns = ["int64_col", "string_col", "float64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - # Make the mapping from label to col_id non-trivial - bf_df = scalars_df.copy() - bf_df["int64_col"] = bf_df["int64_col"] / 2 - pd_df = scalars_pandas_df.copy() - pd_df["int64_col"] = pd_df["int64_col"] / 2 - - bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() - - pd_result = pd.DataFrame(pd_df, columns=columns) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_construct_pandas_default(scalars_dfs): - # This should trigger the inlined codepath - columns = [ - "int64_too", - "int64_col", - "float64_col", - "bool_col", - "string_col", - "date_col", - "datetime_col", - "numeric_col", - "float64_col", - "time_col", - "timestamp_col", - ] - _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("write_engine"), - [ - ("bigquery_inline"), - ("bigquery_load"), - ("bigquery_streaming"), - ("bigquery_write"), - ], -) -def test_read_pandas_all_nice_types( - session: bigframes.Session, scalars_pandas_df_index: pd.DataFrame, write_engine -): - bf_result = session.read_pandas( - scalars_pandas_df_index, write_engine=write_engine - ).to_pandas() - pandas.testing.assert_frame_equal(bf_result, scalars_pandas_df_index) - - -def test_df_construct_large_strings(): - data = [["hello", "w" + "o" * 50000 + "rld"]] - bf_result = dataframe.DataFrame(data).to_pandas() - pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow")) - pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_df_construct_pandas_load_job(scalars_dfs_maybe_ordered): - # This should trigger the inlined codepath - columns = [ - "int64_too", - "int64_col", - "float64_col", - "bool_col", - "string_col", - "date_col", - "datetime_col", - "numeric_col", - "float64_col", - 
"time_col", - "timestamp_col", - "geography_col", - ] - _, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns) - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) - assert_dfs_equivalent(pd_result, bf_result) - - -def test_df_construct_structs(session): - pd_frame = pd.Series( - [ - {"version": 1, "project": "pandas"}, - {"version": 2, "project": "pandas"}, - {"version": 1, "project": "numpy"}, - ] - ).to_frame() - bf_series = session.read_pandas(pd_frame) - pd.testing.assert_frame_equal( - bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False - ) - - -def test_df_construct_local_concat_pd(scalars_pandas_df_index, session): - pd_df = pd.concat([scalars_pandas_df_index, scalars_pandas_df_index]) - - bf_df = session.read_pandas(pd_df) - - pd.testing.assert_frame_equal( - bf_df.to_pandas(), pd_df, check_index_type=False, check_dtype=False - ) - - -def test_df_construct_pandas_set_dtype(scalars_dfs): - columns = [ - "int64_too", - "int64_col", - "float64_col", - "bool_col", - ] - _, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame( - scalars_pandas_df, columns=columns, dtype="Float64" - ).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64") - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_construct_from_series(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - bf_result = dataframe.DataFrame( - {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]}, - dtype="string[pyarrow]", - ) - pd_result = pd.DataFrame( - {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]}, - dtype="string[pyarrow]", - ) - assert_dfs_equivalent(pd_result, bf_result) - - -def test_df_construct_from_dict(): - input_dict = { - "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], - # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. b/296751058 - "Max Speed": [380.0, 370.0, 24.0, 26.0], - } - bf_result = dataframe.DataFrame(input_dict).to_pandas() - pd_result = pd.DataFrame(input_dict) - - pandas.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("json_type"), - [ - pytest.param(dtypes.JSON_DTYPE), - pytest.param("json"), - ], -) -def test_df_construct_w_json_dtype(json_type): - data = [ - "1", - "false", - '["a", {"b": 1}, null]', - None, - ] - df = dataframe.DataFrame({"json_col": data}, dtype=json_type) - - assert df["json_col"].dtype == dtypes.JSON_DTYPE - assert df["json_col"][1] == "false" - - -def test_df_construct_inline_respects_location(reset_default_session_and_location): - # Note: This starts a thread-local session. 
- with bpd.option_context("bigquery.location", "europe-west1"): - df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) - df.to_gbq() - assert df.query_job is not None - table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) - - assert table.location == "europe-west1" - - -def test_df_construct_dtype(): - data = { - "int_col": [1, 2, 3], - "string_col": ["1.1", "2.0", "3.5"], - "float_col": [1.0, 2.0, 3.0], - } - dtype = pd.StringDtype(storage="pyarrow") - bf_result = dataframe.DataFrame(data, dtype=dtype) - pd_result = pd.DataFrame(data, dtype=dtype) - pd_result.index = pd_result.index.astype("Int64") - pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) - - -def test_get_column(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - series = scalars_df[col_name] - bf_result = series.to_pandas() - pd_result = scalars_pandas_df[col_name] - assert_series_equal(bf_result, pd_result) - - -def test_get_column_nonstring(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] - bf_result = series.to_pandas() - pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] - assert_series_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - "row_slice", - [ - (slice(1, 7, 2)), - (slice(1, 7, None)), - (slice(None, -3, None)), - ], -) -def test_get_rows_with_slice(scalars_dfs, row_slice): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[row_slice].to_pandas() - pd_result = scalars_pandas_df[row_slice] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_hasattr(scalars_dfs): - scalars_df, _ = scalars_dfs - assert hasattr(scalars_df, "int64_col") - assert hasattr(scalars_df, "head") - assert not hasattr(scalars_df, "not_exist") - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_head_with_custom_column_labels( - scalars_df_index, scalars_pandas_df_index, ordered -): - rename_mapping = { - "int64_col": "Integer Column", - "string_col": "言語列", - } - bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) - bf_result = bf_df.to_pandas(ordered=ordered) - pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): - rename_mapping = { - "int64_col": "Integer Column", - "string_col": "言語列", - } - bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) - bf_result = bf_df.to_pandas() - pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - ("all",), - ], -) -def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nlargest(3, ["bool_col", "int64_too"], keep=keep) - pd_result = scalars_pandas_df_index.nlargest( - 3, ["bool_col", "int64_too"], keep=keep - ) - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - ("all",), - ], -) -def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): - bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) - pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, 
- ) - - -def test_get_column_by_attr(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - series = scalars_df.int64_col - bf_result = series.to_pandas() - pd_result = scalars_pandas_df.int64_col - assert_series_equal(bf_result, pd_result) - - -def test_get_columns(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["bool_col", "float64_col", "int64_col"] - df_subset = scalars_df.get(col_names) - df_pandas = df_subset.to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df[col_names].columns - ) - - -def test_get_columns_default(scalars_dfs): - scalars_df, _ = scalars_dfs - col_names = ["not", "column", "names"] - result = scalars_df.get(col_names, "default_val") - assert result == "default_val" - - -@pytest.mark.parametrize( - ("loc", "column", "value", "allow_duplicates"), - [ - (0, 666, 2, False), - (5, "float64_col", 2.2, True), - (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), - pytest.param( - 14, - "test", - 2, - False, - marks=pytest.mark.xfail( - raises=IndexError, - ), - ), - pytest.param( - 12, - "int64_col", - 2, - False, - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], -) -def test_insert(scalars_dfs, loc, column, value, allow_duplicates): - scalars_df, scalars_pandas_df = scalars_dfs - # insert works inplace, so will influence other tests. - # make a copy to avoid inplace changes. - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.insert(loc, column, value, allow_duplicates) - pd_df.insert(loc, column, value, allow_duplicates) - - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) - - -def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): - cond_bf = scalars_df_index["int64_col"] > 0 - cond_pd = scalars_pandas_df_index["int64_col"] > 0 - - bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] - bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() - pd_result = pd_df.mask(cond_pd, pd_df + 1) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_mask_callable(scalars_df_index, scalars_pandas_df_index): - def is_positive(x): - return x > 0 - - bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] - bf_result = bf_df.mask(cond=is_positive, other=lambda x: x + 1).to_pandas() - pd_result = pd_df.mask(cond=is_positive, other=lambda x: x + 1) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_multi_column(scalars_df_index, scalars_pandas_df_index): - # Test when a dataframe has multi-columns. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - - dataframe_bf.columns = pd.MultiIndex.from_tuples( - [("str1", 1), ("str2", 2)], names=["STR", "INT"] - ) - cond_bf = dataframe_bf["str1"] > 0 - - with pytest.raises(NotImplementedError) as context: - dataframe_bf.where(cond_bf).to_pandas() - assert ( - str(context.value) - == "The dataframe.where() method does not support multi-column." - ) - - -def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): - # Condition is dataframe, other is None (as default). 
- cond_bf = scalars_df_index["int64_col"] > 0 - cond_pd = scalars_pandas_df_index["int64_col"] > 0 - bf_result = scalars_df_index.where(cond_bf).to_pandas() - pd_result = scalars_pandas_df_index.where(cond_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_series_cond_const_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a series, other is a constant. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - dataframe_bf.columns.name = "test_name" - dataframe_pd.columns.name = "test_name" - - cond_bf = dataframe_bf["int64_col"] > 0 - cond_pd = dataframe_pd["int64_col"] > 0 - other = 0 - - bf_result = dataframe_bf.where(cond_bf, other).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_series_cond_dataframe_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a series, other is a dataframe. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf["int64_col"] > 0 - cond_pd = dataframe_pd["int64_col"] > 0 - other_bf = -dataframe_bf - other_pd = -dataframe_pd - - bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond(scalars_df_index, scalars_pandas_df_index): - # Condition is a dataframe, other is None. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - - bf_result = dataframe_bf.where(cond_bf, None).to_pandas() - pd_result = dataframe_pd.where(cond_pd, None) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond_const_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a dataframe, other is a constant. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - other_bf = 10 - other_pd = 10 - - bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond_dataframe_other( - scalars_df_index, scalars_pandas_df_index -): - # Condition is a dataframe, other is a dataframe. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - other_bf = dataframe_bf * 2 - other_pd = dataframe_pd * 2 - - bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() - pd_result = dataframe_pd.where(cond_pd, other_pd) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_callable_cond_constant_other(scalars_df_index, scalars_pandas_df_index): - # Condition is callable, other is a constant. 
- columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - other = 10 - - bf_result = dataframe_bf.where(lambda x: x > 0, other).to_pandas() - pd_result = dataframe_pd.where(lambda x: x > 0, other) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_dataframe_cond_callable_other(scalars_df_index, scalars_pandas_df_index): - # Condition is a dataframe, other is callable. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - cond_bf = dataframe_bf > 0 - cond_pd = dataframe_pd > 0 - - def func(x): - return x * 2 - - bf_result = dataframe_bf.where(cond_bf, func).to_pandas() - pd_result = dataframe_pd.where(cond_pd, func) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_callable_cond_callable_other(scalars_df_index, scalars_pandas_df_index): - # Condition is callable, other is callable too. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - dataframe_pd = scalars_pandas_df_index[columns] - - def func(x): - return x["int64_col"] > 0 - - bf_result = dataframe_bf.where(func, lambda x: x * 2).to_pandas() - pd_result = dataframe_pd.where(func, lambda x: x * 2) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_where_series_other(scalars_df_index): - # When other is a series, throw an error. - columns = ["int64_col", "float64_col"] - dataframe_bf = scalars_df_index[columns] - - with pytest.raises( - ValueError, - match="Seires is not a supported replacement type!", - ): - dataframe_bf.where(dataframe_bf > 0, dataframe_bf["int64_col"]) - - -def test_drop_column(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name = "int64_col" - df_pandas = scalars_df.drop(columns=col_name).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns - ) - - -def test_drop_columns(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = ["int64_col", "geography_col", "time_col"] - df_pandas = scalars_df.drop(columns=col_names).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns - ) - - -def test_drop_labels_axis_1(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - labels = ["int64_col", "geography_col", "time_col"] - - pd_result = scalars_pandas_df.drop(labels=labels, axis=1) - bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_with_custom_column_labels(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - rename_mapping = { - "int64_col": "Integer Column", - "string_col": "言語列", - } - dropped_columns = [ - "言語列", - "timestamp_col", - ] - bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) - bf_result = bf_df.to_pandas() - pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( - columns=dropped_columns - ) - assert_pandas_df_equal(bf_result, pd_result) - - -def test_df_memory_usage(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.memory_usage() - bf_result = scalars_df.memory_usage() - - pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) - - -def test_df_info(scalars_dfs): - expected = ( - "\n" - "Index: 9 entries, 0 to 8\n" - "Data columns (total 14 columns):\n" - " # Column Non-Null Count Dtype\n" 
- "--- ------------- ---------------- ------------------------------\n" - " 0 bool_col 8 non-null boolean\n" - " 1 bytes_col 6 non-null binary[pyarrow]\n" - " 2 date_col 7 non-null date32[day][pyarrow]\n" - " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" - " 4 geography_col 4 non-null geometry\n" - " 5 int64_col 8 non-null Int64\n" - " 6 int64_too 9 non-null Int64\n" - " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" - " 8 float64_col 7 non-null Float64\n" - " 9 rowindex_2 9 non-null Int64\n" - " 10 string_col 8 non-null string\n" - " 11 time_col 6 non-null time64[us][pyarrow]\n" - " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" - " 13 duration_col 7 non-null duration[us][pyarrow]\n" - "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" - "memory usage: 1341 bytes\n" - ) - - scalars_df, _ = scalars_dfs - bf_result = io.StringIO() - - scalars_df.info(buf=bf_result) - - assert expected == bf_result.getvalue() - - -@pytest.mark.parametrize( - ("include", "exclude"), - [ - ("Int64", None), - (["int"], None), - ("number", None), - ([pd.Int64Dtype(), pd.BooleanDtype()], None), - (None, [pd.Int64Dtype(), pd.BooleanDtype()]), - ("Int64", ["boolean"]), - ], -) -def test_select_dtypes(scalars_dfs, include, exclude): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) - bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) - bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_pandas_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index - - pd_result = scalars_pandas_df.drop(index=drop_index) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_bigframes_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - drop_index = scalars_df.loc[[4, 1, 2]].index - drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index - - pd_result = scalars_pandas_df.drop(index=drop_pandas_index) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_bigframes_index_with_na(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - scalars_df = scalars_df.set_index("bytes_col") - scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") - drop_index = scalars_df.iloc[[3, 5]].index - drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index - - pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_bigframes_multiindex(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - 
scalars_pandas_df = scalars_pandas_df.copy() - sub_df = scalars_df.iloc[[4, 1, 2]] - sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]] - sub_df = sub_df.set_index(["bytes_col", "numeric_col"]) - sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"]) - drop_index = sub_df.index - drop_pandas_index = sub_pandas_df.index - - scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) - scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) - bf_result = scalars_df.drop(index=drop_index).to_pandas() - pd_result = scalars_pandas_df.drop(index=drop_pandas_index) - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_labels_axis_0(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) - bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_drop_index_and_columns(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") - bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result) - - -def test_rename(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"bool_col": 1.2345} - df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns - ) - - -def test_df_peek(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) - - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) - assert len(peek_result) == 3 - - -def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) - - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) - assert len(peek_result) == 3 - - -def test_df_peek_filtered(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) - pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) - assert len(peek_result) == 3 - - -def test_df_peek_exception(scalars_dfs): - scalars_df, _ = scalars_dfs - - with pytest.raises(ValueError): - # Window ops aren't compatible with efficient peeking - scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False) - - -def test_df_peek_force_default(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) - pd.testing.assert_index_equal( - scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns - ) - assert len(peek_result) == 3 - - -def test_df_peek_reset_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - peek_result = ( - scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) - ) - pd.testing.assert_index_equal( - scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns - ) - assert len(peek_result) == 3 - - -def test_repr_w_all_rows(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - # Remove columns with flaky formatting, like NUMERIC columns (which use the - # object dtype). 
Also makes a copy so that mutating the index name doesn't - # break other tests. - scalars_df = scalars_df.drop(columns=["numeric_col"]) - scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) - - # When there are 10 or fewer rows, the outputs should be identical. - actual = repr(scalars_df.head(10)) - - with display_options.pandas_repr(bigframes.options.display): - expected = repr(scalars_pandas_df.head(10)) - - assert actual == expected - - -def test_join_repr(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - scalars_df = ( - scalars_df[["int64_col"]] - .join(scalars_df.set_index("int64_col")[["int64_too"]]) - .sort_index() - ) - scalars_pandas_df = ( - scalars_pandas_df[["int64_col"]] - .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) - .sort_index() - ) - # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly - scalars_pandas_df.index.name = None - - actual = repr(scalars_df) - - with display_options.pandas_repr(bigframes.options.display): - expected = repr(scalars_pandas_df) - - assert actual == expected - - -def test_repr_w_display_options(scalars_dfs, session): - metrics = session._metrics - scalars_df, _ = scalars_dfs - # get a pandas df of the expected format - df, _ = scalars_df._block.to_pandas() - pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) - pandas_df.index.name = scalars_df.index.name - - executions_pre = metrics.execution_count - with bigframes.option_context( - "display.max_rows", 10, "display.max_columns", 5, "display.max_colwidth", 10 - ): - - # When there are 10 or fewer rows, the outputs should be identical except for the extra note. - actual = scalars_df.head(10).__repr__() - executions_post = metrics.execution_count - - with display_options.pandas_repr(bigframes.options.display): - pandas_repr = pandas_df.head(10).__repr__() - - assert actual == pandas_repr - assert (executions_post - executions_pre) <= 3 - - -def test_repr_html_w_all_rows(scalars_dfs, session): - metrics = session._metrics - scalars_df, _ = scalars_dfs - # get a pandas df of the expected format - df, _ = scalars_df._block.to_pandas() - pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) - pandas_df.index.name = scalars_df.index.name - - executions_pre = metrics.execution_count - # When there are 10 or fewer rows, the outputs should be identical except for the extra note. 
- actual = scalars_df.head(10)._repr_html_() - executions_post = metrics.execution_count - - with display_options.pandas_repr(bigframes.options.display): - pandas_repr = pandas_df.head(10)._repr_html_() - - expected = ( - pandas_repr - + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" - ) - assert actual == expected - assert (executions_post - executions_pre) <= 3 - - -def test_df_column_name_with_space(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"bool_col": "bool col"} - df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns - ) - - -def test_df_column_name_duplicate(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"int64_too": "int64_col"} - df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() - pd.testing.assert_index_equal( - df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns - ) - - -def test_get_df_column_name_duplicate(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"int64_too": "int64_col"} - - bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() - pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - - -@pytest.mark.parametrize( - ("indices", "axis"), - [ - ([1, 3, 5], 0), - ([2, 4, 6], 1), - ([1, -3, -5, -6], "index"), - ([-2, -4, -6], "columns"), - ], -) -def test_take_df(scalars_dfs, indices, axis): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.take(indices, axis=axis).to_pandas() - pd_result = scalars_pandas_df.take(indices, axis=axis) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_filter_df(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_bool_series = scalars_df["bool_col"] - bf_result = scalars_df[bf_bool_series].to_pandas() - - pd_bool_series = scalars_pandas_df["bool_col"] - pd_result = scalars_pandas_df[pd_bool_series] - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_read_gbq_direct_to_batches_row_count(unordered_session): - df = unordered_session.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") - iter = df.to_pandas_batches() - assert iter.total_rows == 5552452 - - -def test_df_to_pandas_batches(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - capped_unfiltered_batches = scalars_df.to_pandas_batches(page_size=2, max_results=6) - bf_bool_series = scalars_df["bool_col"] - filtered_batches = scalars_df[bf_bool_series].to_pandas_batches() - - pd_bool_series = scalars_pandas_df["bool_col"] - pd_result = scalars_pandas_df[pd_bool_series] - - assert 6 == capped_unfiltered_batches.total_rows - assert len(pd_result) == filtered_batches.total_rows - assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) - - -@pytest.mark.parametrize( - ("literal", "expected_dtype"), - ( - pytest.param( - 2, - dtypes.INT_DTYPE, - id="INT64", - ), - # ==================================================================== - # NULL values - # - # These are regression tests for b/428999884. It needs to be possible to - # set a column to NULL with a desired type (not just the pandas default - # of float64). 
- # ==================================================================== - pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"), - pytest.param( - pa.scalar(None, type=pa.int64()), - dtypes.INT_DTYPE, - id="NULL-pyarrow-TIMESTAMP", - ), - pytest.param( - pa.scalar(None, type=pa.timestamp("us", tz="UTC")), - dtypes.TIMESTAMP_DTYPE, - id="NULL-pyarrow-TIMESTAMP", - ), - pytest.param( - pa.scalar(None, type=pa.timestamp("us")), - dtypes.DATETIME_DTYPE, - id="NULL-pyarrow-DATETIME", - ), - ), -) -def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype): - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.assign(new_col=literal) - bf_result = df.to_pandas() - - new_col_pd = literal - if isinstance(literal, pa.Scalar): - # PyArrow integer scalars aren't yet supported in pandas Int64Dtype. - new_col_pd = literal.as_py() - - # Pandas might not pick the same dtype as BigFrames, but it should at least - # be castable to it. - pd_result = scalars_pandas_df.assign(new_col=new_col_pd) - pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_new_column_w_loc(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[:, "new_col"] = 2 - pd_df.loc[:, "new_col"] = 2 - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("scalar",), - [ - (2.1,), - (None,), - ], -) -def test_assign_new_column_w_setitem(scalars_dfs, scalar): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = scalar - pd_df["new_col"] = scalar - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Float64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_dataframe(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["int64_col"] = bf_df["int64_too"].to_frame() - pd_df["int64_col"] = pd_df["int64_too"].to_frame() - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_df["int64_col"] = pd_df["int64_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df) - - -def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - with pytest.raises(ValueError): - bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]] - with pytest.raises(ValueError): - pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]] - - -def test_assign_new_column_w_setitem_list(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
- pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] - pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - # set the custom index - pd_df = pd_df.set_index(["string_col", "int64_col"]) - bf_df = bf_df.set_index(["string_col", "int64_col"]) - - bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_new_column_w_setitem_list_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - with pytest.raises(ValueError): - pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 - with pytest.raises(ValueError): - bf_df["new_col"] = [1, 2, 3] - - -@pytest.mark.parametrize( - ("key", "value"), - [ - pytest.param(["int64_col", "int64_too"], 1, id="scalar_to_existing_column"), - pytest.param( - ["int64_col", "int64_too"], [1, 2], id="sequence_to_existing_column" - ), - pytest.param( - ["int64_col", "new_col"], [1, 2], id="sequence_to_partial_new_column" - ), - pytest.param( - ["new_col", "new_col_too"], [1, 2], id="sequence_to_full_new_column" - ), - pytest.param( - pd.Index(("new_col", "new_col_too")), - [1, 2], - id="sequence_to_full_new_column_as_index", - ), - ], -) -def test_setitem_multicolumn_with_literals(scalars_dfs, key, value): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.copy() - pd_result = scalars_pandas_df.copy() - - bf_result[key] = value - pd_result[key] = value - - pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) - - -def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs): - scalars_df, _ = scalars_dfs - bf_result = scalars_df.copy() - - with pytest.raises(ValueError): - bf_result[["int64_col", "int64_too"]] = [1] - - -def test_setitem_multicolumn_with_dataframes(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.copy() - pd_result = scalars_pandas_df.copy() - - bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2 - pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2 - - pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) - - -def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs): - scalars_df, _ = scalars_dfs - bf_result = scalars_df.copy() - - with pytest.raises(ValueError): - 
bf_result[["int64_col", "int64_too"]] = bf_result["int64_col"] / 2 - - -def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs): - scalars_df, _ = scalars_dfs - bf_result = scalars_df.copy() - - with pytest.raises(ValueError): - bf_result[["int64_col"]] = bf_result[["int64_col", "int64_too"]] / 2 - - -def test_assign_existing_column(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"int64_col": 2} - df = scalars_df.assign(**kwargs) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_listlike_to_empty_df(session): - empty_df = dataframe.DataFrame(session=session) - empty_pandas_df = pd.DataFrame() - - bf_result = empty_df.assign(new_col=[1, 2, 3]) - pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) - - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal(bf_result.to_pandas(), pd_result) - - -def test_assign_to_empty_df_multiindex_error(session): - empty_df = dataframe.DataFrame(session=session) - empty_pandas_df = pd.DataFrame() - - empty_df["empty_col_1"] = typing.cast(series.Series, []) - empty_df["empty_col_2"] = typing.cast(series.Series, []) - empty_pandas_df["empty_col_1"] = [] - empty_pandas_df["empty_col_2"] = [] - empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) - empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) - - with pytest.raises(ValueError): - empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) - with pytest.raises(ValueError): - empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_assign_series(scalars_dfs, ordered): - scalars_df, scalars_pandas_df = scalars_dfs - column_name = "int64_col" - df = scalars_df.assign(new_col=scalars_df[column_name]) - bf_result = df.to_pandas(ordered=ordered) - pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -def test_assign_series_overwrite(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - column_name = "int64_col" - df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign( - **{column_name: scalars_pandas_df[column_name] + 3} - ) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_sequential(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} - df = scalars_df.assign(**kwargs) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - pd_result["new_col"] = pd_result["new_col"].astype("Int64") - pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") - - assert_pandas_df_equal(bf_result, pd_result) - - -# Require an index so that the self-join is consistent each time. 
-def test_assign_same_table_different_index_performs_self_join( - scalars_df_index, scalars_pandas_df_index -): - column_name = "int64_col" - bf_df = scalars_df_index.assign( - alternative_index=scalars_df_index["rowindex_2"] + 2 - ) - pd_df = scalars_pandas_df_index.assign( - alternative_index=scalars_pandas_df_index["rowindex_2"] + 2 - ) - bf_df_2 = bf_df.set_index("alternative_index") - pd_df_2 = pd_df.set_index("alternative_index") - bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() - pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -# Different table expression must have Index -def test_assign_different_df( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - column_name = "int64_col" - df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) - bf_result = df.to_pandas() - # Doesn't matter to pandas if it comes from the same DF or a different DF. - pd_result = scalars_pandas_df_index.assign( - new_col=scalars_pandas_df_index[column_name] - ) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_assign_different_df_w_loc( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - bf_df = scalars_df_index.copy() - bf_df2 = scalars_df_2_index.copy() - pd_df = scalars_pandas_df_index.copy() - assert "int64_col" in bf_df.columns - assert "int64_col" in pd_df.columns - bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 - pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_different_df_w_setitem( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index -): - bf_df = scalars_df_index.copy() - bf_df2 = scalars_df_2_index.copy() - pd_df = scalars_pandas_df_index.copy() - assert "int64_col" in bf_df.columns - assert "int64_col" in pd_df.columns - bf_df["int64_col"] = bf_df2["int64_col"] + 1 - pd_df["int64_col"] = pd_df["int64_col"] + 1 - bf_result = bf_df.to_pandas() - pd_result = pd_df - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_assign_callable_lambda(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} - df = scalars_df.assign(**kwargs) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.assign(**kwargs) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
- pd_result["new_col"] = pd_result["new_col"].astype("Int64") - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("axis", "how", "ignore_index", "subset"), - [ - (0, "any", False, None), - (0, "any", True, None), - (0, "all", False, ["bool_col", "time_col"]), - (0, "any", False, ["bool_col", "time_col"]), - (0, "all", False, "time_col"), - (1, "any", False, None), - (1, "all", False, None), - ], -) -def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna( - axis=axis, how=how, ignore_index=ignore_index, subset=subset - ) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("axis", "ignore_index", "subset", "thresh"), - [ - (0, False, None, 2), - (0, True, None, 3), - (1, False, None, 2), - ], -) -def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): - """ - Tests that dropna correctly keeps rows/columns with a minimum number - of non-null values. - """ - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - df_result = scalars_df.dropna( - axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset - ) - pd_result = scalars_pandas_df.dropna( - axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset - ) - - bf_result = df_result.to_pandas() - # Pandas uses int64 instead of Int64 (nullable) dtype. 
- pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_dropna_range_columns(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - scalars_df.columns = pandas.RangeIndex(0, len(scalars_df.columns)) - scalars_pandas_df.columns = pandas.RangeIndex(0, len(scalars_pandas_df.columns)) - - df = scalars_df.dropna() - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna() - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_interpolate(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - columns = ["int64_col", "int64_too", "float64_col"] - bf_result = scalars_df[columns].interpolate().to_pandas() - # Pandas can only interpolate on "float64" columns - # https://github.com/pandas-dev/pandas/issues/40252 - pd_result = scalars_pandas_df[columns].astype("float64").interpolate() - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - check_index_type=False, - check_dtype=False, - ) - - -@pytest.mark.parametrize( - "col, fill_value", - [ - (["int64_col", "float64_col"], 3), - (["string_col"], "A"), - (["datetime_col"], pd.Timestamp("2023-01-01")), - ], -) -def test_df_fillna(scalars_dfs, col, fill_value): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[col].fillna(fill_value).to_pandas() - pd_result = scalars_pandas_df[col].fillna(fill_value) - - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -def test_df_replace_scalar_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(555.555, 3).to_pandas() - pd_result = scalars_pandas_df.replace(555.555, 3) - - # pandas has narrower result types as they are determined dynamically - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) - - -def test_df_replace_regex_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() - pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) - - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -def test_df_replace_list_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() - pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) - - # pandas has narrower result types as they are determined dynamically - pd.testing.assert_frame_equal( - pd_result, - bf_result, - check_dtype=False, - ) - - -def test_df_replace_value_dict(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() - pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) - - pd.testing.assert_frame_equal( - pd_result, - bf_result, - ) - - -def test_df_ffill(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() - pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_bfill(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() - pd_result = 
scalars_pandas_df[["int64_col", "float64_col"]].bfill() - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_apply_series_series_callable( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - - def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): - return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) - - bf_result = ( - scalars_df_index[columns] - .apply(foo, args=(33, 61), kwarg1=52, kwarg2=21) - .to_pandas() - ) - - pd_result = scalars_pandas_df_index[columns].apply( - foo, args=(33, 61), kwarg1=52, kwarg2=21 - ) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_apply_series_listlike_callable( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - bf_result = ( - scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() - ) - - pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) - - # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. - pd_result.index = pd_result.index.astype("Int64") - pd_result = pd_result.astype("Int64") - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_apply_series_scalar_callable( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) - - pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) - - pandas.testing.assert_series_equal(bf_result, pd_result) - - -def test_df_pipe( - scalars_df_index, - scalars_pandas_df_index, -): - columns = ["int64_too", "int64_col"] - - def foo(x: int, y: int, df): - return (df + x) % y - - bf_result = ( - scalars_df_index[columns] - .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x**2) - .to_pandas() - ) - - pd_result = ( - scalars_pandas_df_index[columns] - .pipe((foo, "df"), x=7, y=9) - .pipe(lambda x: x**2) - ) - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_keys( - scalars_df_index, - scalars_pandas_df_index, -): - pandas.testing.assert_index_equal( - scalars_df_index.keys(), scalars_pandas_df_index.keys() - ) - - -def test_df_iter( - scalars_df_index, - scalars_pandas_df_index, -): - for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): - assert bf_i == df_i - - -def test_iterrows( - scalars_df_index, - scalars_pandas_df_index, -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) - scalars_pandas_df_index = scalars_pandas_df_index.add_suffix("_suffix", axis=1) - for (bf_index, bf_series), (pd_index, pd_series) in zip( - scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() - ): - assert bf_index == pd_index - pandas.testing.assert_series_equal(bf_series, pd_series) - - -@pytest.mark.parametrize( - ( - "index", - "name", - ), - [ - ( - True, - "my_df", - ), - (False, None), - ], -) -def test_itertuples(scalars_df_index, index, name): - # Numeric has slightly different representation as a result of conversions. 
- bf_tuples = scalars_df_index.itertuples(index, name) - pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) - for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): - assert bf_tuple == pd_tuple - - -def test_df_isin_list_w_null(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - values = ["Hello, World!", 55555, 2.51, pd.NA, True] - bf_result = ( - scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] - .isin(values) - .to_pandas() - ) - pd_result = scalars_pandas_df[ - ["int64_col", "float64_col", "string_col", "bool_col"] - ].isin(values) - - pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) - - -def test_df_isin_list_wo_null(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - values = ["Hello, World!", 55555, 2.51, True] - bf_result = ( - scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] - .isin(values) - .to_pandas() - ) - pd_result = scalars_pandas_df[ - ["int64_col", "float64_col", "string_col", "bool_col"] - ].isin(values) - - pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) - - -def test_df_isin_dict(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - values = { - "string_col": ["Hello, World!", 55555, 2.51, pd.NA, True], - "int64_col": [5555, 2.51], - "bool_col": [pd.NA], - } - bf_result = ( - scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] - .isin(values) - .to_pandas() - ) - pd_result = scalars_pandas_df[ - ["int64_col", "float64_col", "string_col", "bool_col"] - ].isin(values) - - pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) - - -def test_df_cross_merge(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col", "rowindex_2"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - # Offset the rows somewhat so that outer join can have an effect. - right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) - - bf_result = left.merge(right, "cross").to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns].assign( - rowindex_2=scalars_pandas_df["rowindex_2"] + 2 - ), - "cross", - ) - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("merge_how",), - [ - ("inner",), - ("outer",), - ("left",), - ("right",), - ], -) -def test_df_merge(scalars_dfs, merge_how): - scalars_df, scalars_pandas_df = scalars_dfs - on = "rowindex_2" - left_columns = ["int64_col", "float64_col", "rowindex_2"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - # Offset the rows somewhat so that outer join can have an effect. 
- right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) - - df = left.merge(right, merge_how, on, sort=True) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns].assign( - rowindex_2=scalars_pandas_df["rowindex_2"] + 2 - ), - merge_how, - on, - sort=True, - ) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("left_on", "right_on"), - [ - (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), - (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), - (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), - ], -) -def test_df_merge_multi_key(scalars_dfs, left_on, right_on): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col", "rowindex_2"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - # Offset the rows somewhat so that outer join can have an effect. - right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) - - df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns].assign( - rowindex_2=scalars_pandas_df["rowindex_2"] + 2 - ), - "outer", - left_on=left_on, - right_on=right_on, - sort=True, - ) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("merge_how",), - [ - ("inner",), - ("outer",), - ("left",), - ("right",), - ], -) -def test_merge_custom_col_name(scalars_dfs, merge_how): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col"] - right_columns = ["int64_col", "bool_col", "string_col"] - on = "int64_col" - rename_columns = {"float64_col": "f64_col"} - - left = scalars_df[left_columns] - left = left.rename(columns=rename_columns) - right = scalars_df[right_columns] - df = left.merge(right, merge_how, on, sort=True) - bf_result = df.to_pandas() - - pandas_left_df = scalars_pandas_df[left_columns] - pandas_left_df = pandas_left_df.rename(columns=rename_columns) - pandas_right_df = scalars_pandas_df[right_columns] - pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("merge_how",), - [ - ("inner",), - ("outer",), - ("left",), - ("right",), - ], -) -def test_merge_left_on_right_on(scalars_dfs, merge_how): - scalars_df, scalars_pandas_df = scalars_dfs - left_columns = ["int64_col", "float64_col", "int64_too"] - right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] - - left = scalars_df[left_columns] - right = scalars_df[right_columns] - - df = left.merge( - right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True - ) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df[left_columns].merge( - scalars_pandas_df[right_columns], - merge_how, - left_on="int64_too", - right_on="rowindex_2", - sort=True, - ) - - assert_pandas_df_equal( - bf_result, pd_result, ignore_order=True, check_index_type=False - ) - - -def test_self_merge_self_w_on_args(): - data = { - "A": pd.Series([1, 2, 3], dtype="Int64"), - "B": pd.Series([1, 2, 3], dtype="Int64"), - "C": pd.Series([100, 200, 300], dtype="Int64"), - "D": pd.Series(["alpha", "beta", "gamma"], 
dtype="string[pyarrow]"), - } - df = pd.DataFrame(data) - - df1 = df[["A", "C"]] - df2 = df[["B", "C", "D"]] - pd_result = df1.merge(df2, left_on=["A", "C"], right_on=["B", "C"], how="inner") - - bf_df = bpd.DataFrame(data) - - bf_df1 = bf_df[["A", "C"]] - bf_df2 = bf_df[["B", "C", "D"]] - bf_result = bf_df1.merge( - bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" - ).to_pandas() - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("decimals",), - [ - (2,), - ({"float64_col": 0, "bool_col": 1, "int64_too": -3},), - ({},), - ], -) -def test_dataframe_round(scalars_dfs, decimals): - if pd.__version__.startswith("1."): - pytest.skip("Rounding doesn't work as expected in pandas 1.x") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.round(decimals).to_pandas() - pd_result = scalars_pandas_df.round(decimals) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_get_dtypes(scalars_df_default_index): - dtypes = scalars_df_default_index.dtypes - dtypes_dict: Dict[str, bigframes.dtypes.Dtype] = { - "bool_col": pd.BooleanDtype(), - "bytes_col": pd.ArrowDtype(pa.binary()), - "date_col": pd.ArrowDtype(pa.date32()), - "datetime_col": pd.ArrowDtype(pa.timestamp("us")), - "geography_col": gpd.array.GeometryDtype(), - "int64_col": pd.Int64Dtype(), - "int64_too": pd.Int64Dtype(), - "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), - "float64_col": pd.Float64Dtype(), - "rowindex": pd.Int64Dtype(), - "rowindex_2": pd.Int64Dtype(), - "string_col": pd.StringDtype(storage="pyarrow"), - "time_col": pd.ArrowDtype(pa.time64("us")), - "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), - "duration_col": pd.ArrowDtype(pa.duration("us")), - } - pd.testing.assert_series_equal( - dtypes, - pd.Series(dtypes_dict), - ) - - -def test_get_dtypes_array_struct_query(session): - df = session.read_gbq( - """SELECT - [1, 3, 2] AS array_column, - STRUCT( - "a" AS string_field, - 1.2 AS float_field) AS struct_column""" - ) - - dtypes = df.dtypes - pd.testing.assert_series_equal( - dtypes, - pd.Series( - { - "array_column": pd.ArrowDtype(pa.list_(pa.int64())), - "struct_column": pd.ArrowDtype( - pa.struct( - [ - ("string_field", pa.string()), - ("float_field", pa.float64()), - ] - ) - ), - } - ), - ) - - -def test_get_dtypes_array_struct_table(nested_df): - dtypes = nested_df.dtypes - pd.testing.assert_series_equal( - dtypes, - pd.Series( - { - "customer_id": pd.StringDtype(storage="pyarrow"), - "day": pd.ArrowDtype(pa.date32()), - "flag": pd.Int64Dtype(), - "label": pd.ArrowDtype( - pa.struct( - [ - ("key", pa.string()), - ("value", pa.string()), - ] - ), - ), - "event_sequence": pd.ArrowDtype( - pa.list_( - pa.struct( - [ - pa.field( - "data", - pa.list_( - pa.struct( - [ - ("value", pa.float64()), - ("key", pa.string()), - ], - ), - ), - nullable=False, - ), - ("timestamp", pa.timestamp("us", "UTC")), - ("category", pa.string()), - ] - ), - ), - ), - "address": pd.ArrowDtype( - pa.struct( - [ - ("street", pa.string()), - ("city", pa.string()), - ] - ), - ), - } - ), - ) - - -def test_shape(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.shape - pd_result = scalars_pandas_df.shape - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - "reference_table, test_table", - [ - ( - "bigframes-dev.bigframes_tests_sys.base_table", - "bigframes-dev.bigframes_tests_sys.base_table_mat_view", - ), - ( - "bigframes-dev.bigframes_tests_sys.base_table", - 
"bigframes-dev.bigframes_tests_sys.base_table_view", - ), - ( - "bigframes-dev.bigframes_tests_sys.csv_native_table", - "bigframes-dev.bigframes_tests_sys.csv_external_table", - ), - ], -) -def test_view_and_external_table_shape(session, reference_table, test_table): - reference_df = session.read_gbq(reference_table) - test_df = session.read_gbq(test_table) - - assert test_df.shape == reference_df.shape - - -def test_len(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = len(scalars_df) - pd_result = len(scalars_pandas_df) - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("n_rows",), - [ - (50,), - (10000,), - ], -) -@pytest.mark.parametrize( - "write_engine", - ["bigquery_load", "bigquery_streaming", "bigquery_write"], -) -def test_df_len_local(session, n_rows, write_engine): - assert ( - len( - session.read_pandas( - pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), - write_engine=write_engine, - ) - ) - == n_rows - ) - - -def test_size(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.size - pd_result = scalars_pandas_df.size - - assert bf_result == pd_result - - -def test_ndim(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df.ndim - pd_result = scalars_pandas_df.ndim - - assert bf_result == pd_result - - -def test_empty_false(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.empty - pd_result = scalars_pandas_df.empty - - assert bf_result == pd_result - - -def test_empty_true_column_filter(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df[[]].empty - pd_result = scalars_pandas_df[[]].empty - - assert bf_result == pd_result - - -def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): - scalars_df, scalars_pandas_df = scalars_dfs - bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) - pd_bool: pd.Series = scalars_pandas_df["bool_col"] - bf_false = bf_bool.notna() & (bf_bool != bf_bool) - pd_false = pd_bool.notna() & (pd_bool != pd_bool) - - bf_result = scalars_df[bf_false].empty - pd_result = scalars_pandas_df[pd_false].empty - - assert pd_result - assert bf_result == pd_result - - -def test_empty_true_memtable(session: bigframes.Session): - bf_df = dataframe.DataFrame(session=session) - pd_df = pd.DataFrame() - - bf_result = bf_df.empty - pd_result = pd_df.empty - - assert pd_result - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("drop",), - ((True,), (False,)), -) -def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): - df = scalars_df_index.reset_index(drop=drop) - assert df.index.name is None - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.reset_index(drop=drop) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. 
- pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy() - scalars_df_index.index.name = "int64_col" - df = scalars_df_index.reset_index(allow_duplicates=True, drop=False) - assert df.index.name is None - - bf_result = df.to_pandas() - - scalars_pandas_df_index = scalars_pandas_df_index.copy() - scalars_pandas_df_index.index.name = "int64_col" - pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_duplicates_error(scalars_df_index): - scalars_df_index = scalars_df_index.copy() - scalars_df_index.index.name = "int64_col" - with pytest.raises(ValueError): - scalars_df_index.reset_index(allow_duplicates=False, drop=False) - - -@pytest.mark.parametrize( - ("drop",), - ((True,), (False,)), -) -def test_reset_index_inplace(scalars_df_index, scalars_pandas_df_index, drop): - df = scalars_df_index.copy() - df.reset_index(drop=drop, inplace=True) - assert df.index.name is None - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.copy() - pd_result.reset_index(drop=drop, inplace=True) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_then_filter( - scalars_df_index, - scalars_pandas_df_index, -): - bf_filter = scalars_df_index["bool_col"].fillna(True) - bf_df = scalars_df_index.reset_index()[bf_filter] - bf_result = bf_df.to_pandas() - pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) - pd_result = scalars_pandas_df_index.reset_index()[pd_filter] - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering and index keys - # post-filter will have gaps. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_with_unnamed_index( - scalars_df_index, - scalars_pandas_df_index, -): - scalars_df_index = scalars_df_index.copy() - scalars_pandas_df_index = scalars_pandas_df_index.copy() - - scalars_df_index.index.name = None - scalars_pandas_df_index.index.name = None - df = scalars_df_index.reset_index(drop=False) - assert df.index.name is None - - # reset_index(drop=False) creates a new column "index". - assert df.columns[0] == "index" - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.reset_index(drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. 
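The reset_index tests above and below depend on pandas' naming rules for an unnamed index. A minimal sketch in plain pandas: the former index becomes a column named "index", unless that name is already taken, in which case pandas falls back to "level_0".

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    print(df.reset_index().columns.tolist())  # ['index', 'a']

    # With an existing "index" column, the old index is named "level_0".
    df["index"] = [10, 20]
    print(df.reset_index().columns.tolist())  # ['level_0', 'a', 'index']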
- pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_reset_index_with_unnamed_multiindex( - scalars_df_index, - scalars_pandas_df_index, -): - bf_df = dataframe.DataFrame( - ([1, 2, 3], [2, 5, 7]), - index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), - ) - pd_df = pd.DataFrame( - ([1, 2, 3], [2, 5, 7]), - index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), - ) - - bf_df = bf_df.reset_index() - pd_df = pd_df.reset_index() - - assert pd_df.columns[0] == "level_0" - assert bf_df.columns[0] == "level_0" - assert pd_df.columns[1] == "level_1" - assert bf_df.columns[1] == "level_1" - - -def test_reset_index_with_unnamed_index_and_index_column( - scalars_df_index, - scalars_pandas_df_index, -): - scalars_df_index = scalars_df_index.copy() - scalars_pandas_df_index = scalars_pandas_df_index.copy() - - scalars_df_index.index.name = None - scalars_pandas_df_index.index.name = None - df = scalars_df_index.assign(index=scalars_df_index["int64_col"]).reset_index( - drop=False - ) - assert df.index.name is None - - # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. - assert df.columns[0] == "level_0" - - bf_result = df.to_pandas() - pd_result = scalars_pandas_df_index.assign( - index=scalars_pandas_df_index["int64_col"] - ).reset_index(drop=False) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - # reset_index should maintain the original ordering. - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("drop",), - ( - (True,), - (False,), - ), -) -@pytest.mark.parametrize( - ("append",), - ( - (True,), - (False,), - ), -) -@pytest.mark.parametrize( - ("index_column",), - (("int64_too",), ("string_col",), ("timestamp_col",)), -) -def test_set_index(scalars_dfs, index_column, drop, append): - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.set_index(index_column, append=append, drop=drop) - bf_result = df.to_pandas() - pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) - - # Sort to disambiguate when there are duplicate index labels. - # Note: Doesn't use assert_pandas_df_equal_ignore_ordering because we get - # "ValueError: 'timestamp_col' is both an index level and a column label, - # which is ambiguous" when trying to sort by a column with the same name as - # the index. 
- bf_result = bf_result.sort_values("rowindex_2") - pd_result = pd_result.sort_values("rowindex_2") - - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_set_index_key_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - with pytest.raises(KeyError): - scalars_pandas_df.set_index(["not_a_col"]) - with pytest.raises(KeyError): - scalars_df.set_index(["not_a_col"]) - - -@pytest.mark.parametrize( - ("ascending",), - ((True,), (False,)), -) -@pytest.mark.parametrize( - ("na_position",), - (("first",), ("last",)), -) -@pytest.mark.parametrize( - ("axis",), - ((0,), ("columns",)), -) -def test_sort_index(scalars_dfs, ascending, na_position, axis): - index_column = "int64_col" - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.set_index(index_column) - bf_result = df.sort_index( - ascending=ascending, na_position=na_position, axis=axis - ).to_pandas() - pd_result = scalars_pandas_df.set_index(index_column).sort_index( - ascending=ascending, na_position=na_position, axis=axis - ) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_dataframe_sort_index_inplace(scalars_dfs): - index_column = "int64_col" - scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.copy().set_index(index_column) - df.sort_index(ascending=False, inplace=True) - bf_result = df.to_pandas() - - pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False) - pandas.testing.assert_frame_equal(bf_result, pd_result) - - -def test_df_abs(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - columns = ["int64_col", "int64_too", "float64_col"] - - bf_result = scalars_df[columns].abs() - pd_result = scalars_pandas_df[columns].abs() - - assert_dfs_equivalent(pd_result, bf_result) - - -def test_df_pos(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas() - pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]] - - assert_pandas_df_equal(pd_result, bf_result) - - -def test_df_neg(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas() - pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]] - - assert_pandas_df_equal(pd_result, bf_result) - - -def test_df__abs__(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = ( - abs(scalars_df[["int64_col", "numeric_col", "float64_col"]]) - ).to_pandas() - pd_result = abs(scalars_pandas_df[["int64_col", "numeric_col", "float64_col"]]) - - assert_pandas_df_equal(pd_result, bf_result) - - -def test_df_invert(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - columns = ["int64_col", "bool_col"] - - bf_result = (~scalars_df[columns]).to_pandas() - pd_result = ~scalars_pandas_df[columns] - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_df_isnull(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - columns = ["int64_col", "int64_too", "string_col", "bool_col"] - bf_result = scalars_df[columns].isnull().to_pandas() - pd_result = scalars_pandas_df[columns].isnull() - - # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is - # `BooleanDtype` but the `pd_result.dtype` is `bool`. 
- pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) - pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) - pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) - pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - - assert_pandas_df_equal(bf_result, pd_result) - - -def test_df_notnull(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - - columns = ["int64_col", "int64_too", "string_col", "bool_col"] - bf_result = scalars_df[columns].notnull().to_pandas() - pd_result = scalars_pandas_df[columns].notnull() - - # One of dtype mismatches to be documented. Here, the `bf_result.dtype` is - # `BooleanDtype` but the `pd_result.dtype` is `bool`. - pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype()) - pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype()) - pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype()) - pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype()) - - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("left_labels", "right_labels", "overwrite", "fill_value"), - [ - (["a", "b", "c"], ["c", "a", "b"], True, None), - (["a", "b", "c"], ["c", "a", "b"], False, None), - (["a", "b", "c"], ["a", "b", "c"], False, 2), - ], - ids=[ - "one_one_match_overwrite", - "one_one_match_no_overwrite", - "exact_match", - ], -) -def test_combine( - scalars_df_index, - scalars_df_2_index, - scalars_pandas_df_index, - left_labels, - right_labels, - overwrite, - fill_value, -): - if pd.__version__.startswith("1."): - pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") - columns = ["int64_too", "int64_col", "float64_col"] - - bf_df_a = scalars_df_index[columns] - bf_df_a.columns = left_labels - bf_df_b = scalars_df_2_index[columns] - bf_df_b.columns = right_labels - bf_result = bf_df_a.combine( - bf_df_b, - lambda x, y: x**2 + 2 * x * y + y**2, - overwrite=overwrite, - fill_value=fill_value, - ).to_pandas() - - pd_df_a = scalars_pandas_df_index[columns] - pd_df_a.columns = left_labels - pd_df_b = scalars_pandas_df_index[columns] - pd_df_b.columns = right_labels - pd_result = pd_df_a.combine( - pd_df_b, - lambda x, y: x**2 + 2 * x * y + y**2, - overwrite=overwrite, - fill_value=fill_value, - ) - - # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("overwrite", "filter_func"), - [ - (True, None), - (False, None), - (True, lambda x: x.isna() | (x % 2 == 0)), - ], - ids=[ - "default", - "overwritefalse", - "customfilter", - ], -) -def test_df_update(overwrite, filter_func): - if pd.__version__.startswith("1."): - pytest.skip("dtype handled differently in pandas 1.x.") - - index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") - - index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") - pd_df1 = pandas.DataFrame( - {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 - ) - pd_df2 = pandas.DataFrame( - {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, - dtype="Int64", - index=index2, - ) - - bf_df1 = dataframe.DataFrame(pd_df1) - bf_df2 = dataframe.DataFrame(pd_df2) - - bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func) - pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func) - - pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1) - - -def test_df_idxmin(): - pd_df = pd.DataFrame( - {"a": [1, 2, 3], "b": [7, None, 
3], "c": [4, 4, 4]}, index=["x", "y", "z"] - ) - bf_df = dataframe.DataFrame(pd_df) - - bf_result = bf_df.idxmin().to_pandas() - pd_result = pd_df.idxmin() - - pd.testing.assert_series_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False - ) - - -def test_df_idxmax(): - pd_df = pd.DataFrame( - {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"] - ) - bf_df = dataframe.DataFrame(pd_df) - - bf_result = bf_df.idxmax().to_pandas() - pd_result = pd_df.idxmax() - - pd.testing.assert_series_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False - ) - - -@pytest.mark.parametrize( - ("join", "axis"), - [ - ("outer", None), - ("outer", 0), - ("outer", 1), - ("left", 0), - ("right", 1), - ("inner", None), - ("inner", 1), - ], -) -def test_df_align(join, axis): - - index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64") - - index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64") - pd_df1 = pandas.DataFrame( - {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1 - ) - pd_df2 = pandas.DataFrame( - {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]}, - dtype="Int64", - index=index2, - ) - - bf_df1 = dataframe.DataFrame(pd_df1) - bf_df2 = dataframe.DataFrame(pd_df2) - - bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis) - pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis) - - # Don't check dtype as pandas does unnecessary float conversion - assert isinstance(bf_result1, dataframe.DataFrame) and isinstance( - bf_result2, dataframe.DataFrame - ) - pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False) - pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False) - - -def test_combine_first( - scalars_df_index, - scalars_df_2_index, - scalars_pandas_df_index, -): - if pd.__version__.startswith("1."): - pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") - columns = ["int64_too", "int64_col", "float64_col"] - - bf_df_a = scalars_df_index[columns].iloc[0:6] - bf_df_a.columns = ["a", "b", "c"] - bf_df_b = scalars_df_2_index[columns].iloc[2:8] - bf_df_b.columns = ["b", "a", "d"] - bf_result = bf_df_a.combine_first(bf_df_b).to_pandas() - - pd_df_a = scalars_pandas_df_index[columns].iloc[0:6] - pd_df_a.columns = ["a", "b", "c"] - pd_df_b = scalars_pandas_df_index[columns].iloc[2:8] - pd_df_b.columns = ["b", "a", "d"] - pd_result = pd_df_a.combine_first(pd_df_b) - - # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("columns", "numeric_only"), - [ - (["bool_col", "int64_col", "float64_col"], True), - (["bool_col", "int64_col", "float64_col"], False), - (["bool_col", "int64_col", "float64_col", "string_col"], True), - pytest.param( - ["bool_col", "int64_col", "float64_col", "string_col"], - False, - marks=pytest.mark.xfail( - raises=NotImplementedError, - ), - ), - ], -) -def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas() - pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only) - - # BigFrames and Pandas differ in their data type handling: - # - Column types: BigFrames uses Float64, Pandas uses float64. - # - Index types: BigFrames uses strign, Pandas uses object. 
-    pd.testing.assert_index_equal(bf_result.columns, pd_result.columns)
-    # Only check row order in ordered mode.
-    pd.testing.assert_frame_equal(
-        bf_result,
-        pd_result,
-        check_dtype=False,
-        check_index_type=False,
-        check_like=~scalars_df._block.session._strictly_ordered,
-    )
-
-
-def test_df_corr_w_invalid_parameters(scalars_dfs):
-    columns = ["int64_too", "int64_col", "float64_col"]
-    scalars_df, _ = scalars_dfs
-
-    with pytest.raises(NotImplementedError):
-        scalars_df[columns].corr(method="kendall")
-
-    with pytest.raises(NotImplementedError):
-        scalars_df[columns].corr(min_periods=1)
-
-
-@pytest.mark.parametrize(
-    ("columns", "numeric_only"),
-    [
-        (["bool_col", "int64_col", "float64_col"], True),
-        (["bool_col", "int64_col", "float64_col"], False),
-        (["bool_col", "int64_col", "float64_col", "string_col"], True),
-        pytest.param(
-            ["bool_col", "int64_col", "float64_col", "string_col"],
-            False,
-            marks=pytest.mark.xfail(
-                raises=NotImplementedError,
-            ),
-        ),
-    ],
-)
-def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only):
-    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
-    bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas()
-    pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only)
-    # BigFrames and Pandas differ in their data type handling:
-    # - Column types: BigFrames uses Float64, Pandas uses float64.
-    # - Index types: BigFrames uses string, Pandas uses object.
-    pd.testing.assert_index_equal(bf_result.columns, pd_result.columns)
-    # Only check row order in ordered mode.
-    pd.testing.assert_frame_equal(
-        bf_result,
-        pd_result,
-        check_dtype=False,
-        check_index_type=False,
-        check_like=~scalars_df._block.session._strictly_ordered,
-    )
-
-
-def test_df_corrwith_df(scalars_dfs_maybe_ordered):
-    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
-
-    l_cols = ["int64_col", "float64_col", "int64_too"]
-    r_cols = ["int64_too", "float64_col"]
-
-    bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas()
-    pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols])
-
-    # BigFrames and Pandas differ in their data type handling:
-    # - Column types: BigFrames uses Float64, Pandas uses float64.
-    # - Index types: BigFrames uses string, Pandas uses object.
-    pd.testing.assert_series_equal(
-        bf_result, pd_result, check_dtype=False, check_index_type=False
-    )
-
-
-def test_df_corrwith_df_numeric_only(scalars_dfs):
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    l_cols = ["int64_col", "float64_col", "int64_too", "string_col"]
-    r_cols = ["int64_too", "float64_col", "bool_col"]
-
-    bf_result = (
-        scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas()
-    )
-    pd_result = scalars_pandas_df[l_cols].corrwith(
-        scalars_pandas_df[r_cols], numeric_only=True
-    )
-
-    # BigFrames and Pandas differ in their data type handling:
-    # - Column types: BigFrames uses Float64, Pandas uses float64.
-    # - Index types: BigFrames uses string, Pandas uses object.
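The corrwith tests in this hunk compare column-wise correlations across two frames. A minimal sketch in plain pandas: corrwith pairs columns by label and returns one correlation per label; columns without a partner yield NaN.

    import pandas as pd

    df1 = pd.DataFrame({"x": [1, 2, 3], "y": [1, 4, 9]})
    df2 = pd.DataFrame({"x": [2, 4, 6], "z": [1, 2, 3]})
    # 'x' pairs with 'x' (correlation 1.0); 'y' and 'z' are unmatched.
    print(df1.corrwith(df2))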
-    pd.testing.assert_series_equal(
-        bf_result, pd_result, check_dtype=False, check_index_type=False
-    )
-
-
-def test_df_corrwith_df_non_numeric_error(scalars_dfs):
-    scalars_df, _ = scalars_dfs
-
-    l_cols = ["int64_col", "float64_col", "int64_too", "string_col"]
-    r_cols = ["int64_too", "float64_col", "bool_col"]
-
-    with pytest.raises(NotImplementedError):
-        scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False)
-
-
-def test_df_corrwith_series(scalars_dfs_maybe_ordered):
-    # TODO: supply a reason why this isn't compatible with pandas 1.x
-    pytest.importorskip("pandas", minversion="2.0.0")
-    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
-
-    l_cols = ["int64_col", "float64_col", "int64_too"]
-    r_col = "float64_col"
-
-    bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas()
-    pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col])
-
-    # BigFrames and Pandas differ in their data type handling:
-    # - Column types: BigFrames uses Float64, Pandas uses float64.
-    # - Index types: BigFrames uses string, Pandas uses object.
-    pd.testing.assert_series_equal(
-        bf_result, pd_result, check_dtype=False, check_index_type=False
-    )
-
-
-@pytest.mark.parametrize(
-    ("op"),
-    [
-        operator.add,
-        operator.sub,
-        operator.mul,
-        operator.truediv,
-        operator.floordiv,
-        operator.eq,
-        operator.ne,
-        operator.gt,
-        operator.ge,
-        operator.lt,
-        operator.le,
-    ],
-    ids=[
-        "add",
-        "subtract",
-        "multiply",
-        "true_divide",
-        "floor_divide",
-        "eq",
-        "ne",
-        "gt",
-        "ge",
-        "lt",
-        "le",
-    ],
-)
-# TODO(garrettwu): deal with NA values
-@pytest.mark.parametrize(("other_scalar"), [1, 2.5, 0, 0.0])
-@pytest.mark.parametrize(("reverse_operands"), [True, False])
-def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands):
-    scalars_df, scalars_pandas_df = scalars_dfs
-    columns = ["int64_col", "float64_col"]
-
-    maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op
-
-    bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas()
-    pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar)
-
-    assert_pandas_df_equal(bf_result, pd_result)
-
-
-def test_dataframe_string_radd_const(scalars_dfs):
-    pytest.importorskip(
-        "pandas",
-        minversion="2.0.0",
-        reason="PyArrow string addition requires pandas 2.0+",
-    )
-
-    scalars_df, scalars_pandas_df = scalars_dfs
-    columns = ["string_col", "string_col"]
-
-    bf_result = ("prefix" + scalars_df[columns]).to_pandas()
-    pd_result = "prefix" + scalars_pandas_df[columns]
-
-    assert_pandas_df_equal(bf_result, pd_result)
-
-
-@pytest.mark.parametrize(("other_scalar"), [1, -2])
-def test_mod(scalars_dfs, other_scalar):
-    # Zero case excluded as pandas produces a 0 result for Int64 inputs rather than NA/NaN.
-    # This is likely a pandas bug, as mod 0 is undefined in other dtypes and in most programming languages.
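The comment above motivates excluding a zero divisor from test_mod, whose body follows. A small illustration of the dtype difference it describes, in plain pandas (nullable-integer behavior has varied across pandas versions, so treat the second result as indicative):

    import pandas as pd

    # float64 follows the usual convention: n % 0 is NaN.
    print(pd.Series([1.0, 2.0]) % 0)

    # Nullable Int64 has produced 0 for n % 0 in some pandas versions,
    # which is why only non-zero divisors are parametrized here.
    print(pd.Series([1, 2], dtype="Int64") % 0)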
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas()
-    pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar
-
-    assert_pandas_df_equal(bf_result, pd_result)
-
-
-def test_scalar_binop_str_exception(scalars_dfs):
-    scalars_df, _ = scalars_dfs
-    columns = ["string_col"]
-    with pytest.raises(TypeError, match="Cannot add dtypes"):
-        (scalars_df[columns] + 1).to_pandas()
-
-
-@pytest.mark.parametrize(
-    ("op"),
-    [
-        (lambda x, y: x.add(y, axis="index")),
-        (lambda x, y: x.radd(y, axis="index")),
-        (lambda x, y: x.sub(y, axis="index")),
-        (lambda x, y: x.rsub(y, axis="index")),
-        (lambda x, y: x.mul(y, axis="index")),
-        (lambda x, y: x.rmul(y, axis="index")),
-        (lambda x, y: x.truediv(y, axis="index")),
-        (lambda x, y: x.rtruediv(y, axis="index")),
-        (lambda x, y: x.floordiv(y, axis="index")),
-        (lambda x, y: x.rfloordiv(y, axis="index")),
-        (lambda x, y: x.gt(y, axis="index")),
-        (lambda x, y: x.ge(y, axis="index")),
-        (lambda x, y: x.lt(y, axis="index")),
-        (lambda x, y: x.le(y, axis="index")),
-    ],
-    ids=[
-        "add",
-        "radd",
-        "sub",
-        "rsub",
-        "mul",
-        "rmul",
-        "truediv",
-        "rtruediv",
-        "floordiv",
-        "rfloordiv",
-        "gt",
-        "ge",
-        "lt",
-        "le",
-    ],
-)
-def test_series_binop_axis_index(
-    scalars_dfs,
-    op,
-):
-    scalars_df, scalars_pandas_df = scalars_dfs
-    df_columns = ["int64_col", "float64_col"]
-    series_column = "int64_too"
-
-    bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas()
-    pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column])
-
-    assert_pandas_df_equal(bf_result, pd_result)
-
-
-@pytest.mark.parametrize(
-    ("input"),
-    [
-        ((1000, 2000, 3000)),
-        (pd.Index([1000, 2000, 3000])),
-        (pd.Series((1000, 2000), index=["int64_too", "float64_col"])),
-    ],
-    ids=[
-        "tuple",
-        "pd_index",
-        "pd_series",
-    ],
-)
-def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input):
-    # TODO: supply a reason why this isn't compatible with pandas 1.x
-    pytest.importorskip("pandas", minversion="2.0.0")
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    df_columns = ["int64_col", "float64_col", "int64_too"]
-
-    bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas()
-    if hasattr(input, "to_pandas"):
-        input = input.to_pandas()
-    pd_result = scalars_pandas_df[df_columns].add(input, axis=1)
-
-    assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
-
-
-def test_df_reverse_binop_pandas(scalars_dfs):
-    # TODO: supply a reason why this isn't compatible with pandas 1.x
-    pytest.importorskip("pandas", minversion="2.0.0")
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    pd_series = pd.Series([100, 200, 300])
-
-    df_columns = ["int64_col", "float64_col", "int64_too"]
-
-    bf_result = pd_series + scalars_df[df_columns].to_pandas()
-    pd_result = pd_series + scalars_pandas_df[df_columns]
-
-    assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
-
-
-def test_listlike_binop_axis_1_bf_index(scalars_dfs):
-    scalars_df, scalars_pandas_df = scalars_dfs
-
-    df_columns = ["int64_col", "float64_col", "int64_too"]
-
-    bf_result = (
-        scalars_df[df_columns]
-        .add(bf_indexes.Index([1000, 2000, 3000]), axis=1)
-        .to_pandas()
-    )
-    pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1)
-
-    assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
-
-
-def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered):
-    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
-
-    df_columns = ["int64_col", "float64_col", "int64_too"]
"float64_col", "int64_too"] - - # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = scalars_df._session._metrics.execution_count - bf_df = scalars_df[df_columns] - bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = scalars_df._session._metrics.execution_count - - pd_df = scalars_pandas_df[df_columns] - pd_result = pd_df - pd_df.mean() - - executions = execution_count_after - execution_count_before - - assert executions == 1 - assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) - - -def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered): - scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered - - df_columns = ["int64_col", "float64_col", "int64_too"] - - # Ensure that this takes the optimized single-query path by counting executions - execution_count_before = scalars_df._session._metrics.execution_count - bf_df = scalars_df[df_columns].reset_index(drop=True) - bf_result = (bf_df - bf_df.mean()).to_pandas() - execution_count_after = scalars_df._session._metrics.execution_count - - pd_df = scalars_pandas_df[df_columns].reset_index(drop=True) - pd_result = pd_df - pd_df.mean() - - executions = execution_count_after - execution_count_before - - assert executions == 1 - pd_result.index = pd_result.index.astype("Int64") - assert_pandas_df_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("left_labels", "right_labels"), - [ - (["a", "a", "b"], ["c", "c", "d"]), - (["a", "b", "c"], ["c", "a", "b"]), - (["a", "c", "c"], ["c", "a", "c"]), - (["a", "b", "c"], ["a", "b", "c"]), - ], - ids=[ - "no_overlap", - "one_one_match", - "multi_match", - "exact_match", - ], -) -def test_binop_df_df_binary_op( - scalars_df_index, - scalars_df_2_index, - scalars_pandas_df_index, - left_labels, - right_labels, -): - if pd.__version__.startswith("1."): - pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") - columns = ["int64_too", "int64_col", "float64_col"] - - bf_df_a = scalars_df_index[columns] - bf_df_a.columns = left_labels - bf_df_b = scalars_df_2_index[columns] - bf_df_b.columns = right_labels - bf_result = (bf_df_a - bf_df_b).to_pandas() - - pd_df_a = scalars_pandas_df_index[columns] - pd_df_a.columns = left_labels - pd_df_b = scalars_pandas_df_index[columns] - pd_df_b.columns = right_labels - pd_result = pd_df_a - pd_df_b - - # Some dtype inconsistency for all-NULL columns - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -# Differnt table will only work for explicit index, since default index orders are arbitrary. 
-@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_series_binop_add_different_table( - scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered -): - df_columns = ["int64_col", "float64_col"] - series_column = "int64_too" - - bf_result = ( - scalars_df_index[df_columns] - .add(scalars_df_2_index[series_column], axis="index") - .to_pandas(ordered=ordered) - ) - pd_result = scalars_pandas_df_index[df_columns].add( - scalars_pandas_df_index[series_column], axis="index" - ) - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -# TODO(garrettwu): Test series binop with different index - -all_joins = pytest.mark.parametrize( - ("how",), - (("outer",), ("left",), ("right",), ("inner",), ("cross",)), -) - - -@all_joins -def test_join_same_table(scalars_dfs_maybe_ordered, how): - bf_df, pd_df = scalars_dfs_maybe_ordered - - bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] - bf_df_a = bf_df_a.sort_index() - - bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] - bf_df_b = bf_df_b[bf_df_b.float64_col > 0] - bf_df_b = bf_df_b.sort_values("float64_col") - - bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() - - pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]].sort_index() - pd_df_a = pd_df_a.sort_index() - - pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] - pd_df_b = pd_df_b[pd_df_b.float64_col > 0] - pd_df_b = pd_df_b.sort_values("float64_col") - - pd_result = pd_df_a.join(pd_df_b, how=how) - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -def test_join_incompatible_key_type_error(scalars_dfs): - bf_df, _ = scalars_dfs - - bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] - bf_df_a = bf_df_a.sort_index() - - bf_df_b = bf_df.set_index("date_col")[["float64_col"]] - bf_df_b = bf_df_b[bf_df_b.float64_col > 0] - bf_df_b = bf_df_b.sort_values("float64_col") - - with pytest.raises(TypeError): - # joining incompatible date, int columns - bf_df_a.join(bf_df_b, how="left") - - -@all_joins -def test_join_different_table( - scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how -): - bf_df_a = scalars_df_index[["string_col", "int64_col"]] - bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] - bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() - pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] - pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] - pd_result = pd_df_a.join(pd_df_b, how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -@all_joins -def test_join_different_table_with_duplicate_column_name( - scalars_df_index, scalars_pandas_df_index, how -): - bf_df_a = scalars_df_index[["string_col", "int64_col", "int64_too"]].rename( - columns={"int64_too": "int64_col"} - ) - bf_df_b = scalars_df_index.dropna()[ - ["string_col", "int64_col", "int64_too"] - ].rename(columns={"int64_too": "int64_col"}) - bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas() - pd_df_a = scalars_pandas_df_index[["string_col", "int64_col", "int64_too"]].rename( - columns={"int64_too": "int64_col"} - ) - pd_df_b = scalars_pandas_df_index.dropna()[ - ["string_col", "int64_col", "int64_too"] - ].rename(columns={"int64_too": "int64_col"}) - pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r") - - # Ensure no inplace changes - pd.testing.assert_index_equal(bf_df_a.columns, pd_df_a.columns) - pd.testing.assert_index_equal(bf_df_b.index.to_pandas(), pd_df_b.index) 
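The duplicate-column-name join tests above hinge on pandas' suffix handling. A minimal sketch in plain pandas: overlapping column labels must be disambiguated with lsuffix/rsuffix, otherwise join raises a ValueError.

    import pandas as pd

    left = pd.DataFrame({"v": [1, 2]})
    right = pd.DataFrame({"v": [3, 4]})
    # Without suffixes this would raise "columns overlap but no suffix specified".
    print(left.join(right, lsuffix="_l", rsuffix="_r"))  # columns v_l and v_r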
- pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -@all_joins -def test_join_param_on_with_duplicate_column_name_not_on_col( - scalars_df_index, scalars_pandas_df_index, how -): - # This test is for duplicate column names, but the 'on' column is not duplicated. - if how == "cross": - return - bf_df_a = scalars_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_df_b = scalars_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_result = bf_df_a.join( - bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ).to_pandas() - pd_df_a = scalars_pandas_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_df_b = scalars_pandas_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_result = pd_df_a.join( - pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ) - pd.testing.assert_frame_equal( - bf_result.sort_index(), - pd_result.sort_index(), - check_like=True, - check_index_type=False, - check_names=False, - ) - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - - -@pytest.mark.skipif( - pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" -) -@all_joins -def test_join_param_on_with_duplicate_column_name_on_col( - scalars_df_index, scalars_pandas_df_index, how -): - # This test is for duplicate column names, and the 'on' column is duplicated. - if how == "cross": - return - bf_df_a = scalars_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_df_b = scalars_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - bf_result = bf_df_a.join( - bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ).to_pandas() - pd_df_a = scalars_pandas_df_index[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_df_b = scalars_pandas_df_index.dropna()[ - ["string_col", "datetime_col", "timestamp_col", "int64_too"] - ].rename(columns={"timestamp_col": "datetime_col"}) - pd_result = pd_df_a.join( - pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" - ) - pd.testing.assert_frame_equal( - bf_result.sort_index(), - pd_result.sort_index(), - check_like=True, - check_index_type=False, - check_names=False, - ) - pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) - - -@all_joins -def test_join_param_on(scalars_dfs, how): - bf_df, pd_df = scalars_dfs - - bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] - bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) - bf_df_b = bf_df[["float64_col"]] - - if how == "cross": - with pytest.raises(ValueError): - bf_df_a.join(bf_df_b, on="rowindex_2", how=how) - else: - bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() - - pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] - pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) - pd_df_b = pd_df[["float64_col"]] - pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -@all_joins -def test_df_join_series(scalars_dfs, how): - bf_df, pd_df = scalars_dfs - - 
bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] - bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) - bf_series_b = bf_df["float64_col"] - - if how == "cross": - with pytest.raises(ValueError): - bf_df_a.join(bf_series_b, on="rowindex_2", how=how) - else: - bf_result = bf_df_a.join(bf_series_b, on="rowindex_2", how=how).to_pandas() - - pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] - pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) - pd_series_b = pd_df["float64_col"] - pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) - assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) - - -@pytest.mark.parametrize( - ("by", "ascending", "na_position"), - [ - ("int64_col", True, "first"), - (["bool_col", "int64_col"], True, "last"), - ("int64_col", False, "first"), - (["bool_col", "int64_col"], [False, True], "last"), - (["bool_col", "int64_col"], [True, False], "first"), - ], -) -def test_dataframe_sort_values( - scalars_df_index, scalars_pandas_df_index, by, ascending, na_position -): - # Test needs values to be unique - bf_result = scalars_df_index.sort_values( - by, ascending=ascending, na_position=na_position - ).to_pandas() - pd_result = scalars_pandas_df_index.sort_values( - by, ascending=ascending, na_position=na_position - ) - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("by", "ascending", "na_position"), - [ - ("int64_col", True, "first"), - (["bool_col", "int64_col"], True, "last"), - ], -) -def test_dataframe_sort_values_inplace( - scalars_df_index, scalars_pandas_df_index, by, ascending, na_position -): - # Test needs values to be unique - bf_sorted = scalars_df_index.copy() - bf_sorted.sort_values( - by, ascending=ascending, na_position=na_position, inplace=True - ) - bf_result = bf_sorted.to_pandas() - pd_result = scalars_pandas_df_index.sort_values( - by, ascending=ascending, na_position=na_position - ) - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_dataframe_sort_values_invalid_input(scalars_df_index): - with pytest.raises(KeyError): - scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) - - -def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): - bf_result = ( - scalars_df_index.sort_values("int64_col", kind="stable") - .sort_values("bool_col", kind="stable") - .to_pandas() - ) - pd_result = scalars_pandas_df_index.sort_values( - "int64_col", kind="stable" - ).sort_values("bool_col", kind="stable") - - pandas.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("operator", "columns"), - [ - pytest.param(lambda x: x.cumsum(), ["float64_col", "int64_too"]), - pytest.param(lambda x: x.cumprod(), ["float64_col", "int64_too"]), - pytest.param( - lambda x: x.cumprod(), - ["string_col"], - marks=pytest.mark.xfail( - raises=ValueError, - ), - ), - ], - ids=[ - "cumsum", - "cumprod", - "non-numeric", - ], -) -def test_dataframe_numeric_analytic_op( - scalars_df_index, scalars_pandas_df_index, operator, columns -): - # TODO: Add nullable ints (pandas 1.x has poor behavior on these) - bf_series = operator(scalars_df_index[columns]) - pd_series = operator(scalars_pandas_df_index[columns]) - bf_result = bf_series.to_pandas() - pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("operator"), - [ - (lambda x: x.cummin()), - (lambda x: x.cummax()), - (lambda x: x.shift(2)), - (lambda x: x.shift(-2)), - ], - 
ids=[
- "cummin",
- "cummax",
- "shiftpositive",
- "shiftnegative",
- ],
-)
-def test_dataframe_general_analytic_op(
- scalars_df_index, scalars_pandas_df_index, operator
-):
- col_names = ["int64_too", "float64_col", "int64_col", "bool_col"]
- bf_series = operator(scalars_df_index[col_names])
- pd_series = operator(scalars_pandas_df_index[col_names])
- bf_result = bf_series.to_pandas()
- pd.testing.assert_frame_equal(
- pd_series,
- bf_result,
- )
-
-
-@pytest.mark.parametrize(
- ("periods",),
- [
- (1,),
- (2,),
- (-1,),
- ],
-)
-def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods):
- col_names = ["int64_too", "float64_col", "int64_col"]
- bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas()
- pd_result = scalars_pandas_df_index[col_names].diff(periods=periods)
- pd.testing.assert_frame_equal(
- pd_result,
- bf_result,
- )
-
-
-@pytest.mark.parametrize(
- ("periods",),
- [
- (1,),
- (2,),
- (-1,),
- ],
-)
-def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
- col_names = ["int64_too", "float64_col", "int64_col"]
- bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas()
- pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods)
- pd.testing.assert_frame_equal(
- pd_result,
- bf_result,
- )
-
-
-def test_dataframe_agg_single_string(scalars_dfs):
- numeric_cols = ["int64_col", "int64_too", "float64_col"]
- scalars_df, scalars_pandas_df = scalars_dfs
-
- bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
- pd_result = scalars_pandas_df[numeric_cols].agg("sum")
-
- assert bf_result.dtype == "Float64"
- pd.testing.assert_series_equal(
- pd_result, bf_result, check_dtype=False, check_index_type=False
- )
-
-
-@pytest.mark.parametrize(
- ("agg",),
- (
- ("sum",),
- ("size",),
- ),
-)
-def test_dataframe_agg_int_single_string(scalars_dfs, agg):
- numeric_cols = ["int64_col", "int64_too", "bool_col"]
- scalars_df, scalars_pandas_df = scalars_dfs
-
- bf_result = scalars_df[numeric_cols].agg(agg).to_pandas()
- pd_result = scalars_pandas_df[numeric_cols].agg(agg)
-
- assert bf_result.dtype == "Int64"
- pd.testing.assert_series_equal(
- pd_result, bf_result, check_dtype=False, check_index_type=False
- )
-
-
-def test_dataframe_agg_multi_string(scalars_dfs_maybe_ordered):
- numeric_cols = ["int64_col", "int64_too", "float64_col"]
- aggregations = [
- "sum",
- "mean",
- "median",
- "std",
- "var",
- "min",
- "max",
- "nunique",
- "count",
- ]
- scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
- bf_result = scalars_df[numeric_cols].agg(aggregations)
- pd_result = scalars_pandas_df[numeric_cols].agg(aggregations)
-
- # Pandas may produce narrower numeric types, but bigframes always produces Float64
- pd_result = pd_result.astype("Float64")
-
- # Drop median, as it's an approximation.
- bf_median = bf_result.loc["median", :]
- bf_result = bf_result.drop(labels=["median"])
- pd_result = pd_result.drop(labels=["median"])
-
- assert_dfs_equivalent(pd_result, bf_result, check_index_type=False)
-
- # Double-check that median is at least plausible. 
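- # BigQuery DataFrames computes median approximately, so only a range check is meaningful: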
- assert ( - (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) - ).all() - - -def test_dataframe_agg_int_multi_string(scalars_dfs): - numeric_cols = ["int64_col", "int64_too", "bool_col"] - aggregations = [ - "sum", - "nunique", - "count", - "size", - ] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() - pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) - - for dtype in bf_result.dtypes: - assert dtype == "Int64" - - # Pandas may produce narrower numeric types - # Pandas has object index type - pd.testing.assert_frame_equal( - pd_result, bf_result, check_dtype=False, check_index_type=False - ) - - -def test_df_transpose(): - # Include some floats to ensure type coercion - values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] - # Test complex case of both axes being multi-indices with non-unique elements - - columns: pandas.Index = pd.Index( - ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") - ) - columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) - - index: pandas.Index = pd.Index( - ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") - ) - rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) - - pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) - bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) - - pd_result = pd_df.T - bf_result = bf_df.T.to_pandas() - - pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) - - -def test_df_transpose_error(): - with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): - dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() - - -def test_df_transpose_repeated_uses_cache(): - bf_df = dataframe.DataFrame([[1, 2.5], [2, 3.5]]) - pd_df = pandas.DataFrame([[1, 2.5], [2, 3.5]]) - # Transposing many times so that operation will fail from complexity if not using cache - for i in range(10): - # Cache still works even with simple scalar binop - bf_df = bf_df.transpose() + i - pd_df = pd_df.transpose() + i - - pd.testing.assert_frame_equal( - pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_df_stack(scalars_dfs, ordered): - if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): - pytest.skip("pandas <2.1 uses different stack implementation") - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - columns = ["int64_col", "int64_too", "rowindex_2"] - - bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered) - pd_result = scalars_pandas_df[columns].stack(future_stack=True) - - # Pandas produces NaN, where bq dataframes produces pd.NA - assert_series_equal( - bf_result, pd_result, check_dtype=False, ignore_order=not ordered - ) - - -def test_df_melt_default(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - columns = ["int64_col", "int64_too", "rowindex_2"] - - bf_result = scalars_df[columns].melt().to_pandas() - pd_result = 
scalars_pandas_df[columns].melt() - - # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - check_index_type=False, - check_dtype=False, - ) - - -def test_df_melt_parameterized(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - - bf_result = scalars_df.melt( - var_name="alice", - value_name="bob", - id_vars=["string_col"], - value_vars=["int64_col", "int64_too"], - ).to_pandas() - pd_result = scalars_pandas_df.melt( - var_name="alice", - value_name="bob", - id_vars=["string_col"], - value_vars=["int64_col", "int64_too"], - ) - - # Pandas produces int64 index, Bigframes produces Int64 (nullable) - pd.testing.assert_frame_equal( - bf_result, pd_result, check_index_type=False, check_dtype=False - ) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_df_unstack(scalars_dfs, ordered): - scalars_df, scalars_pandas_df = scalars_dfs - # To match bigquery dataframes - scalars_pandas_df = scalars_pandas_df.copy() - scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") - # Can only stack identically-typed columns - columns = [ - "rowindex_2", - "int64_col", - "int64_too", - ] - - # unstack on mono-index produces series - bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) - pd_result = scalars_pandas_df[columns].unstack() - - # Pandas produces NaN, where bq dataframes produces pd.NA - assert_series_equal( - bf_result, pd_result, check_dtype=False, ignore_order=not ordered - ) - - -@pytest.mark.parametrize( - ("values", "index", "columns"), - [ - ("int64_col", "int64_too", ["string_col"]), - (["int64_col"], "int64_too", ["string_col"]), - (["int64_col", "float64_col"], "int64_too", ["string_col"]), - ], -) -def test_df_pivot(scalars_dfs, values, index, columns): - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.pivot( - values=values, index=index, columns=columns - ).to_pandas() - pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns) - - # Pandas produces NaN, where bq dataframes produces pd.NA - bf_result = bf_result.fillna(float("nan")) - pd_result = pd_result.fillna(float("nan")) - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("values", "index", "columns"), - [ - (["goals", "assists"], ["team_name", "season"], ["position"]), - (["goals", "assists"], ["season"], ["team_name", "position"]), - ], -) -def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): - bf_result = ( - hockey_df.reset_index() - .pivot(values=values, index=index, columns=columns) - .to_pandas() - ) - pd_result = hockey_pandas_df.reset_index().pivot( - values=values, index=index, columns=columns - ) - - # Pandas produces NaN, where bq dataframes produces pd.NA - pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) - - -@pytest.mark.parametrize( - ("values", "index", "columns", "aggfunc"), - [ - (("culmen_length_mm", "body_mass_g"), "species", "sex", "std"), - (["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"), - ("body_mass_g", "sex", ["island", "species"], "mean"), - ("culmen_depth_mm", "island", "species", "max"), - ], -) -def test_df_pivot_table( - penguins_df_default_index, - 
penguins_pandas_df_default_index, - values, - index, - columns, - aggfunc, -): - bf_result = penguins_df_default_index.pivot_table( - values=values, index=index, columns=columns, aggfunc=aggfunc - ).to_pandas() - pd_result = penguins_pandas_df_default_index.pivot_table( - values=values, index=index, columns=columns, aggfunc=aggfunc - ) - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_column_type=False - ) - - -def test_ipython_key_completions_with_drop(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = "string_col" - bf_dataframe = scalars_df.drop(columns=col_names) - pd_dataframe = scalars_pandas_df.drop(columns=col_names) - expected = pd_dataframe.columns.tolist() - - results = bf_dataframe._ipython_key_completions_() - - assert col_names not in results - assert results == expected - # _ipython_key_completions_ is called with square brackets - # so only column names are relevant with tab completion - assert "to_gbq" not in results - assert "merge" not in results - assert "drop" not in results - - -def test_ipython_key_completions_with_rename(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"string_col": "a_renamed_column"} - bf_dataframe = scalars_df.rename(columns=col_name_dict) - pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) - expected = pd_dataframe.columns.tolist() - - results = bf_dataframe._ipython_key_completions_() - - assert "string_col" not in results - assert "a_renamed_column" in results - assert results == expected - # _ipython_key_completions_ is called with square brackets - # so only column names are relevant with tab completion - assert "to_gbq" not in results - assert "merge" not in results - assert "drop" not in results - - -def test__dir__with_drop(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_names = "string_col" - bf_dataframe = scalars_df.drop(columns=col_names) - pd_dataframe = scalars_pandas_df.drop(columns=col_names) - expected = pd_dataframe.columns.tolist() - - results = dir(bf_dataframe) - - assert col_names not in results - assert frozenset(expected) <= frozenset(results) - # __dir__ is called with a '.' and displays all methods, columns names, etc. - assert "to_gbq" in results - assert "merge" in results - assert "drop" in results - - -def test__dir__with_rename(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - col_name_dict = {"string_col": "a_renamed_column"} - bf_dataframe = scalars_df.rename(columns=col_name_dict) - pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) - expected = pd_dataframe.columns.tolist() - - results = dir(bf_dataframe) - - assert "string_col" not in results - assert "a_renamed_column" in results - assert frozenset(expected) <= frozenset(results) - # __dir__ is called with a '.' and displays all methods, columns names, etc. 
- assert "to_gbq" in results - assert "merge" in results - assert "drop" in results - - -def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() - pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("start", "stop", "step"), - [ - (0, 0, None), - (None, None, None), - (1, None, None), - (None, 4, None), - (None, None, 2), - (None, 50000000000, 1), - (5, 4, None), - (3, None, 2), - (1, 7, 2), - (1, 7, 50000000000), - ], -) -def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): - bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() - pd_result = scalars_pandas_df_index.iloc[start:stop:step] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - ("start", "stop", "step"), - [ - (0, 0, None), - ], -) -def test_iloc_slice_after_cache( - scalars_df_index, scalars_pandas_df_index, start, stop, step -): - scalars_df_index.cache() - bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() - pd_result = scalars_pandas_df_index.iloc[start:stop:step] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_iloc_slice_zero_step(scalars_df_index): - with pytest.raises(ValueError): - scalars_df_index.iloc[0:0:0] - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): - bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) - pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] - - assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) - - -@pytest.mark.parametrize( - "index", - [0, 5, -2, (2,)], -) -def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iloc[index] - pd_result = scalars_pandas_df_index.iloc[index] - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -@pytest.mark.parametrize( - "index", - [(2, 5), (5, 0), (0, 0)], -) -def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iloc[index] - pd_result = scalars_pandas_df_index.iloc[index] - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - "index", - [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], -) -def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): - bf_result = scalars_df_index.iloc[index].to_pandas() - pd_result = scalars_pandas_df_index.iloc[index] - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): - index = (2, [2, 1, 3, -4]) - bf_result = scalars_df_index.iloc[index] - pd_result = scalars_pandas_df_index.iloc[index] - pd.testing.assert_series_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("index", "error"), - [ - ((1, 1, 1), pd.errors.IndexingError), - (("asd", "asd", "asd"), pd.errors.IndexingError), - (("asd"), TypeError), - ], -) -def test_iloc_tuple_errors(scalars_df_index, scalars_pandas_df_index, index, error): - with pytest.raises(error): - scalars_df_index.iloc[index] - with pytest.raises(error): - scalars_pandas_df_index.iloc[index] - - -@pytest.mark.parametrize( - "index", - [(2, 5), (5, 0), (0, 0)], -) -def test_iat(scalars_df_index, scalars_pandas_df_index, index): - 
bf_result = scalars_df_index.iat[index] - pd_result = scalars_pandas_df_index.iat[index] - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("index", "error"), - [ - (0, TypeError), - ("asd", ValueError), - ((1, 2, 3), TypeError), - (("asd", "asd"), ValueError), - ], -) -def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): - with pytest.raises(error): - scalars_pandas_df_index.iat[index] - with pytest.raises(error): - scalars_df_index.iat[index] - - -def test_iloc_single_integer_out_of_bound_error(scalars_df_index): - with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): - scalars_df_index.iloc[99] - - -def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() - pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): - idx_list = [0, 3, 5] - bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() - pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() - pd_result = scalars_pandas_df_index.loc[:, "int64_col"] - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() - pd_result = scalars_pandas_df_index.loc[ - :, scalars_pandas_df_index.dtypes == "Int64" - ] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_select_with_column_condition_bf_series( - scalars_df_index, scalars_pandas_df_index -): - # (b/347072677) GEOGRAPH type doesn't support DISTINCT op - columns = [ - item for item in scalars_pandas_df_index.columns if item != "geography_col" - ] - scalars_df_index = scalars_df_index[columns] - scalars_pandas_df_index = scalars_pandas_df_index[columns] - - size_half = len(scalars_pandas_df_index) / 2 - bf_result = scalars_df_index.loc[ - :, scalars_df_index.nunique() > size_half - ].to_pandas() - pd_result = scalars_pandas_df_index.loc[ - :, scalars_pandas_df_index.nunique() > size_half - ] - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("string_col", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index( - "string_col", drop=False - ) - index = "Hello, World!" 
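- # A label that appears more than once makes loc return a DataFrame, not a Series.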
- bf_result = scalars_df_index.loc[index] - pd_result = scalars_pandas_df_index.loc[index] - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) - index = -2345 - bf_result = scalars_df_index.loc[index] - pd_result = scalars_pandas_df_index.loc[index] - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("string_col", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index( - "string_col", drop=False - ) - index = "Hello, World!" - bf_result = scalars_df_index.at[index, "int64_too"] - pd_result = scalars_pandas_df_index.at[index, "int64_too"] - pd.testing.assert_series_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_at_no_duplicate(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) - scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) - index = -2345 - bf_result = scalars_df_index.at[index, "string_col"] - pd_result = scalars_pandas_df_index.at[index, "string_col"] - assert bf_result == pd_result - - -def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 - pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 - - # pandas uses float64 instead - pd_df["new_col"] = pd_df["new_col"].astype("Float64") - - pd.testing.assert_frame_equal( - bf_df.to_pandas(), - pd_df, - ) - - -@pytest.mark.parametrize( - ("col", "value"), - [ - ("string_col", "hello"), - ("int64_col", 3), - ("float64_col", 3.5), - ], -) -def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): - if pd.__version__.startswith("1."): - pytest.skip("this loc overload not supported in pandas 1.x.") - - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["int64_too"] == 1, col] = value - pd_df.loc[pd_df["int64_too"] == 1, col] = value - - pd.testing.assert_frame_equal( - bf_df.to_pandas(), - pd_df, - ) - - -def test_loc_setitem_bool_series_scalar_error(scalars_dfs): - if pd.__version__.startswith("1."): - pytest.skip("this loc overload not supported in pandas 1.x.") - - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - - with pytest.raises(Exception): - bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 - with pytest.raises(Exception): - pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 - - -@pytest.mark.parametrize( - ("col", "op"), - [ - # Int aggregates - pytest.param("int64_col", lambda x: x.sum(), id="int-sum"), - pytest.param("int64_col", lambda x: x.min(), id="int-min"), - pytest.param("int64_col", lambda x: x.max(), id="int-max"), - pytest.param("int64_col", lambda x: x.count(), id="int-count"), - pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"), - # Float aggregates - pytest.param("float64_col", lambda x: x.count(), id="float-count"), - pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"), - # Bool aggregates - pytest.param("bool_col", lambda x: 
x.sum(), id="bool-sum"), - pytest.param("bool_col", lambda x: x.count(), id="bool-count"), - pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"), - # String aggregates - pytest.param("string_col", lambda x: x.count(), id="string-count"), - pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"), - ], -) -def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op): - bf_result = op(scalars_df_index[[col]]).to_pandas() - pd_result = op(scalars_pandas_df_index[[col]]) - - # Check dtype separately - assert bf_result.dtype == "Int64" - # Is otherwise "object" dtype - pd_result.index = pd_result.index.astype("string[pyarrow]") - # Pandas may produce narrower numeric types - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) - - -@pytest.mark.parametrize( - ("col", "op"), - [ - pytest.param("bool_col", lambda x: x.min(), id="bool-min"), - pytest.param("bool_col", lambda x: x.max(), id="bool-max"), - ], -) -def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op): - bf_result = op(scalars_df_index[[col]]).to_pandas() - pd_result = op(scalars_pandas_df_index[[col]]) - - # Check dtype separately - assert bf_result.dtype == "boolean" - - # Pandas may produce narrower numeric types - # Pandas has object index type - pd_result.index = pd_result.index.astype("string[pyarrow]") - assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) - - -@pytest.mark.parametrize( - ("op", "bf_dtype"), - [ - (lambda x: x.sum(numeric_only=True), "Float64"), - (lambda x: x.mean(numeric_only=True), "Float64"), - (lambda x: x.min(numeric_only=True), "Float64"), - (lambda x: x.max(numeric_only=True), "Float64"), - (lambda x: x.std(numeric_only=True), "Float64"), - (lambda x: x.var(numeric_only=True), "Float64"), - (lambda x: x.count(numeric_only=False), "Int64"), - (lambda x: x.nunique(), "Int64"), - ], - ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], -) -def test_dataframe_aggregates(scalars_dfs_maybe_ordered, op, bf_dtype): - scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered - col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] - bf_series = op(scalars_df_index[col_names]) - bf_result = bf_series - pd_result = op(scalars_pandas_df_index[col_names]) - - # Check dtype separately - assert bf_result.dtype == bf_dtype - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - # Pandas has object index type - pd_result.index = pd_result.index.astype("string[pyarrow]") - assert_series_equivalent( - pd_result, - bf_result, - check_dtype=False, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - ("op"), - [ - (lambda x: x.sum(axis=1, numeric_only=True)), - (lambda x: x.mean(axis=1, numeric_only=True)), - (lambda x: x.min(axis=1, numeric_only=True)), - (lambda x: x.max(axis=1, numeric_only=True)), - (lambda x: x.std(axis=1, numeric_only=True)), - (lambda x: x.var(axis=1, numeric_only=True)), - ], - ids=["sum", "mean", "min", "max", "std", "var"], -) -def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): - col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] - bf_result = op(scalars_df_index[col_names]).to_pandas() - pd_result = op(scalars_pandas_df_index[col_names]) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - # Pandas has object index 
type - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): - col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] - bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() - pd_result = scalars_pandas_df_index[col_names].agg(["min", "max"]) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - # Median is an approximation, but double-check that median is plausible. - for col in col_names: - assert (pd_result.loc["min", col] <= bf_result[col]) and ( - bf_result[col] <= pd_result.loc["max", col] - ) - - -def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): - q = 0.45 - col_names = ["int64_too", "int64_col", "float64_col"] - bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() - pd_result = scalars_pandas_df_index[col_names].quantile(q=q) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) - - -def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): - q = [0, 0.33, 0.67, 1.0] - col_names = ["int64_too", "int64_col", "float64_col"] - bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() - pd_result = scalars_pandas_df_index[col_names].quantile(q=q) - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - pd_result.index = pd_result.index.astype("Float64") - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("op"), - [ - (lambda x: x.all(bool_only=True)), - (lambda x: x.any(bool_only=True)), - (lambda x: x.all(axis=1, bool_only=True)), - (lambda x: x.any(axis=1, bool_only=True)), - ], - ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], -) -def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): - # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later - scalars_df_index = scalars_df_index.assign( - bool_col=scalars_df_index.bool_col.fillna(False) - ) - scalars_pandas_df_index = scalars_pandas_df_index.assign( - bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") - ) - bf_series = op(scalars_df_index) - pd_series = op(scalars_pandas_df_index).astype("boolean") - bf_result = bf_series.to_pandas() - - pd_series.index = pd_series.index.astype(bf_result.index.dtype) - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) - - -def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): - col_names = ["int64_too", "float64_col"] - bf_series = scalars_df_index[col_names].prod() - pd_series = scalars_pandas_df_index[col_names].prod() - bf_result = bf_series.to_pandas() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_series = pd_series.astype("Float64") - # Pandas has object index type - pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) - - -def test_df_skew_too_few_values(scalars_dfs): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].head(2).skew().to_pandas() - pd_result = scalars_pandas_df[columns].head(2).skew() - - # Pandas may produce narrower numeric types, but 
bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("ordered"), - [ - (True), - (False), - ], -) -def test_df_skew(scalars_dfs, ordered): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) - pd_result = scalars_pandas_df[columns].skew() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - assert_series_equal( - pd_result, bf_result, check_index_type=False, ignore_order=not ordered - ) - - -def test_df_kurt_too_few_values(scalars_dfs): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].head(2).kurt().to_pandas() - pd_result = scalars_pandas_df[columns].head(2).kurt() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -def test_df_kurt(scalars_dfs): - columns = ["float64_col", "int64_col"] - scalars_df, scalars_pandas_df = scalars_dfs - bf_result = scalars_df[columns].kurt().to_pandas() - pd_result = scalars_pandas_df[columns].kurt() - - # Pandas may produce narrower numeric types, but bigframes always produces Float64 - pd_result = pd_result.astype("Float64") - - pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) - - -@pytest.mark.parametrize( - ("frac", "n", "random_state"), - [ - (None, 4, None), - (0.5, None, None), - (None, 4, 10), - (0.5, None, 10), - (None, None, None), - ], - ids=[ - "n_wo_random_state", - "frac_wo_random_state", - "n_w_random_state", - "frac_w_random_state", - "n_default", - ], -) -def test_sample(scalars_dfs, frac, n, random_state): - scalars_df, _ = scalars_dfs - df = scalars_df.sample(frac=frac, n=n, random_state=random_state) - bf_result = df.to_pandas() - - n = 1 if n is None else n - expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n - assert bf_result.shape[0] == expected_sample_size - assert bf_result.shape[1] == scalars_df.shape[1] - - -def test_sample_determinism(penguins_df_default_index): - df = penguins_df_default_index.sample(n=100, random_state=12345).head(15) - bf_result = df.to_pandas() - bf_result2 = df.to_pandas() - - pandas.testing.assert_frame_equal(bf_result, bf_result2) - - -def test_sample_raises_value_error(scalars_dfs): - scalars_df, _ = scalars_dfs - with pytest.raises( - ValueError, match="Only one of 'n' or 'frac' parameter can be specified." 
- ):
- scalars_df.sample(frac=0.5, n=4)
-
-
-def test_sample_args_sort(scalars_dfs):
- scalars_df, _ = scalars_dfs
- index = [4, 3, 2, 5, 1, 0]
- scalars_df = scalars_df.iloc[index]
-
- kwargs = {"frac": 1.0, "random_state": 333}
-
- # Index comparisons are elementwise, so reduce with any()/all() to get a single truth value.
- df = scalars_df.sample(**kwargs).to_pandas()
- assert (df.index.values != index).any()
- assert (df.index.values != sorted(index)).any()
-
- df = scalars_df.sample(sort="random", **kwargs).to_pandas()
- assert (df.index.values != index).any()
- assert (df.index.values != sorted(index)).any()
-
- df = scalars_df.sample(sort=True, **kwargs).to_pandas()
- assert (df.index.values == sorted(index)).all()
-
- df = scalars_df.sample(sort=False, **kwargs).to_pandas()
- assert (df.index.values == index).all()
-
-
-@pytest.mark.parametrize(
- ("axis",),
- [
- (None,),
- (0,),
- (1,),
- ],
-)
-def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis):
- if pd.__version__.startswith("1."):
- pytest.skip("add_prefix axis parameter not supported in pandas 1.x.")
- bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas()
-
- pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis)
-
- pd.testing.assert_frame_equal(
- bf_result,
- pd_result,
- check_index_type=False,
- )
-
-
-@pytest.mark.parametrize(
- ("axis",),
- [
- (0,),
- (1,),
- ],
-)
-def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis):
- if pd.__version__.startswith("1."):
- pytest.skip("add_suffix axis parameter not supported in pandas 1.x.")
- bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas()
-
- pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis)
-
- pd.testing.assert_frame_equal(
- bf_result,
- pd_result,
- check_index_type=False,
- )
-
-
-def test_df_astype_error_error(session):
- input = pd.DataFrame(["hello", "world", "3.11", "4000"])
- with pytest.raises(ValueError):
- session.read_pandas(input).astype("Float64", errors="bad_value")
-
-
-def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index):
- if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
- pytest.skip("pandas filter items behavior different pre-2.1")
- bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas()
-
- pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"])
- # Ignore column ordering as pandas orders columns differently depending on version
- pd.testing.assert_frame_equal(
- bf_result.sort_index(axis=1),
- pd_result.sort_index(axis=1),
- )
-
-
-def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index.filter(like="64_col").to_pandas()
-
- pd_result = scalars_pandas_df_index.filter(like="64_col")
-
- pd.testing.assert_frame_equal(
- bf_result,
- pd_result,
- )
-
-
-def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas()
-
- pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$")
-
- pd.testing.assert_frame_equal(
- bf_result,
- pd_result,
- )
-
-
-def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index):
- if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
- pytest.skip("pandas filter items behavior different pre-2.1")
- bf_result = scalars_df_index.filter(items=[5, 1, 3], axis=0).to_pandas()
-
- pd_result = scalars_pandas_df_index.filter(items=[5, 1, 3], axis=0)
-
- # Pandas uses int64 instead of Int64 (nullable) dtype. 
- pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - # Ignore ordering as pandas order differently depending on version - assert_pandas_df_equal( - bf_result, - pd_result, - ignore_order=True, - check_names=False, - ) - - -def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy().set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") - - bf_result = scalars_df_index.filter(like="ello", axis=0).to_pandas() - - pd_result = scalars_pandas_df_index.filter(like="ello", axis=0) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): - scalars_df_index = scalars_df_index.copy().set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") - - bf_result = scalars_df_index.filter(regex="^[GH].*", axis=0).to_pandas() - - pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_reindex_rows_list(scalars_dfs_maybe_ordered): - scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered - bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) - - pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - assert_dfs_equivalent( - pd_result, - bf_result, - ) - - -def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.reindex( - index=pd.Index([5, 1, 3, 99, 1], name="newname") - ).to_pandas() - - pd_result = scalars_pandas_df_index.reindex( - index=pd.Index([5, 1, 3, 99, 1], name="newname") - ) - - # Pandas uses int64 instead of Int64 (nullable) dtype. - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_reindex_nonunique(scalars_df_index): - with pytest.raises(ValueError): - # int64_too is non-unique - scalars_df_index.set_index("int64_too").reindex( - index=[5, 1, 3, 99, 1], validate=True - ) - - -def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.reindex( - columns=["not_a_col", "int64_col", "int64_too"] - ).to_pandas() - - pd_result = scalars_pandas_df_index.reindex( - columns=["not_a_col", "int64_col", "int64_too"] - ) - - # Pandas uses float64 as default for newly created empty column, bf uses Float64 - pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): - # First, make sure the two dataframes have the same columns in order. 
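- # Reindexing with that same column order should then be a no-op.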
- columns = ["int64_col", "int64_too"]
- bf = scalars_df_index[columns]
- pd_df = scalars_pandas_df_index[columns]
-
- bf_result = bf.reindex(columns=columns).to_pandas()
- pd_result = pd_df.reindex(columns=columns)
-
- pd.testing.assert_frame_equal(
- bf_result,
- pd_result,
- )
-
-
-def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index):
- unsupported = [
- "geography_col",
- ]
- scalars_df_index = scalars_df_index.drop(columns=unsupported)
- scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported)
-
- bf_result = scalars_df_index.equals(scalars_df_index)
- pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index)
-
- assert pd_result == bf_result
-
-
-def test_df_equals_series(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"])
- pd_result = scalars_pandas_df_index[["int64_col"]].equals(
- scalars_pandas_df_index["int64_col"]
- )
-
- assert pd_result == bf_result
-
-
-def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index):
- columns = ["int64_col", "int64_too"]
- scalars_df_index = scalars_df_index[columns]
- scalars_pandas_df_index = scalars_pandas_df_index[columns]
-
- bf_modified = scalars_df_index.copy()
- bf_modified = bf_modified.astype("Float64")
-
- pd_modified = scalars_pandas_df_index.copy()
- pd_modified = pd_modified.astype("Float64")
-
- bf_result = scalars_df_index.equals(bf_modified)
- pd_result = scalars_pandas_df_index.equals(pd_modified)
-
- assert pd_result == bf_result
-
-
-def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index):
- columns = ["int64_col", "int64_too"]
- scalars_df_index = scalars_df_index[columns]
- scalars_pandas_df_index = scalars_pandas_df_index[columns]
-
- bf_modified = scalars_df_index.copy()
- bf_modified["int64_col"] = bf_modified.int64_col + 1
-
- pd_modified = scalars_pandas_df_index.copy()
- pd_modified["int64_col"] = pd_modified.int64_col + 1
-
- bf_result = scalars_df_index.equals(bf_modified)
- pd_result = scalars_pandas_df_index.equals(pd_modified)
-
- assert pd_result == bf_result
-
-
-def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index):
- columns = ["int64_col", "int64_too"]
- more_columns = ["int64_col", "int64_too", "float64_col"]
-
- bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns])
- pd_result = scalars_pandas_df_index[columns].equals(
- scalars_pandas_df_index[more_columns]
- )
-
- assert pd_result == bf_result
-
-
-def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index):
- reindex_target_bf = scalars_df_index.reindex(
- columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
- )
- bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas()
-
- reindex_target_pd = scalars_pandas_df_index.reindex(
- columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1]
- )
- pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd)
-
- # Pandas uses int64 instead of Int64 (nullable) dtype. 
- pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - # Pandas uses float64 as default for newly created empty column, bf uses Float64 - pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_values(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.values - - pd_result = scalars_pandas_df_index.values - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( - pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False - ) - - -def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_numpy() - - pd_result = scalars_pandas_df_index.to_numpy() - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( - pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False - ) - - -def test_df___array__(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.__array__() - - pd_result = scalars_pandas_df_index.__array__() - # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe - pd.testing.assert_frame_equal( - pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False - ) - - -@pytest.mark.parametrize( - ("key",), - [ - ("hello",), - (2,), - ("int64_col",), - (None,), - ], -) -def test_df_contains(scalars_df_index, scalars_pandas_df_index, key): - bf_result = key in scalars_df_index - pd_result = key in scalars_pandas_df_index - - assert bf_result == pd_result - - -def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index): - # swapaxes is implemented in pandas but not in bigframes - with pytest.raises(AttributeError): - scalars_df_index.swapaxes() - - -def test_df_getattr_attribute_error(scalars_df_index): - with pytest.raises(AttributeError): - scalars_df_index.not_a_method() - - -def test_df_getattr_axes(): - df = dataframe.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - assert isinstance(df.index, bigframes.core.indexes.Index) - assert isinstance(df.columns, pandas.Index) - assert isinstance(df.my_column, series.Series) - - -def test_df_setattr_index(): - pd_df = pandas.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - bf_df = dataframe.DataFrame(pd_df) - - pd_df.index = pandas.Index([4, 5]) - bf_df.index = [4, 5] - - assert_pandas_df_equal( - pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False - ) - - -def test_df_setattr_columns(): - pd_df = pandas.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - bf_df = dataframe.DataFrame(pd_df) - - pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6])) - - bf_df.columns = pandas.Index([4, 5, 6]) - - assert_pandas_df_equal( - pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False - ) - - -def test_df_setattr_modify_column(): - pd_df = pandas.DataFrame( - [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] - ) - bf_df = dataframe.DataFrame(pd_df) - pd_df.my_column = [4, 5] - bf_df.my_column = [4, 5] - - assert_pandas_df_equal( - pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False - ) - - -def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): - index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values - - scalars_df_index = scalars_df_index.set_index("string_col") - 
scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") - - bf_result = scalars_df_index.loc[index_list].to_pandas() - pd_result = scalars_pandas_df_index.loc[index_list] - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): - index_list = [3, 2, 1, 3, 2, 1] - - bf_result = scalars_df_index.loc[index_list] - pd_result = scalars_pandas_df_index.loc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_list_multiindex(scalars_dfs_maybe_ordered): - scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered - scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) - scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( - ["string_col", "int64_col"] - ) - index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] - - bf_result = scalars_df_multiindex.loc[index_list] - pd_result = scalars_pandas_df_multiindex.loc[index_list] - - assert_dfs_equivalent( - pd_result, - bf_result, - ) - - -@pytest.mark.parametrize( - "index_list", - [ - [0, 1, 2, 3, 4, 4], - [0, 0, 0, 5, 4, 7, -2, -5, 3], - [-1, -2, -3, -4, -5, -5], - ], -) -def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): - bf_result = scalars_df_index.iloc[index_list] - pd_result = scalars_pandas_df_index.iloc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -@pytest.mark.parametrize( - "index_list", - [ - [0, 1, 2, 3, 4, 4], - [0, 0, 0, 5, 4, 7, -2, -5, 3], - [-1, -2, -3, -4, -5, -5], - ], -) -def test_iloc_list_partial_ordering( - scalars_df_partial_ordering, scalars_pandas_df_index, index_list -): - bf_result = scalars_df_partial_ordering.iloc[index_list] - pd_result = scalars_pandas_df_index.iloc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_iloc_list_multiindex(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - scalars_df = scalars_df.copy() - scalars_pandas_df = scalars_pandas_df.copy() - scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) - scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) - - index_list = [0, 0, 0, 5, 4, 7] - - bf_result = scalars_df.iloc[index_list] - pd_result = scalars_pandas_df.iloc[index_list] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): - - index_list: List[int] = [] - - bf_result = scalars_df_index.iloc[index_list] - pd_result = scalars_pandas_df_index.iloc[index_list] - - bf_result = bf_result.to_pandas() - assert bf_result.shape == pd_result.shape # types are known to be different - - -def test_rename_axis(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.rename_axis("newindexname") - pd_result = scalars_pandas_df_index.rename_axis("newindexname") - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.rename_axis((4,)) - pd_result = scalars_pandas_df_index.rename_axis((4,)) - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): - pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - bf_string_series = 
scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - - scalars_df_index = scalars_df_index.set_index("string_col") - scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") - - bf_result = scalars_df_index.loc[bf_string_series] - pd_result = scalars_pandas_df_index.loc[pd_string_series] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): - pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] - - scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) - scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( - ["string_col", "int64_col"] - ) - - bf_result = scalars_df_multiindex.loc[bf_string_series] - pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): - pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index - bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index - - bf_result = scalars_df_index.loc[bf_index] - pd_result = scalars_pandas_df_index.loc[pd_index] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -def test_loc_bf_index_integer_index_renamed_col( - scalars_df_index, scalars_pandas_df_index -): - scalars_df_index = scalars_df_index.rename(columns={"int64_col": "rename"}) - scalars_pandas_df_index = scalars_pandas_df_index.rename( - columns={"int64_col": "rename"} - ) - - pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index - bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index - - bf_result = scalars_df_index.loc[bf_index] - pd_result = scalars_pandas_df_index.loc[pd_index] - - pd.testing.assert_frame_equal( - bf_result.to_pandas(), - pd_result, - ) - - -@pytest.mark.parametrize( - ("subset"), - [ - None, - "bool_col", - ["bool_col", "int64_too"], - ], -) -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): - columns = ["bool_col", "int64_too", "int64_col"] - bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() - pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) - pd.testing.assert_frame_equal( - pd_df, - bf_df, - ) - - -@pytest.mark.parametrize( - ("keep",), - [ - ("first",), - ("last",), - (False,), - ], -) -def test_df_drop_duplicates_w_json(json_df, keep): - bf_df = json_df.drop_duplicates(keep=keep).to_pandas() - - # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible - # with Arrow string extension types. Temporary conversion to standard Pandas - # strings is required. 
- json_pandas_df = json_df.to_pandas()
- json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
- pd.StringDtype(storage="pyarrow")
- )
-
- pd_df = json_pandas_df.drop_duplicates(keep=keep)
- pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
- pd.testing.assert_frame_equal(
- pd_df,
- bf_df,
- )
-
-
-@pytest.mark.parametrize(
- ("subset"),
- [
- None,
- ["bool_col"],
- ],
-)
-@pytest.mark.parametrize(
- ("keep",),
- [
- ("first",),
- ("last",),
- (False,),
- ],
-)
-def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset):
- columns = ["bool_col", "int64_too", "int64_col"]
- bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas()
- pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep)
- pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False)
-
-
-def test_df_from_dict_columns_orient():
- data = {"a": [1, 2], "b": [3.3, 2.4]}
- bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas()
- pd_result = pd.DataFrame.from_dict(data, orient="columns")
- assert_pandas_df_equal(
- pd_result, bf_result, check_dtype=False, check_index_type=False
- )
-
-
-def test_df_from_dict_index_orient():
- data = {"a": [1, 2], "b": [3.3, 2.4]}
- bf_result = dataframe.DataFrame.from_dict(
- data, orient="index", columns=["col1", "col2"]
- ).to_pandas()
- pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"])
- assert_pandas_df_equal(
- pd_result, bf_result, check_dtype=False, check_index_type=False
- )
-
-
-def test_df_from_dict_tight_orient():
- data = {
- "index": [("i1", "i2"), ("i3", "i4")],
- "columns": ["col1", "col2"],
- "data": [[1, 2.6], [3, 4.5]],
- "index_names": ["in1", "in2"],
- "column_names": ["column_axis"],
- }
-
- bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas()
- pd_result = pd.DataFrame.from_dict(data, orient="tight")
- assert_pandas_df_equal(
- pd_result, bf_result, check_dtype=False, check_index_type=False
- )
-
-
-def test_df_from_records():
- records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d"))
-
- bf_result = dataframe.DataFrame.from_records(
- records, columns=["c1", "c2"]
- ).to_pandas()
- pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"])
- assert_pandas_df_equal(
- pd_result, bf_result, check_dtype=False, check_index_type=False
- )
-
-
-def test_df_to_dict(scalars_df_index, scalars_pandas_df_index):
- unsupported = ["numeric_col"] # formatted differently
- bf_result = scalars_df_index.drop(columns=unsupported).to_dict()
- pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict()
-
- assert bf_result == pd_result
-
-
-def test_df_to_excel(scalars_df_index, scalars_pandas_df_index):
- unsupported = ["timestamp_col"]
- with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
- scalars_df_index.drop(columns=unsupported).to_excel(bf_result_file)
- scalars_pandas_df_index.drop(columns=unsupported).to_excel(pd_result_file)
- bf_result = bf_result_file.read()
- pd_result = pd_result_file.read()
-
- assert bf_result == pd_result
-
-
-def test_df_to_latex(scalars_df_index, scalars_pandas_df_index):
- unsupported = ["numeric_col"] # formatted differently
- bf_result = scalars_df_index.drop(columns=unsupported).to_latex()
- pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_latex()
-
- assert bf_result == pd_result
-
-
-def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index):
- bf_result = scalars_df_index.to_json()
- # 
default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.to_json(default_handler=str) - - assert bf_result == pd_result - - -def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - # duration not fully supported at pandas level - scalars_df_index = scalars_df_index.drop(columns="duration_col") - scalars_pandas_df_index = scalars_pandas_df_index.drop(columns="duration_col") - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.to_json(bf_result_file, orient="table") - # default_handler for arrow types that have no default conversion - scalars_pandas_df_index.to_json( - pd_result_file, orient="table", default_handler=str - ) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.to_csv() - # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.to_csv() - - assert bf_result == pd_result - - -def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index): - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.to_csv(bf_result_file) - scalars_pandas_df_index.to_csv(pd_result_file) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index): - # GEOGRAPHY not supported in parquet export. - unsupported = ["geography_col"] - - bf_result = scalars_df_index.drop(columns=unsupported).to_parquet() - # default_handler for arrow types that have no default conversion - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet() - - assert bf_result == pd_result - - -def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index): - # GEOGRAPHY not supported in parquet export. 
- unsupported = ["geography_col"] - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file) - scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file) - - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_records(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] - bf_result = scalars_df_index.drop(columns=unsupported).to_records() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records() - - for bfi, pdi in zip(bf_result, pd_result): - for bfj, pdj in zip(bfi, pdi): - assert pd.isna(bfj) and pd.isna(pdj) or bfj == pdj - - -def test_df_to_string(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] # formatted differently - - bf_result = scalars_df_index.drop(columns=unsupported).to_string() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string() - - assert bf_result == pd_result - - -def test_df_to_html(scalars_df_index, scalars_pandas_df_index): - unsupported = ["numeric_col"] # formatted differently - - bf_result = scalars_df_index.drop(columns=unsupported).to_html() - pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html() - - assert bf_result == pd_result - - -def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index): - # Nulls have bug from tabulate https://github.com/astanin/python-tabulate/issues/231 - bf_result = scalars_df_index.dropna().to_markdown() - pd_result = scalars_pandas_df_index.dropna().to_markdown() - - assert bf_result == pd_result - - -def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index): - with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file: - scalars_df_index.to_pickle(bf_result_file) - scalars_pandas_df_index.to_pickle(pd_result_file) - bf_result = bf_result_file.read() - pd_result = pd_result_file.read() - - assert bf_result == pd_result - - -def test_df_to_orc(scalars_df_index, scalars_pandas_df_index): - unsupported = [ - "numeric_col", - "bytes_col", - "date_col", - "datetime_col", - "time_col", - "timestamp_col", - "geography_col", - "duration_col", - ] - - bf_result_file = tempfile.TemporaryFile() - pd_result_file = tempfile.TemporaryFile() - scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file) - scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc( - pd_result_file - ) - bf_result = bf_result_file.read() - pd_result = bf_result_file.read() - - assert bf_result == pd_result - - -@pytest.mark.parametrize( - ("expr",), - [ - ("new_col = int64_col + int64_too",), - ("new_col = (rowindex > 3) | bool_col",), - ("int64_too = bool_col\nnew_col2 = rowindex",), - ], -) -def test_df_eval(scalars_dfs, expr): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.eval(expr).to_pandas() - pd_result = scalars_pandas_df.eval(expr) - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("expr",), - [ - ("int64_col > int64_too",), - ("bool_col",), - ("((int64_col - int64_too) % @local_var) == 0",), - ], -) -def test_df_query(scalars_dfs, expr): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - # local_var is referenced in expressions - local_var 
= 3 # NOQA - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = scalars_df.query(expr).to_pandas() - pd_result = scalars_pandas_df.query(expr) - - pd.testing.assert_frame_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("subset", "normalize", "ascending", "dropna"), - [ - (None, False, False, False), - (None, True, True, True), - ("bool_col", True, False, True), - ], -) -def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): - if pd.__version__.startswith("1."): - pytest.skip("pandas 1.x produces different column labels.") - scalars_df, scalars_pandas_df = scalars_dfs - - bf_result = ( - scalars_df[["string_col", "bool_col"]] - .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) - .to_pandas() - ) - pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( - subset, normalize=normalize, ascending=ascending, dropna=dropna - ) - - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("na_option", "method", "ascending", "numeric_only", "pct"), - [ - ("keep", "average", True, True, True), - ("top", "min", False, False, False), - ("bottom", "max", False, False, True), - ("top", "first", False, False, False), - ("bottom", "dense", False, False, True), - ], -) -def test_df_rank_with_nulls( - scalars_df_index, - scalars_pandas_df_index, - na_option, - method, - ascending, - numeric_only, - pct, -): - unsupported_columns = ["geography_col"] - bf_result = ( - scalars_df_index.drop(columns=unsupported_columns) - .rank( - na_option=na_option, - method=method, - ascending=ascending, - numeric_only=numeric_only, - pct=pct, - ) - .to_pandas() - ) - pd_result = ( - scalars_pandas_df_index.drop(columns=unsupported_columns) - .rank( - na_option=na_option, - method=method, - ascending=ascending, - numeric_only=numeric_only, - pct=pct, - ) - .astype(pd.Float64Dtype()) - ) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_bool_interpretation_error(scalars_df_index): - with pytest.raises(ValueError): - True if scalars_df_index else False - - -def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): - # if allow_large_results=False, might not create query job - with bigframes.option_context("compute.allow_large_results", True): - job_ids = set() - repr(scalars_df_default_index) - assert scalars_df_default_index.query_job is not None - job_ids.add(scalars_df_default_index.query_job.job_id) - scalars_df_default_index.to_pandas(allow_large_results=True) - job_ids.add(scalars_df_default_index.query_job.job_id) - - assert len(job_ids) == 2 - - -def test_df_cached(scalars_df_index): - df = scalars_df_index.set_index(["int64_too", "int64_col"]).sort_values( - "string_col" - ) - df = df[df["rowindex_2"] % 2 == 0] - - df_cached_copy = df.cache() - pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) - - -def test_df_cached_many_index_cols(scalars_df_index): - index_cols = [ - "int64_too", - "time_col", - "int64_col", - "bool_col", - "date_col", - "timestamp_col", - "string_col", - ] - df = scalars_df_index.set_index(index_cols) - df = df[df["rowindex_2"] % 2 == 0] - - df_cached_copy = df.cache() - pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) - - -def test_assign_after_binop_row_joins(): - pd_df = pd.DataFrame( - { - "idx1": [1, 1, 1, 1, 2, 2, 2, 2], - "idx2": [10, 10, 20, 20, 10, 10, 20, 20], - "metric1": [10, 14, 2, 13, 6, 2, 9, 5], - "metric2": [25, 
-3, 8, 2, -1, 0, 0, -4], - }, - dtype=pd.Int64Dtype(), - ).set_index(["idx1", "idx2"]) - bf_df = dataframe.DataFrame(pd_df) - - # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join - bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 - pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 - - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -def test_df_cache_with_implicit_join(scalars_df_index): - """expectation is that cache will be used, but no explicit join will be performed""" - df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 - df.cache() - bf_result = df + (df * 2) - sql = bf_result.sql - - # Very crude asserts, want sql to not use join and not use base table, only reference cached table - assert "JOIN" not in sql - assert "bigframes_testing" not in sql - - -def test_df_dot_inline(session): - df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) - df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) - - bf1 = session.read_pandas(df1) - bf2 = session.read_pandas(df2) - bf_result = bf1.dot(bf2).to_pandas() - pd_result = df1.dot(df2) - - # Patch pandas dtypes for testing parity - # Pandas uses int64 instead of Int64 (nullable) dtype. - for name in pd_result.columns: - pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_dot( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = matrix_2by3_df.dot(matrix_3by4_df).to_pandas() - pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df) - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - for name in pd_result.columns: - pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_operator( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() - pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - for name in pd_result.columns: - pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) - - pd.testing.assert_frame_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_series_inline(): - left = [[1, 2, 3], [2, 5, 7]] - right = [2, 1, 3] - - bf1 = dataframe.DataFrame(left) - bf2 = series.Series(right) - bf_result = bf1.dot(bf2).to_pandas() - - df1 = pd.DataFrame(left) - df2 = pd.Series(right) - pd_result = df1.dot(df2) - - # Patch pandas dtypes for testing parity - # Pandas result is int64 instead of Int64 (nullable) dtype. - pd_result = pd_result.astype(pd.Int64Dtype()) - pd_result.index = pd_result.index.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_series( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = matrix_2by3_df.dot(matrix_3by4_df["x"]).to_pandas() - pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df["x"]) - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. 
- pd_result = pd_result.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -def test_df_dot_operator_series( - matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df -): - bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas() - pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"] - - # Patch pandas dtypes for testing parity - # Pandas result is object instead of Int64 (nullable) dtype. - pd_result = pd_result.astype(pd.Int64Dtype()) - - pd.testing.assert_series_equal( - bf_result, - pd_result, - ) - - -# TODO(tswast): We may be able to re-enable this test after we break large -# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427 -@pytest.mark.skipif( - sys.version_info >= (3, 12), - # See: https://github.com/python/cpython/issues/112282 - reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", -) -def test_recursion_limit(scalars_df_index): - scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]] - for i in range(400): - scalars_df_index = scalars_df_index + 4 - scalars_df_index.to_pandas() - - -@pytest.mark.skipif( - reason="b/366477265: Skip until query complexity error can be reliably triggered." -) -def test_query_complexity_error(scalars_df_index): - # This test requires automatic caching/query decomposition to be turned off - bf_df = scalars_df_index - for _ in range(8): - bf_df = bf_df.merge(bf_df, on="int64_col").head(30) - bf_df = bf_df[bf_df.columns[:20]] - - with pytest.raises( - bigframes.exceptions.QueryComplexityError, match=r"Try using DataFrame\.cache" - ): - bf_df.to_pandas() - - -def test_query_complexity_repeated_joins( - scalars_df_index, scalars_pandas_df_index, with_multiquery_execution -): - pd_df = scalars_pandas_df_index - bf_df = scalars_df_index - for _ in range(8): - # recursively join, resuling in 2^8 - 1 = 255 joins - pd_df = pd_df.merge(pd_df, on="int64_col").head(30) - pd_df = pd_df[pd_df.columns[:20]] - bf_df = bf_df.merge(bf_df, on="int64_col").head(30) - bf_df = bf_df[bf_df.columns[:20]] - - bf_result = bf_df.to_pandas() - pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result, check_index_type=False) - - -def test_query_complexity_repeated_subtrees( - scalars_df_index, scalars_pandas_df_index, with_multiquery_execution -): - # Recursively union the data, if fully inlined has 10^5 identical root tables. 
- pd_df = scalars_pandas_df_index - bf_df = scalars_df_index - for _ in range(5): - pd_df = pd.concat(10 * [pd_df]).head(5) - bf_df = bpd.concat(10 * [bf_df]).head(5) - bf_result = bf_df.to_pandas() - pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.skipif( - sys.version_info >= (3, 12), - # See: https://github.com/python/cpython/issues/112282 - reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", -) -def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index): - bf_df = scalars_df_index[["int64_col", "int64_too"]] - pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] - # Uses LAG analytic operator, each in a new SELECT - for _ in range(50): - bf_df = bf_df.diff() - pd_df = pd_df.diff() - bf_result = bf_df.to_pandas() - pd_result = pd_df - assert_pandas_df_equal(bf_result, pd_result) - - -def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): - dataset_id = dataset_id_not_created - destination_table = f"{dataset_id}.scalars_df" - - result_table = scalars_df_index.to_gbq(destination_table) - assert ( - result_table == destination_table - if destination_table - else result_table is not None - ) - - loaded_scalars_df_index = session.read_gbq(result_table) - assert not loaded_scalars_df_index.empty - - -def test_read_gbq_to_pandas_no_exec(unordered_session: bigframes.Session): - metrics = unordered_session._metrics - execs_pre = metrics.execution_count - df = unordered_session.read_gbq("bigquery-public-data.ml_datasets.penguins") - df.to_pandas() - execs_post = metrics.execution_count - assert df.shape == (344, 7) - assert execs_pre == execs_post - - -def test_to_gbq_table_labels(scalars_df_index): - destination_table = "bigframes-dev.bigframes_tests_sys.table_labels" - result_table = scalars_df_index.to_gbq( - destination_table, labels={"test": "labels"}, if_exists="replace" - ) - client = scalars_df_index._session.bqclient - table = client.get_table(result_table) - assert table.labels - assert table.labels["test"] == "labels" - - -@pytest.mark.parametrize( - ("col_names", "ignore_index"), - [ - pytest.param(["A"], False, id="one_array_false"), - pytest.param(["A"], True, id="one_array_true"), - pytest.param(["B"], False, id="one_float_false"), - pytest.param(["B"], True, id="one_float_true"), - pytest.param(["A", "C"], False, id="two_arrays_false"), - pytest.param(["A", "C"], True, id="two_arrays_true"), - ], -) -def test_dataframe_explode(col_names, ignore_index, session): - data = { - "A": [[0, 1, 2], [], [3, 4]], - "B": 3, - "C": [["a", "b", "c"], np.nan, ["d", "e"]], - } - - metrics = session._metrics - df = bpd.DataFrame(data, session=session) - pd_df = df.to_pandas() - pd_result = pd_df.explode(col_names, ignore_index=ignore_index) - bf_result = df.explode(col_names, ignore_index=ignore_index) - - # Check that to_pandas() results in at most a single query execution - execs_pre = metrics.execution_count - bf_materialized = bf_result.to_pandas() - execs_post = metrics.execution_count - - pd.testing.assert_frame_equal( - bf_materialized, - pd_result, - check_index_type=False, - check_dtype=False, - ) - # we test this property on this method in particular as compilation - # is non-deterministic and won't use the query cache as implemented - assert execs_post - execs_pre <= 1 - - -@pytest.mark.parametrize( - ("ignore_index", "ordered"), - [ - pytest.param(True, True, id="include_index_ordered"), - pytest.param(True, False, id="include_index_unordered"), - 
pytest.param(False, True, id="ignore_index_ordered"), - ], -) -def test_dataframe_explode_reserve_order(ignore_index, ordered): - data = { - "a": [np.random.randint(0, 10, 10) for _ in range(10)], - "b": [np.random.randint(0, 10, 10) for _ in range(10)], - } - df = bpd.DataFrame(data) - pd_df = pd.DataFrame(data) - - res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) - pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( - pd.Int64Dtype() - ) - pd.testing.assert_frame_equal( - res if ordered else res.sort_index(), - pd_res, - check_index_type=False, - ) - - -@pytest.mark.parametrize( - ("col_names"), - [ - pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), - pytest.param( - ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) - ), - pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), - ], -) -def test_dataframe_explode_xfail(col_names): - df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) - df.explode(col_names) - - -@pytest.mark.parametrize( - ("on", "rule", "origin"), - [ - pytest.param("datetime_col", "100D", "start"), - pytest.param("datetime_col", "30W", "start"), - pytest.param("datetime_col", "5M", "epoch"), - pytest.param("datetime_col", "3Q", "start_day"), - pytest.param("datetime_col", "3YE", "start"), - pytest.param( - "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) - ), - pytest.param( - "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) - ), - ], -) -def test__resample_with_column( - scalars_df_index, scalars_pandas_df_index, on, rule, origin -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - bf_result = ( - scalars_df_index._resample(rule=rule, on=on, origin=origin)[ - ["int64_col", "int64_too"] - ] - .max() - .to_pandas() - ) - pd_result = scalars_pandas_df_index.resample(rule=rule, on=on, origin=origin)[ - ["int64_col", "int64_too"] - ].max() - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - ("append", "level", "col", "rule"), - [ - pytest.param(False, None, "timestamp_col", "100d"), - pytest.param(True, 1, "timestamp_col", "1200h"), - pytest.param(False, None, "datetime_col", "100d"), - ], -) -def test__resample_with_index( - scalars_df_index, scalars_pandas_df_index, append, level, col, rule -): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df_index = scalars_df_index.set_index(col, append=append) - scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) - bf_result = ( - scalars_df_index[["int64_col", "int64_too"]] - ._resample(rule=rule, level=level) - .min() - .to_pandas() - ) - pd_result = ( - scalars_pandas_df_index[["int64_col", "int64_too"]] - .resample(rule=rule, level=level) - .min() - ) - assert_pandas_df_equal(bf_result, pd_result) - - -@pytest.mark.parametrize( - ("rule", "origin", "data"), - [ - ( - "5h", - "epoch", - { - "timestamp_col": pd.date_range( - start="2021-01-01 13:00:00", periods=30, freq="1h" - ), - "int64_col": range(30), - "int64_too": range(10, 40), - }, - ), - ( - "75min", - "start_day", - { - "timestamp_col": pd.date_range( - start="2021-01-01 13:00:00", periods=30, freq="10min" - ), - "int64_col": range(30), - "int64_too": range(10, 40), - }, - ), - ( - "7s", - "epoch", - { - "timestamp_col": pd.date_range( - 
start="2021-01-01 13:00:00", periods=30, freq="1s" - ), - "int64_col": range(30), - "int64_too": range(10, 40), - }, - ), - ], -) -def test__resample_start_time(rule, origin, data): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - col = "timestamp_col" - scalars_df_index = bpd.DataFrame(data).set_index(col) - scalars_pandas_df_index = pd.DataFrame(data).set_index(col) - scalars_pandas_df_index.index.name = None - - bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() - - pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -@pytest.mark.parametrize( - "dtype", - [ - pytest.param("string[pyarrow]", id="type-string"), - pytest.param(pd.StringDtype(storage="pyarrow"), id="type-literal"), - pytest.param( - {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}, - id="multiple-types", - ), - ], -) -def test_df_astype(scalars_dfs, dtype): - bf_df, pd_df = scalars_dfs - target_cols = ["bool_col", "int64_col"] - bf_df = bf_df[target_cols] - pd_df = pd_df[target_cols] - - bf_result = bf_df.astype(dtype).to_pandas() - pd_result = pd_df.astype(dtype) - - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_df_astype_python_types(scalars_dfs): - bf_df, pd_df = scalars_dfs - target_cols = ["bool_col", "int64_col"] - bf_df = bf_df[target_cols] - pd_df = pd_df[target_cols] - - bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas() - pd_result = pd_df.astype( - {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()} - ) - - pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) - - -def test_astype_invalid_type_fail(scalars_dfs): - bf_df, _ = scalars_dfs - - with pytest.raises(TypeError, match=r".*Share your use case with.*"): - bf_df.astype(123) - - -def test_agg_with_dict_lists_strings(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": ["min", "max"], - "int64_col": ["min", "count"], - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_lists_callables(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": [np.min, np.max], - "int64_col": [np.min, np.var], - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_list_and_str(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": ["min", "max"], - "int64_col": "sum", - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - - pd.testing.assert_frame_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def test_agg_with_dict_strs(scalars_dfs): - bf_df, pd_df = scalars_dfs - agg_funcs = { - "int64_too": "min", - "int64_col": "sum", - "float64_col": "max", - } - - bf_result = bf_df.agg(agg_funcs).to_pandas() - pd_result = pd_df.agg(agg_funcs) - pd_result.index = pd_result.index.astype("string[pyarrow]") - - pd.testing.assert_series_equal( - bf_result, pd_result, check_dtype=False, check_index_type=False - ) - - -def 
test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs):
-    bf_df, _ = scalars_dfs
-    agg_funcs = {
-        "int64_too": ["min", "max"],
-        "nonexisting_col": ["count"],
-    }
-
-    with pytest.raises(KeyError):
-        bf_df.agg(agg_funcs)
+    # Verify the result is a string representation
+    assert isinstance(result["json_col"].iloc[0], str)

From 30a9ef621e903109e0dcc213940a097e9d415afc Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Mon, 27 Oct 2025 22:29:28 +0000
Subject: [PATCH 29/53] Revert scalar_op_registry.py change

Restore the safe-aware JSON-to-string casts for astype instead of
always serializing through to_json_string.
---
 bigframes/core/compile/ibis_compiler/scalar_op_registry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
index 7b17aac61a..e983fc7e21 100644
--- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
+++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py
@@ -1036,7 +1036,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
         if to_type == ibis_dtypes.bool:
             return cast_json_to_bool_in_safe(x) if op.safe else cast_json_to_bool(x)
         if to_type == ibis_dtypes.string:
-            return to_json_string(x)
+            return cast_json_to_string_in_safe(x) if op.safe else cast_json_to_string(x)
 
     # TODO: either inline this function, or push rest of this op into the function
     return bigframes.core.compile.ibis_types.cast_ibis_value(x, to_type, safe=op.safe)

From 6895def33e6a43577f1908c7b2c171d7b94e87ca Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Mon, 27 Oct 2025 22:31:53 +0000
Subject: [PATCH 30/53] remove unnecessary import

---
 bigframes/dataframe.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py
index 788a47f38b..f3b78e8218 100644
--- a/bigframes/dataframe.py
+++ b/bigframes/dataframe.py
@@ -852,9 +852,7 @@ def _repr_html_(self) -> str:
 
         if opts.repr_mode == "anywidget":
             try:
-                import anywidget  # noqa: F401
                 from IPython.display import display as ipython_display
-                import traitlets  # noqa: F401
 
                 from bigframes import display
 

From 46444c11ec6148f0ec595a44f0fefcc91ad802d0 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Mon, 27 Oct 2025 22:47:44 +0000
Subject: [PATCH 31/53] Remove duplicate conversion

TableWidget no longer needs to pre-convert JSON columns to strings
itself; the same conversion is already applied when the data is
materialized through to_pandas_batches.
---
 bigframes/display/anywidget.py | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py
index ff5a51f312..cf5d4e6310 100644
--- a/bigframes/display/anywidget.py
+++ b/bigframes/display/anywidget.py
@@ -74,21 +74,7 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
                 "Please `pip install anywidget traitlets` or `pip install 'bigframes[anywidget]'` to use TableWidget."
             )
 
-        super().__init__()
-        # Workaround for Arrow bug https://github.com/apache/arrow/issues/45262
-        # JSON columns are not supported in `to_pandas_batches` and will be converted to string.
- json_cols = [ - col - for col, dtype in dataframe.dtypes.items() - if dtype == bigframes.dtypes.JSON_DTYPE - ] - if json_cols: - df_copy = dataframe.copy() - for col in json_cols: - df_copy[str(col)] = df_copy[str(col)].astype("string") - self._dataframe = df_copy - else: - self._dataframe = dataframe + self._dataframe = dataframe super().__init__() From 3b8367b3fc74abaf72d5b246b7038ecc6d9a763e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 22:52:55 +0000 Subject: [PATCH 32/53] revert changes to test_dataframe.py --- tests/system/small/test_dataframe.py | 6151 +++++++++++++++++++++++++- 1 file changed, 6142 insertions(+), 9 deletions(-) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index ffd9bc512b..79f8efd00f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1,11 +1,6144 @@ -def test_to_pandas_batches_with_json_columns(session): - """Test that JSON columns are properly handled in to_pandas_batches.""" - # Create a DataFrame with JSON column - df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. - # This should not raise an error - batches = df._to_pandas_batches(page_size=10) - result = next(batches) +import io +import operator +import sys +import tempfile +import typing +from typing import Dict, List, Tuple - # Verify the result is a string representation - assert isinstance(result["json_col"].iloc[0], str) +import geopandas as gpd # type: ignore +import numpy as np +import pandas as pd +import pandas.testing +import pyarrow as pa # type: ignore +import pytest + +import bigframes +import bigframes._config.display_options as display_options +import bigframes.core.indexes as bf_indexes +import bigframes.dataframe as dataframe +import bigframes.dtypes as dtypes +import bigframes.pandas as bpd +import bigframes.series as series +from bigframes.testing.utils import ( + assert_dfs_equivalent, + assert_pandas_df_equal, + assert_series_equal, + assert_series_equivalent, +) + + +def test_df_construct_copy(scalars_dfs): + columns = ["int64_col", "string_col", "float64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_construct_pandas_default(scalars_dfs): + # This should trigger the inlined codepath + columns = [ + "int64_too", + "int64_col", + "float64_col", + "bool_col", + "string_col", + "date_col", + "datetime_col", + "numeric_col", + "float64_col", + "time_col", + "timestamp_col", + ] + _, scalars_pandas_df = scalars_dfs + bf_result = dataframe.DataFrame(scalars_pandas_df, 
columns=columns).to_pandas()
+    pd_result = pd.DataFrame(scalars_pandas_df, columns=columns)
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    ("write_engine"),
+    [
+        ("bigquery_inline"),
+        ("bigquery_load"),
+        ("bigquery_streaming"),
+        ("bigquery_write"),
+    ],
+)
+def test_read_pandas_all_nice_types(
+    session: bigframes.Session, scalars_pandas_df_index: pd.DataFrame, write_engine
+):
+    bf_result = session.read_pandas(
+        scalars_pandas_df_index, write_engine=write_engine
+    ).to_pandas()
+    pandas.testing.assert_frame_equal(bf_result, scalars_pandas_df_index)
+
+
+def test_df_construct_large_strings():
+    data = [["hello", "w" + "o" * 50000 + "rld"]]
+    bf_result = dataframe.DataFrame(data).to_pandas()
+    pd_result = pd.DataFrame(data, dtype=pd.StringDtype(storage="pyarrow"))
+    pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_df_construct_pandas_load_job(scalars_dfs_maybe_ordered):
+    # This should trigger the load job codepath
+    columns = [
+        "int64_too",
+        "int64_col",
+        "float64_col",
+        "bool_col",
+        "string_col",
+        "date_col",
+        "datetime_col",
+        "numeric_col",
+        "float64_col",
+        "time_col",
+        "timestamp_col",
+        "geography_col",
+    ]
+    _, scalars_pandas_df = scalars_dfs_maybe_ordered
+    bf_result = dataframe.DataFrame(scalars_pandas_df, columns=columns)
+    pd_result = pd.DataFrame(scalars_pandas_df, columns=columns)
+    assert_dfs_equivalent(pd_result, bf_result)
+
+
+def test_df_construct_structs(session):
+    pd_frame = pd.Series(
+        [
+            {"version": 1, "project": "pandas"},
+            {"version": 2, "project": "pandas"},
+            {"version": 1, "project": "numpy"},
+        ]
+    ).to_frame()
+    bf_series = session.read_pandas(pd_frame)
+    pd.testing.assert_frame_equal(
+        bf_series.to_pandas(), pd_frame, check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_construct_local_concat_pd(scalars_pandas_df_index, session):
+    pd_df = pd.concat([scalars_pandas_df_index, scalars_pandas_df_index])
+
+    bf_df = session.read_pandas(pd_df)
+
+    pd.testing.assert_frame_equal(
+        bf_df.to_pandas(), pd_df, check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_construct_pandas_set_dtype(scalars_dfs):
+    columns = [
+        "int64_too",
+        "int64_col",
+        "float64_col",
+        "bool_col",
+    ]
+    _, scalars_pandas_df = scalars_dfs
+    bf_result = dataframe.DataFrame(
+        scalars_pandas_df, columns=columns, dtype="Float64"
+    ).to_pandas()
+    pd_result = pd.DataFrame(scalars_pandas_df, columns=columns, dtype="Float64")
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_df_construct_from_series(scalars_dfs_maybe_ordered):
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+    bf_result = dataframe.DataFrame(
+        {"a": scalars_df["int64_col"], "b": scalars_df["string_col"]},
+        dtype="string[pyarrow]",
+    )
+    pd_result = pd.DataFrame(
+        {"a": scalars_pandas_df["int64_col"], "b": scalars_pandas_df["string_col"]},
+        dtype="string[pyarrow]",
+    )
+    assert_dfs_equivalent(pd_result, bf_result)
+
+
+def test_df_construct_from_dict():
+    input_dict = {
+        "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"],
+        # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. 
b/296751058 + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + bf_result = dataframe.DataFrame(input_dict).to_pandas() + pd_result = pd.DataFrame(input_dict) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("json_type"), + [ + pytest.param(dtypes.JSON_DTYPE), + pytest.param("json"), + ], +) +def test_df_construct_w_json_dtype(json_type): + data = [ + "1", + "false", + '["a", {"b": 1}, null]', + None, + ] + df = dataframe.DataFrame({"json_col": data}, dtype=json_type) + + assert df["json_col"].dtype == dtypes.JSON_DTYPE + assert df["json_col"][1] == "false" + + +def test_df_construct_inline_respects_location(reset_default_session_and_location): + # Note: This starts a thread-local session. + with bpd.option_context("bigquery.location", "europe-west1"): + df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) + df.to_gbq() + assert df.query_job is not None + table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) + + assert table.location == "europe-west1" + + +def test_df_construct_dtype(): + data = { + "int_col": [1, 2, 3], + "string_col": ["1.1", "2.0", "3.5"], + "float_col": [1.0, 2.0, 3.0], + } + dtype = pd.StringDtype(storage="pyarrow") + bf_result = dataframe.DataFrame(data, dtype=dtype) + pd_result = pd.DataFrame(data, dtype=dtype) + pd_result.index = pd_result.index.astype("Int64") + pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +def test_get_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + series = scalars_df[col_name] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df[col_name] + assert_series_equal(bf_result, pd_result) + + +def test_get_column_nonstring(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.rename(columns={"int64_col": 123.1})[123.1] + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.rename(columns={"int64_col": 123.1})[123.1] + assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + "row_slice", + [ + (slice(1, 7, 2)), + (slice(1, 7, None)), + (slice(None, -3, None)), + ], +) +def test_get_rows_with_slice(scalars_dfs, row_slice): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[row_slice].to_pandas() + pd_result = scalars_pandas_df[row_slice] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_hasattr(scalars_dfs): + scalars_df, _ = scalars_dfs + assert hasattr(scalars_df, "int64_col") + assert hasattr(scalars_df, "head") + assert not hasattr(scalars_df, "not_exist") + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_head_with_custom_column_labels( + scalars_df_index, scalars_pandas_df_index, ordered +): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).head(3) + bf_result = bf_df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).head(3) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_tail_with_custom_column_labels(scalars_df_index, scalars_pandas_df_index): + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + bf_df = scalars_df_index.rename(columns=rename_mapping).tail(3) + bf_result = bf_df.to_pandas() + pd_result = scalars_pandas_df_index.rename(columns=rename_mapping).tail(3) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + 
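+# The two tests below exercise pandas' tie-breaking semantics for keep=:
+# "first" and "last" break ties by row order, while keep="all" may return
+# more than n rows. Illustrative sketch (not one of the original assertions):
+#
+#   df = dataframe.DataFrame({"a": [1, 2, 2]})
+#   df.nlargest(1, "a", keep="all")  # returns both rows where a == 2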
+@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_df_nlargest(scalars_df_index, scalars_pandas_df_index, keep): + bf_result = scalars_df_index.nlargest(3, ["bool_col", "int64_too"], keep=keep) + pd_result = scalars_pandas_df_index.nlargest( + 3, ["bool_col", "int64_too"], keep=keep + ) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + ("all",), + ], +) +def test_df_nsmallest(scalars_df_index, scalars_pandas_df_index, keep): + bf_result = scalars_df_index.nsmallest(6, ["bool_col"], keep=keep) + pd_result = scalars_pandas_df_index.nsmallest(6, ["bool_col"], keep=keep) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_get_column_by_attr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + series = scalars_df.int64_col + bf_result = series.to_pandas() + pd_result = scalars_pandas_df.int64_col + assert_series_equal(bf_result, pd_result) + + +def test_get_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["bool_col", "float64_col", "int64_col"] + df_subset = scalars_df.get(col_names) + df_pandas = df_subset.to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df[col_names].columns + ) + + +def test_get_columns_default(scalars_dfs): + scalars_df, _ = scalars_dfs + col_names = ["not", "column", "names"] + result = scalars_df.get(col_names, "default_val") + assert result == "default_val" + + +@pytest.mark.parametrize( + ("loc", "column", "value", "allow_duplicates"), + [ + (0, 666, 2, False), + (5, "float64_col", 2.2, True), + (13, "rowindex_2", [8, 7, 6, 5, 4, 3, 2, 1, 0], True), + pytest.param( + 14, + "test", + 2, + False, + marks=pytest.mark.xfail( + raises=IndexError, + ), + ), + pytest.param( + 12, + "int64_col", + 2, + False, + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], +) +def test_insert(scalars_dfs, loc, column, value, allow_duplicates): + scalars_df, scalars_pandas_df = scalars_dfs + # insert works inplace, so will influence other tests. + # make a copy to avoid inplace changes. + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.insert(loc, column, value, allow_duplicates) + pd_df.insert(loc, column, value, allow_duplicates) + + pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df, check_dtype=False) + + +def test_mask_series_cond(scalars_df_index, scalars_pandas_df_index): + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond_bf, bf_df + 1).to_pandas() + pd_result = pd_df.mask(cond_pd, pd_df + 1) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_mask_callable(scalars_df_index, scalars_pandas_df_index): + def is_positive(x): + return x > 0 + + bf_df = scalars_df_index[["int64_too", "int64_col", "float64_col"]] + pd_df = scalars_pandas_df_index[["int64_too", "int64_col", "float64_col"]] + bf_result = bf_df.mask(cond=is_positive, other=lambda x: x + 1).to_pandas() + pd_result = pd_df.mask(cond=is_positive, other=lambda x: x + 1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_multi_column(scalars_df_index, scalars_pandas_df_index): + # Test when a dataframe has multi-columns. 
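+    # bigframes' where() does not support MultiIndex columns; the assertion
+    # below pins the NotImplementedError message raised for this case.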
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + + dataframe_bf.columns = pd.MultiIndex.from_tuples( + [("str1", 1), ("str2", 2)], names=["STR", "INT"] + ) + cond_bf = dataframe_bf["str1"] > 0 + + with pytest.raises(NotImplementedError) as context: + dataframe_bf.where(cond_bf).to_pandas() + assert ( + str(context.value) + == "The dataframe.where() method does not support multi-column." + ) + + +def test_where_series_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is dataframe, other is None (as default). + cond_bf = scalars_df_index["int64_col"] > 0 + cond_pd = scalars_pandas_df_index["int64_col"] > 0 + bf_result = scalars_df_index.where(cond_bf).to_pandas() + pd_result = scalars_pandas_df_index.where(cond_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + dataframe_bf.columns.name = "test_name" + dataframe_pd.columns.name = "test_name" + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other = 0 + + bf_result = dataframe_bf.where(cond_bf, other).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_cond_dataframe_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a series, other is a dataframe. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf["int64_col"] > 0 + cond_pd = dataframe_pd["int64_col"] > 0 + other_bf = -dataframe_bf + other_pd = -dataframe_pd + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is None. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + + bf_result = dataframe_bf.where(cond_bf, None).to_pandas() + pd_result = dataframe_pd.where(cond_pd, None) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_const_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = 10 + other_pd = 10 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_dataframe_other( + scalars_df_index, scalars_pandas_df_index +): + # Condition is a dataframe, other is a dataframe. 
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + other_bf = dataframe_bf * 2 + other_pd = dataframe_pd * 2 + + bf_result = dataframe_bf.where(cond_bf, other_bf).to_pandas() + pd_result = dataframe_pd.where(cond_pd, other_pd) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_callable_cond_constant_other(scalars_df_index, scalars_pandas_df_index): + # Condition is callable, other is a constant. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + other = 10 + + bf_result = dataframe_bf.where(lambda x: x > 0, other).to_pandas() + pd_result = dataframe_pd.where(lambda x: x > 0, other) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_dataframe_cond_callable_other(scalars_df_index, scalars_pandas_df_index): + # Condition is a dataframe, other is callable. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + cond_bf = dataframe_bf > 0 + cond_pd = dataframe_pd > 0 + + def func(x): + return x * 2 + + bf_result = dataframe_bf.where(cond_bf, func).to_pandas() + pd_result = dataframe_pd.where(cond_pd, func) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_callable_cond_callable_other(scalars_df_index, scalars_pandas_df_index): + # Condition is callable, other is callable too. + columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + dataframe_pd = scalars_pandas_df_index[columns] + + def func(x): + return x["int64_col"] > 0 + + bf_result = dataframe_bf.where(func, lambda x: x * 2).to_pandas() + pd_result = dataframe_pd.where(func, lambda x: x * 2) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_where_series_other(scalars_df_index): + # When other is a series, throw an error. 
+ columns = ["int64_col", "float64_col"] + dataframe_bf = scalars_df_index[columns] + + with pytest.raises( + ValueError, + match="Seires is not a supported replacement type!", + ): + dataframe_bf.where(dataframe_bf > 0, dataframe_bf["int64_col"]) + + +def test_drop_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "int64_col" + df_pandas = scalars_df.drop(columns=col_name).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_name).columns + ) + + +def test_drop_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = ["int64_col", "geography_col", "time_col"] + df_pandas = scalars_df.drop(columns=col_names).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.drop(columns=col_names).columns + ) + + +def test_drop_labels_axis_1(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + labels = ["int64_col", "geography_col", "time_col"] + + pd_result = scalars_pandas_df.drop(labels=labels, axis=1) + bf_result = scalars_df.drop(labels=labels, axis=1).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_with_custom_column_labels(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + rename_mapping = { + "int64_col": "Integer Column", + "string_col": "言語列", + } + dropped_columns = [ + "言語列", + "timestamp_col", + ] + bf_df = scalars_df.rename(columns=rename_mapping).drop(columns=dropped_columns) + bf_result = bf_df.to_pandas() + pd_result = scalars_pandas_df.rename(columns=rename_mapping).drop( + columns=dropped_columns + ) + assert_pandas_df_equal(bf_result, pd_result) + + +def test_df_memory_usage(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.memory_usage() + bf_result = scalars_df.memory_usage() + + pd.testing.assert_series_equal(pd_result, bf_result, rtol=1.5) + + +def test_df_info(scalars_dfs): + expected = ( + "\n" + "Index: 9 entries, 0 to 8\n" + "Data columns (total 14 columns):\n" + " # Column Non-Null Count Dtype\n" + "--- ------------- ---------------- ------------------------------\n" + " 0 bool_col 8 non-null boolean\n" + " 1 bytes_col 6 non-null binary[pyarrow]\n" + " 2 date_col 7 non-null date32[day][pyarrow]\n" + " 3 datetime_col 6 non-null timestamp[us][pyarrow]\n" + " 4 geography_col 4 non-null geometry\n" + " 5 int64_col 8 non-null Int64\n" + " 6 int64_too 9 non-null Int64\n" + " 7 numeric_col 6 non-null decimal128(38, 9)[pyarrow]\n" + " 8 float64_col 7 non-null Float64\n" + " 9 rowindex_2 9 non-null Int64\n" + " 10 string_col 8 non-null string\n" + " 11 time_col 6 non-null time64[us][pyarrow]\n" + " 12 timestamp_col 6 non-null timestamp[us, tz=UTC][pyarrow]\n" + " 13 duration_col 7 non-null duration[us][pyarrow]\n" + "dtypes: Float64(1), Int64(3), binary[pyarrow](1), boolean(1), date32[day][pyarrow](1), decimal128(38, 9)[pyarrow](1), duration[us][pyarrow](1), geometry(1), string(1), time64[us][pyarrow](1), timestamp[us, tz=UTC][pyarrow](1), timestamp[us][pyarrow](1)\n" + "memory usage: 1341 bytes\n" + ) + + scalars_df, _ = scalars_dfs + bf_result = io.StringIO() + + scalars_df.info(buf=bf_result) + + assert expected == bf_result.getvalue() + + +@pytest.mark.parametrize( + ("include", "exclude"), + [ + ("Int64", None), + (["int"], None), + ("number", None), + ([pd.Int64Dtype(), pd.BooleanDtype()], None), + (None, [pd.Int64Dtype(), pd.BooleanDtype()]), + ("Int64", ["boolean"]), + ], +) +def test_select_dtypes(scalars_dfs, include, exclude): + scalars_df, 
scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.select_dtypes(include=include, exclude=exclude) + bf_result = scalars_df.select_dtypes(include=include, exclude=exclude).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2]) + bf_result = scalars_df.drop(index=[4, 1, 2]).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_pandas_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_pandas_df.iloc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + drop_index = scalars_df.loc[[4, 1, 2]].index + drop_pandas_index = scalars_pandas_df.loc[[4, 1, 2]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_index_with_na(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index("bytes_col") + scalars_pandas_df = scalars_pandas_df.set_index("bytes_col") + drop_index = scalars_df.iloc[[3, 5]].index + drop_pandas_index = scalars_pandas_df.iloc[[3, 5]].index + + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) # drop_pandas_index) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_bigframes_multiindex(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + sub_df = scalars_df.iloc[[4, 1, 2]] + sub_pandas_df = scalars_pandas_df.iloc[[4, 1, 2]] + sub_df = sub_df.set_index(["bytes_col", "numeric_col"]) + sub_pandas_df = sub_pandas_df.set_index(["bytes_col", "numeric_col"]) + drop_index = sub_df.index + drop_pandas_index = sub_pandas_df.index + + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + bf_result = scalars_df.drop(index=drop_index).to_pandas() + pd_result = scalars_pandas_df.drop(index=drop_pandas_index) + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_labels_axis_0(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(labels=[4, 1, 2], axis=0) + bf_result = scalars_df.drop(labels=[4, 1, 2], axis=0).to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_drop_index_and_columns(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + pd_result = scalars_pandas_df.drop(index=[4, 1, 2], columns="int64_col") + bf_result = scalars_df.drop(index=[4, 1, 2], columns="int64_col").to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result) + + +def test_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": 1.2345} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + 
df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_peek(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=True) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_with_large_results_not_allowed(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + peek_result = scalars_df.peek(n=3, force=False, allow_large_results=False) + + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_filtered(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[scalars_df.int64_col != 0].peek(n=3, force=False) + pd.testing.assert_index_equal(scalars_pandas_df.columns, peek_result.columns) + assert len(peek_result) == 3 + + +def test_df_peek_exception(scalars_dfs): + scalars_df, _ = scalars_dfs + + with pytest.raises(ValueError): + # Window ops aren't compatible with efficient peeking + scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3, force=False) + + +def test_df_peek_force_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = scalars_df[["int64_col", "int64_too"]].cumsum().peek(n=3) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_df_peek_reset_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + peek_result = ( + scalars_df[["int64_col", "int64_too"]].reset_index(drop=True).peek(n=3) + ) + pd.testing.assert_index_equal( + scalars_pandas_df[["int64_col", "int64_too"]].columns, peek_result.columns + ) + assert len(peek_result) == 3 + + +def test_repr_w_all_rows(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + # Remove columns with flaky formatting, like NUMERIC columns (which use the + # object dtype). Also makes a copy so that mutating the index name doesn't + # break other tests. + scalars_df = scalars_df.drop(columns=["numeric_col"]) + scalars_pandas_df = scalars_pandas_df.drop(columns=["numeric_col"]) + + # When there are 10 or fewer rows, the outputs should be identical. 
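+    # ("identical" here means exact string equality: no BigFrames-specific
+    # footer or truncation marker is appended when everything fits).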
+ actual = repr(scalars_df.head(10)) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df.head(10)) + + assert actual == expected + + +def test_join_repr(scalars_dfs_maybe_ordered): + scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered + + scalars_df = ( + scalars_df[["int64_col"]] + .join(scalars_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + scalars_pandas_df = ( + scalars_pandas_df[["int64_col"]] + .join(scalars_pandas_df.set_index("int64_col")[["int64_too"]]) + .sort_index() + ) + # Pandas join result index name seems to depend on the index values in a way that bigframes can't match exactly + scalars_pandas_df.index.name = None + + actual = repr(scalars_df) + + with display_options.pandas_repr(bigframes.options.display): + expected = repr(scalars_pandas_df) + + assert actual == expected + + +def test_repr_w_display_options(scalars_dfs, session): + metrics = session._metrics + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + executions_pre = metrics.execution_count + with bigframes.option_context( + "display.max_rows", 10, "display.max_columns", 5, "display.max_colwidth", 10 + ): + + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. + actual = scalars_df.head(10).__repr__() + executions_post = metrics.execution_count + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10).__repr__() + + assert actual == pandas_repr + assert (executions_post - executions_pre) <= 3 + + +def test_repr_html_w_all_rows(scalars_dfs, session): + metrics = session._metrics + scalars_df, _ = scalars_dfs + # get a pandas df of the expected format + df, _ = scalars_df._block.to_pandas() + pandas_df = df.set_axis(scalars_df._block.column_labels, axis=1) + pandas_df.index.name = scalars_df.index.name + + executions_pre = metrics.execution_count + # When there are 10 or fewer rows, the outputs should be identical except for the extra note. 
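+    # The "extra note" is the "[N rows x M columns in total]" footer that
+    # BigFrames appends to the pandas HTML repr; see `expected` below.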
+ actual = scalars_df.head(10)._repr_html_() + executions_post = metrics.execution_count + + with display_options.pandas_repr(bigframes.options.display): + pandas_repr = pandas_df.head(10)._repr_html_() + + expected = ( + pandas_repr + + f"[{len(pandas_df.index)} rows x {len(pandas_df.columns)} columns in total]" + ) + assert actual == expected + assert (executions_post - executions_pre) <= 3 + + +def test_df_column_name_with_space(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"bool_col": "bool col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + df_pandas = scalars_df.rename(columns=col_name_dict).to_pandas() + pd.testing.assert_index_equal( + df_pandas.columns, scalars_pandas_df.rename(columns=col_name_dict).columns + ) + + +def test_get_df_column_name_duplicate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"int64_too": "int64_col"} + + bf_result = scalars_df.rename(columns=col_name_dict)["int64_col"].to_pandas() + pd_result = scalars_pandas_df.rename(columns=col_name_dict)["int64_col"] + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.parametrize( + ("indices", "axis"), + [ + ([1, 3, 5], 0), + ([2, 4, 6], 1), + ([1, -3, -5, -6], "index"), + ([-2, -4, -6], "columns"), + ], +) +def test_take_df(scalars_dfs, indices, axis): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.take(indices, axis=axis).to_pandas() + pd_result = scalars_pandas_df.take(indices, axis=axis) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_filter_df(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_bool_series = scalars_df["bool_col"] + bf_result = scalars_df[bf_bool_series].to_pandas() + + pd_bool_series = scalars_pandas_df["bool_col"] + pd_result = scalars_pandas_df[pd_bool_series] + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_read_gbq_direct_to_batches_row_count(unordered_session): + df = unordered_session.read_gbq("bigquery-public-data.usa_names.usa_1910_2013") + iter = df.to_pandas_batches() + assert iter.total_rows == 5552452 + + +def test_df_to_pandas_batches(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + capped_unfiltered_batches = scalars_df.to_pandas_batches(page_size=2, max_results=6) + bf_bool_series = scalars_df["bool_col"] + filtered_batches = scalars_df[bf_bool_series].to_pandas_batches() + + pd_bool_series = scalars_pandas_df["bool_col"] + pd_result = scalars_pandas_df[pd_bool_series] + + assert 6 == capped_unfiltered_batches.total_rows + assert len(pd_result) == filtered_batches.total_rows + assert_pandas_df_equal(pd.concat(filtered_batches), pd_result) + + +@pytest.mark.parametrize( + ("literal", "expected_dtype"), + ( + pytest.param( + 2, + dtypes.INT_DTYPE, + id="INT64", + ), + # ==================================================================== + # NULL values + # + # These are regression tests for b/428999884. It needs to be possible to + # set a column to NULL with a desired type (not just the pandas default + # of float64). 
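+        # For example (illustrative), assigning pa.scalar(None, type=pa.int64())
+        # should produce a nullable Int64 column rather than float64.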
+        # ====================================================================
+        pytest.param(None, dtypes.FLOAT_DTYPE, id="NULL-None"),
+        pytest.param(
+            pa.scalar(None, type=pa.int64()),
+            dtypes.INT_DTYPE,
+            id="NULL-pyarrow-INT64",
+        ),
+        pytest.param(
+            pa.scalar(None, type=pa.timestamp("us", tz="UTC")),
+            dtypes.TIMESTAMP_DTYPE,
+            id="NULL-pyarrow-TIMESTAMP",
+        ),
+        pytest.param(
+            pa.scalar(None, type=pa.timestamp("us")),
+            dtypes.DATETIME_DTYPE,
+            id="NULL-pyarrow-DATETIME",
+        ),
+    ),
+)
+def test_assign_new_column_w_literal(scalars_dfs, literal, expected_dtype):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    df = scalars_df.assign(new_col=literal)
+    bf_result = df.to_pandas()
+
+    new_col_pd = literal
+    if isinstance(literal, pa.Scalar):
+        # PyArrow integer scalars aren't yet supported in pandas Int64Dtype.
+        new_col_pd = literal.as_py()
+
+    # Pandas might not pick the same dtype as BigFrames, but it should at least
+    # be castable to it.
+    pd_result = scalars_pandas_df.assign(new_col=new_col_pd)
+    pd_result["new_col"] = pd_result["new_col"].astype(expected_dtype)
+
+    assert_pandas_df_equal(bf_result, pd_result)
+
+
+def test_assign_new_column_w_loc(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+    bf_df.loc[:, "new_col"] = 2
+    pd_df.loc[:, "new_col"] = 2
+    bf_result = bf_df.to_pandas()
+    pd_result = pd_df
+
+    # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
+    pd_result["new_col"] = pd_result["new_col"].astype("Int64")
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    ("scalar",),
+    [
+        (2.1,),
+        (None,),
+    ],
+)
+def test_assign_new_column_w_setitem(scalars_dfs, scalar):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+    bf_df["new_col"] = scalar
+    pd_df["new_col"] = scalar
+    bf_result = bf_df.to_pandas()
+    pd_result = pd_df
+
+    # Convert default pandas dtypes `float64` to match BigQuery DataFrames dtypes.
+    pd_result["new_col"] = pd_result["new_col"].astype("Float64")
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_assign_new_column_w_setitem_dataframe(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+    bf_df["int64_col"] = bf_df["int64_too"].to_frame()
+    pd_df["int64_col"] = pd_df["int64_too"].to_frame()
+
+    # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
+    pd_df["int64_col"] = pd_df["int64_col"].astype("Int64")
+
+    pd.testing.assert_frame_equal(bf_df.to_pandas(), pd_df)
+
+
+def test_assign_new_column_w_setitem_dataframe_error(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+
+    with pytest.raises(ValueError):
+        bf_df["impossible_col"] = bf_df[["int64_too", "string_col"]]
+    with pytest.raises(ValueError):
+        pd_df["impossible_col"] = pd_df[["int64_too", "string_col"]]
+
+
+def test_assign_new_column_w_setitem_list(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_df = scalars_df.copy()
+    pd_df = scalars_pandas_df.copy()
+    bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+    pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1]
+    bf_result = bf_df.to_pandas()
+    pd_result = pd_df
+
+    # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes.
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_repeated(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + pd_df["new_col_2"] = [1, 3, 2, 5, 4, 7, 6, 9, 8] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col_2"] = pd_result["new_col_2"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + # set the custom index + pd_df = pd_df.set_index(["string_col", "int64_col"]) + bf_df = bf_df.set_index(["string_col", "int64_col"]) + + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 + with pytest.raises(ValueError): + bf_df["new_col"] = [1, 2, 3] + + +@pytest.mark.parametrize( + ("key", "value"), + [ + pytest.param(["int64_col", "int64_too"], 1, id="scalar_to_existing_column"), + pytest.param( + ["int64_col", "int64_too"], [1, 2], id="sequence_to_existing_column" + ), + pytest.param( + ["int64_col", "new_col"], [1, 2], id="sequence_to_partial_new_column" + ), + pytest.param( + ["new_col", "new_col_too"], [1, 2], id="sequence_to_full_new_column" + ), + pytest.param( + pd.Index(("new_col", "new_col_too")), + [1, 2], + id="sequence_to_full_new_column_as_index", + ), + ], +) +def test_setitem_multicolumn_with_literals(scalars_dfs, key, value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.copy() + pd_result = scalars_pandas_df.copy() + + bf_result[key] = value + pd_result[key] = value + + pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + + +def test_setitem_multicolumn_with_literals_different_lengths_raise_error(scalars_dfs): + scalars_df, _ = scalars_dfs + bf_result = scalars_df.copy() + + with pytest.raises(ValueError): + bf_result[["int64_col", "int64_too"]] = [1] + + +def test_setitem_multicolumn_with_dataframes(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.copy() + pd_result = scalars_pandas_df.copy() + + bf_result[["int64_col", "int64_too"]] = bf_result[["int64_too", "int64_col"]] / 2 + pd_result[["int64_col", "int64_too"]] = pd_result[["int64_too", "int64_col"]] / 2 + + pd.testing.assert_frame_equal(pd_result, bf_result.to_pandas(), check_dtype=False) + + +def test_setitem_multicolumn_with_dataframes_series_on_rhs_raise_error(scalars_dfs): + scalars_df, _ = scalars_dfs + bf_result = scalars_df.copy() + + with pytest.raises(ValueError): + 
bf_result[["int64_col", "int64_too"]] = bf_result["int64_col"] / 2 + + +def test_setitem_multicolumn_with_dataframes_different_lengths_raise_error(scalars_dfs): + scalars_df, _ = scalars_dfs + bf_result = scalars_df.copy() + + with pytest.raises(ValueError): + bf_result[["int64_col"]] = bf_result[["int64_col", "int64_too"]] / 2 + + +def test_assign_existing_column(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_listlike_to_empty_df(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + bf_result = empty_df.assign(new_col=[1, 2, 3]) + pd_result = empty_pandas_df.assign(new_col=[1, 2, 3]) + + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result.index = pd_result.index.astype("Int64") + assert_pandas_df_equal(bf_result.to_pandas(), pd_result) + + +def test_assign_to_empty_df_multiindex_error(session): + empty_df = dataframe.DataFrame(session=session) + empty_pandas_df = pd.DataFrame() + + empty_df["empty_col_1"] = typing.cast(series.Series, []) + empty_df["empty_col_2"] = typing.cast(series.Series, []) + empty_pandas_df["empty_col_1"] = [] + empty_pandas_df["empty_col_2"] = [] + empty_df = empty_df.set_index(["empty_col_1", "empty_col_2"]) + empty_pandas_df = empty_pandas_df.set_index(["empty_col_1", "empty_col_2"]) + + with pytest.raises(ValueError): + empty_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + with pytest.raises(ValueError): + empty_pandas_df.assign(new_col=[1, 2, 3, 4, 5, 6, 7, 8, 9]) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_assign_series(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(new_col=scalars_df[column_name]) + bf_result = df.to_pandas(ordered=ordered) + pd_result = scalars_pandas_df.assign(new_col=scalars_pandas_df[column_name]) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +def test_assign_series_overwrite(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + column_name = "int64_col" + df = scalars_df.assign(**{column_name: scalars_df[column_name] + 3}) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign( + **{column_name: scalars_pandas_df[column_name] + 3} + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_sequential(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"int64_col": 2, "new_col": 3, "new_col2": 4} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + pd_result["new_col2"] = pd_result["new_col2"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +# Require an index so that the self-join is consistent each time. 
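+# (With an arbitrary default ordering, the two sides of the self-join could be
+# matched up in a different row order on each execution.)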
+def test_assign_same_table_different_index_performs_self_join( + scalars_df_index, scalars_pandas_df_index +): + column_name = "int64_col" + bf_df = scalars_df_index.assign( + alternative_index=scalars_df_index["rowindex_2"] + 2 + ) + pd_df = scalars_pandas_df_index.assign( + alternative_index=scalars_pandas_df_index["rowindex_2"] + 2 + ) + bf_df_2 = bf_df.set_index("alternative_index") + pd_df_2 = pd_df.set_index("alternative_index") + bf_result = bf_df.assign(new_col=bf_df_2[column_name] * 10).to_pandas() + pd_result = pd_df.assign(new_col=pd_df_2[column_name] * 10) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +# Different table expression must have Index +def test_assign_different_df( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + column_name = "int64_col" + df = scalars_df_index.assign(new_col=scalars_df_2_index[column_name]) + bf_result = df.to_pandas() + # Doesn't matter to pandas if it comes from the same DF or a different DF. + pd_result = scalars_pandas_df_index.assign( + new_col=scalars_pandas_df_index[column_name] + ) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_assign_different_df_w_loc( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df.loc[:, "int64_col"] = bf_df2.loc[:, "int64_col"] + 1 + pd_df.loc[:, "int64_col"] = pd_df.loc[:, "int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_different_df_w_setitem( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.copy() + bf_df2 = scalars_df_2_index.copy() + pd_df = scalars_pandas_df_index.copy() + assert "int64_col" in bf_df.columns + assert "int64_col" in pd_df.columns + bf_df["int64_col"] = bf_df2["int64_col"] + 1 + pd_df["int64_col"] = pd_df["int64_col"] + 1 + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["int64_col"] = pd_result["int64_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_callable_lambda(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + kwargs = {"new_col": lambda x: x["int64_col"] + x["int64_too"]} + df = scalars_df.assign(**kwargs) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.assign(**kwargs) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("axis", "how", "ignore_index", "subset"), + [ + (0, "any", False, None), + (0, "any", True, None), + (0, "all", False, ["bool_col", "time_col"]), + (0, "any", False, ["bool_col", "time_col"]), + (0, "all", False, "time_col"), + (1, "any", False, None), + (1, "all", False, None), + ], +) +def test_df_dropna_by_how(scalars_dfs, axis, how, ignore_index, subset): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index, subset=subset) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna( + axis=axis, how=how, ignore_index=ignore_index, subset=subset + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("axis", "ignore_index", "subset", "thresh"), + [ + (0, False, None, 2), + (0, True, None, 3), + (1, False, None, 2), + ], +) +def test_df_dropna_by_thresh(scalars_dfs, axis, ignore_index, subset, thresh): + """ + Tests that dropna correctly keeps rows/columns with a minimum number + of non-null values. + """ + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + + df_result = scalars_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + pd_result = scalars_pandas_df.dropna( + axis=axis, thresh=thresh, ignore_index=ignore_index, subset=subset + ) + + bf_result = df_result.to_pandas() + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_dropna_range_columns(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df.columns = pandas.RangeIndex(0, len(scalars_df.columns)) + scalars_pandas_df.columns = pandas.RangeIndex(0, len(scalars_pandas_df.columns)) + + df = scalars_df.dropna() + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.dropna() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_interpolate(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + bf_result = scalars_df[columns].interpolate().to_pandas() + # Pandas can only interpolate on "float64" columns + # https://github.com/pandas-dev/pandas/issues/40252 + pd_result = scalars_pandas_df[columns].astype("float64").interpolate() + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "col, fill_value", + [ + (["int64_col", "float64_col"], 3), + (["string_col"], "A"), + (["datetime_col"], pd.Timestamp("2023-01-01")), + ], +) +def test_df_fillna(scalars_dfs, col, fill_value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[col].fillna(fill_value).to_pandas() + pd_result = scalars_pandas_df[col].fillna(fill_value) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_df_replace_scalar_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(555.555, 3).to_pandas() + pd_result = scalars_pandas_df.replace(555.555, 3) + + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_df_replace_regex_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace("^H.l", "Howdy, Planet!", regex=True).to_pandas() + pd_result = scalars_pandas_df.replace("^H.l", "Howdy, Planet!", regex=True) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_replace_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace([555.555, 3.2], 3).to_pandas() + pd_result = scalars_pandas_df.replace([555.555, 3.2], 3) + + # pandas has narrower result types as they are determined dynamically + pd.testing.assert_frame_equal( + pd_result, + bf_result, + check_dtype=False, + ) + + +def test_df_replace_value_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.replace(1, {"int64_col": 100, "int64_too": 200}).to_pandas() + pd_result = scalars_pandas_df.replace(1, {"int64_col": 100, "int64_too": 200}) + + pd.testing.assert_frame_equal( + pd_result, + bf_result, + ) + + +def test_df_ffill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].ffill(limit=1).to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].ffill(limit=1) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_bfill(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[["int64_col", "float64_col"]].bfill().to_pandas() + pd_result = 
scalars_pandas_df[["int64_col", "float64_col"]].bfill() + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_series_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(series, arg1, arg2, *, kwarg1=0, kwarg2=0): + return series**2 + (arg1 * arg2 % 4) + (kwarg1 * kwarg2 % 7) + + bf_result = ( + scalars_df_index[columns] + .apply(foo, args=(33, 61), kwarg1=52, kwarg2=21) + .to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply( + foo, args=(33, 61), kwarg1=52, kwarg2=21 + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_listlike_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = ( + scalars_df_index[columns].apply(lambda x: [len(x), x.min(), 24]).to_pandas() + ) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: [len(x), x.min(), 24]) + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result.index = pd_result.index.astype("Int64") + pd_result = pd_result.astype("Int64") + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_apply_series_scalar_callable( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + bf_result = scalars_df_index[columns].apply(lambda x: x.sum()) + + pd_result = scalars_pandas_df_index[columns].apply(lambda x: x.sum()) + + pandas.testing.assert_series_equal(bf_result, pd_result) + + +def test_df_pipe( + scalars_df_index, + scalars_pandas_df_index, +): + columns = ["int64_too", "int64_col"] + + def foo(x: int, y: int, df): + return (df + x) % y + + bf_result = ( + scalars_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + .to_pandas() + ) + + pd_result = ( + scalars_pandas_df_index[columns] + .pipe((foo, "df"), x=7, y=9) + .pipe(lambda x: x**2) + ) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_keys( + scalars_df_index, + scalars_pandas_df_index, +): + pandas.testing.assert_index_equal( + scalars_df_index.keys(), scalars_pandas_df_index.keys() + ) + + +def test_df_iter( + scalars_df_index, + scalars_pandas_df_index, +): + for bf_i, df_i in zip(scalars_df_index, scalars_pandas_df_index): + assert bf_i == df_i + + +def test_iterrows( + scalars_df_index, + scalars_pandas_df_index, +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df_index = scalars_df_index.add_suffix("_suffix", axis=1) + scalars_pandas_df_index = scalars_pandas_df_index.add_suffix("_suffix", axis=1) + for (bf_index, bf_series), (pd_index, pd_series) in zip( + scalars_df_index.iterrows(), scalars_pandas_df_index.iterrows() + ): + assert bf_index == pd_index + pandas.testing.assert_series_equal(bf_series, pd_series) + + +@pytest.mark.parametrize( + ( + "index", + "name", + ), + [ + ( + True, + "my_df", + ), + (False, None), + ], +) +def test_itertuples(scalars_df_index, index, name): + # Numeric has slightly different representation as a result of conversions. 
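+    # (Illustrative: NUMERIC surfaces as decimal values after conversion, so its
+    # tuple repr can differ from a native float's.)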
+ bf_tuples = scalars_df_index.itertuples(index, name) + pd_tuples = scalars_df_index.to_pandas().itertuples(index, name) + for bf_tuple, pd_tuple in zip(bf_tuples, pd_tuples): + assert bf_tuple == pd_tuple + + +def test_df_isin_list_w_null(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, pd.NA, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_isin_list_wo_null(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_isin_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = { + "string_col": ["Hello, World!", 55555, 2.51, pd.NA, True], + "int64_col": [5555, 2.51], + "bool_col": [pd.NA], + } + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_cross_merge(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + bf_result = left.merge(right, "cross").to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "cross", + ) + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_df_merge(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + on = "rowindex_2" + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("left_on", "right_on"), + [ + (["int64_col", "rowindex_2"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "int64_col"], ["int64_col", "rowindex_2"]), + (["rowindex_2", "float64_col"], ["int64_col", "rowindex_2"]), + ], +) +def test_df_merge_multi_key(scalars_dfs, left_on, right_on): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. + right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = left.merge(right, "outer", left_on=left_on, right_on=right_on, sort=True) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + "outer", + left_on=left_on, + right_on=right_on, + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_custom_col_name(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col"] + right_columns = ["int64_col", "bool_col", "string_col"] + on = "int64_col" + rename_columns = {"float64_col": "f64_col"} + + left = scalars_df[left_columns] + left = left.rename(columns=rename_columns) + right = scalars_df[right_columns] + df = left.merge(right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pandas_left_df = scalars_pandas_df[left_columns] + pandas_left_df = pandas_left_df.rename(columns=rename_columns) + pandas_right_df = scalars_pandas_df[right_columns] + pd_result = pandas_left_df.merge(pandas_right_df, merge_how, on, sort=True) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_left_on_right_on(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = left.merge( + right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True + ) + bf_result = df.to_pandas() + + pd_result = scalars_pandas_df[left_columns].merge( + scalars_pandas_df[right_columns], + merge_how, + left_on="int64_too", + right_on="rowindex_2", + sort=True, + ) + + assert_pandas_df_equal( + bf_result, pd_result, ignore_order=True, check_index_type=False + ) + + +def test_self_merge_self_w_on_args(): + data = { + "A": pd.Series([1, 2, 3], dtype="Int64"), + "B": pd.Series([1, 2, 3], dtype="Int64"), + "C": pd.Series([100, 200, 300], dtype="Int64"), + "D": pd.Series(["alpha", "beta", "gamma"], 
dtype="string[pyarrow]"), + } + df = pd.DataFrame(data) + + df1 = df[["A", "C"]] + df2 = df[["B", "C", "D"]] + pd_result = df1.merge(df2, left_on=["A", "C"], right_on=["B", "C"], how="inner") + + bf_df = bpd.DataFrame(data) + + bf_df1 = bf_df[["A", "C"]] + bf_df2 = bf_df[["B", "C", "D"]] + bf_result = bf_df1.merge( + bf_df2, left_on=["A", "C"], right_on=["B", "C"], how="inner" + ).to_pandas() + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("decimals",), + [ + (2,), + ({"float64_col": 0, "bool_col": 1, "int64_too": -3},), + ({},), + ], +) +def test_dataframe_round(scalars_dfs, decimals): + if pd.__version__.startswith("1."): + pytest.skip("Rounding doesn't work as expected in pandas 1.x") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.round(decimals).to_pandas() + pd_result = scalars_pandas_df.round(decimals) + + assert_pandas_df_equal(bf_result, pd_result) + + +def test_get_dtypes(scalars_df_default_index): + dtypes = scalars_df_default_index.dtypes + dtypes_dict: Dict[str, bigframes.dtypes.Dtype] = { + "bool_col": pd.BooleanDtype(), + "bytes_col": pd.ArrowDtype(pa.binary()), + "date_col": pd.ArrowDtype(pa.date32()), + "datetime_col": pd.ArrowDtype(pa.timestamp("us")), + "geography_col": gpd.array.GeometryDtype(), + "int64_col": pd.Int64Dtype(), + "int64_too": pd.Int64Dtype(), + "numeric_col": pd.ArrowDtype(pa.decimal128(38, 9)), + "float64_col": pd.Float64Dtype(), + "rowindex": pd.Int64Dtype(), + "rowindex_2": pd.Int64Dtype(), + "string_col": pd.StringDtype(storage="pyarrow"), + "time_col": pd.ArrowDtype(pa.time64("us")), + "timestamp_col": pd.ArrowDtype(pa.timestamp("us", tz="UTC")), + "duration_col": pd.ArrowDtype(pa.duration("us")), + } + pd.testing.assert_series_equal( + dtypes, + pd.Series(dtypes_dict), + ) + + +def test_get_dtypes_array_struct_query(session): + df = session.read_gbq( + """SELECT + [1, 3, 2] AS array_column, + STRUCT( + "a" AS string_field, + 1.2 AS float_field) AS struct_column""" + ) + + dtypes = df.dtypes + pd.testing.assert_series_equal( + dtypes, + pd.Series( + { + "array_column": pd.ArrowDtype(pa.list_(pa.int64())), + "struct_column": pd.ArrowDtype( + pa.struct( + [ + ("string_field", pa.string()), + ("float_field", pa.float64()), + ] + ) + ), + } + ), + ) + + +def test_get_dtypes_array_struct_table(nested_df): + dtypes = nested_df.dtypes + pd.testing.assert_series_equal( + dtypes, + pd.Series( + { + "customer_id": pd.StringDtype(storage="pyarrow"), + "day": pd.ArrowDtype(pa.date32()), + "flag": pd.Int64Dtype(), + "label": pd.ArrowDtype( + pa.struct( + [ + ("key", pa.string()), + ("value", pa.string()), + ] + ), + ), + "event_sequence": pd.ArrowDtype( + pa.list_( + pa.struct( + [ + pa.field( + "data", + pa.list_( + pa.struct( + [ + ("value", pa.float64()), + ("key", pa.string()), + ], + ), + ), + nullable=False, + ), + ("timestamp", pa.timestamp("us", "UTC")), + ("category", pa.string()), + ] + ), + ), + ), + "address": pd.ArrowDtype( + pa.struct( + [ + ("street", pa.string()), + ("city", pa.string()), + ] + ), + ), + } + ), + ) + + +def test_shape(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.shape + pd_result = scalars_pandas_df.shape + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + "reference_table, test_table", + [ + ( + "bigframes-dev.bigframes_tests_sys.base_table", + "bigframes-dev.bigframes_tests_sys.base_table_mat_view", + ), + ( + "bigframes-dev.bigframes_tests_sys.base_table", + 
"bigframes-dev.bigframes_tests_sys.base_table_view", + ), + ( + "bigframes-dev.bigframes_tests_sys.csv_native_table", + "bigframes-dev.bigframes_tests_sys.csv_external_table", + ), + ], +) +def test_view_and_external_table_shape(session, reference_table, test_table): + reference_df = session.read_gbq(reference_table) + test_df = session.read_gbq(test_table) + + assert test_df.shape == reference_df.shape + + +def test_len(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = len(scalars_df) + pd_result = len(scalars_pandas_df) + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("n_rows",), + [ + (50,), + (10000,), + ], +) +@pytest.mark.parametrize( + "write_engine", + ["bigquery_load", "bigquery_streaming", "bigquery_write"], +) +def test_df_len_local(session, n_rows, write_engine): + assert ( + len( + session.read_pandas( + pd.DataFrame(np.random.randint(1, 7, n_rows), columns=["one"]), + write_engine=write_engine, + ) + ) + == n_rows + ) + + +def test_size(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.size + pd_result = scalars_pandas_df.size + + assert bf_result == pd_result + + +def test_ndim(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df.ndim + pd_result = scalars_pandas_df.ndim + + assert bf_result == pd_result + + +def test_empty_false(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.empty + pd_result = scalars_pandas_df.empty + + assert bf_result == pd_result + + +def test_empty_true_column_filter(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df[[]].empty + pd_result = scalars_pandas_df[[]].empty + + assert bf_result == pd_result + + +def test_empty_true_row_filter(scalars_dfs: Tuple[dataframe.DataFrame, pd.DataFrame]): + scalars_df, scalars_pandas_df = scalars_dfs + bf_bool: series.Series = typing.cast(series.Series, scalars_df["bool_col"]) + pd_bool: pd.Series = scalars_pandas_df["bool_col"] + bf_false = bf_bool.notna() & (bf_bool != bf_bool) + pd_false = pd_bool.notna() & (pd_bool != pd_bool) + + bf_result = scalars_df[bf_false].empty + pd_result = scalars_pandas_df[pd_false].empty + + assert pd_result + assert bf_result == pd_result + + +def test_empty_true_memtable(session: bigframes.Session): + bf_df = dataframe.DataFrame(session=session) + pd_df = pd.DataFrame() + + bf_result = bf_df.empty + pd_result = pd_df.empty + + assert pd_result + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("drop",), + ((True,), (False,)), +) +def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): + df = scalars_df_index.reset_index(drop=drop) + assert df.index.name is None + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=drop) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. 
+ pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + df = scalars_df_index.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + scalars_pandas_df_index = scalars_pandas_df_index.copy() + scalars_pandas_df_index.index.name = "int64_col" + pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + +@pytest.mark.parametrize( + ("drop",), + ((True,), (False,)), +) +def test_reset_index_inplace(scalars_df_index, scalars_pandas_df_index, drop): + df = scalars_df_index.copy() + df.reset_index(drop=drop, inplace=True) + assert df.index.name is None + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.copy() + pd_result.reset_index(drop=drop, inplace=True) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_then_filter( + scalars_df_index, + scalars_pandas_df_index, +): + bf_filter = scalars_df_index["bool_col"].fillna(True) + bf_df = scalars_df_index.reset_index()[bf_filter] + bf_result = bf_df.to_pandas() + pd_filter = scalars_pandas_df_index["bool_col"].fillna(True) + pd_result = scalars_pandas_df_index.reset_index()[pd_filter] + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering and index keys + # post-filter will have gaps. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_index( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.reset_index(drop=False) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "index". + assert df.columns[0] == "index" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. 
+ pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_with_unnamed_multiindex( + scalars_df_index, + scalars_pandas_df_index, +): + bf_df = dataframe.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + pd_df = pd.DataFrame( + ([1, 2, 3], [2, 5, 7]), + index=pd.MultiIndex.from_tuples([("a", "aa"), ("a", "aa")]), + ) + + bf_df = bf_df.reset_index() + pd_df = pd_df.reset_index() + + assert pd_df.columns[0] == "level_0" + assert bf_df.columns[0] == "level_0" + assert pd_df.columns[1] == "level_1" + assert bf_df.columns[1] == "level_1" + + +def test_reset_index_with_unnamed_index_and_index_column( + scalars_df_index, + scalars_pandas_df_index, +): + scalars_df_index = scalars_df_index.copy() + scalars_pandas_df_index = scalars_pandas_df_index.copy() + + scalars_df_index.index.name = None + scalars_pandas_df_index.index.name = None + df = scalars_df_index.assign(index=scalars_df_index["int64_col"]).reset_index( + drop=False + ) + assert df.index.name is None + + # reset_index(drop=False) creates a new column "level_0" if the "index" column already exists. + assert df.columns[0] == "level_0" + + bf_result = df.to_pandas() + pd_result = scalars_pandas_df_index.assign( + index=scalars_pandas_df_index["int64_col"] + ).reset_index(drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("drop",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("append",), + ( + (True,), + (False,), + ), +) +@pytest.mark.parametrize( + ("index_column",), + (("int64_too",), ("string_col",), ("timestamp_col",)), +) +def test_set_index(scalars_dfs, index_column, drop, append): + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df.set_index(index_column, append=append, drop=drop) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df.set_index(index_column, append=append, drop=drop) + + # Sort to disambiguate when there are duplicate index labels. + # Note: Doesn't use assert_pandas_df_equal_ignore_ordering because we get + # "ValueError: 'timestamp_col' is both an index level and a column label, + # which is ambiguous" when trying to sort by a column with the same name as + # the index. 
+    bf_result = bf_result.sort_values("rowindex_2")
+    pd_result = pd_result.sort_values("rowindex_2")
+
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_set_index_key_error(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    with pytest.raises(KeyError):
+        scalars_pandas_df.set_index(["not_a_col"])
+    with pytest.raises(KeyError):
+        scalars_df.set_index(["not_a_col"])
+
+
+@pytest.mark.parametrize(
+    ("ascending",),
+    ((True,), (False,)),
+)
+@pytest.mark.parametrize(
+    ("na_position",),
+    (("first",), ("last",)),
+)
+@pytest.mark.parametrize(
+    ("axis",),
+    ((0,), ("columns",)),
+)
+def test_sort_index(scalars_dfs, ascending, na_position, axis):
+    index_column = "int64_col"
+    scalars_df, scalars_pandas_df = scalars_dfs
+    df = scalars_df.set_index(index_column)
+    bf_result = df.sort_index(
+        ascending=ascending, na_position=na_position, axis=axis
+    ).to_pandas()
+    pd_result = scalars_pandas_df.set_index(index_column).sort_index(
+        ascending=ascending, na_position=na_position, axis=axis
+    )
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_dataframe_sort_index_inplace(scalars_dfs):
+    index_column = "int64_col"
+    scalars_df, scalars_pandas_df = scalars_dfs
+    df = scalars_df.copy().set_index(index_column)
+    df.sort_index(ascending=False, inplace=True)
+    bf_result = df.to_pandas()
+
+    pd_result = scalars_pandas_df.set_index(index_column).sort_index(ascending=False)
+    pandas.testing.assert_frame_equal(bf_result, pd_result)
+
+
+def test_df_abs(scalars_dfs_maybe_ordered):
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+    columns = ["int64_col", "int64_too", "float64_col"]
+
+    bf_result = scalars_df[columns].abs()
+    pd_result = scalars_pandas_df[columns].abs()
+
+    assert_dfs_equivalent(pd_result, bf_result)
+
+
+def test_df_pos(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = (+scalars_df[["int64_col", "numeric_col"]]).to_pandas()
+    pd_result = +scalars_pandas_df[["int64_col", "numeric_col"]]
+
+    assert_pandas_df_equal(pd_result, bf_result)
+
+
+def test_df_neg(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = (-scalars_df[["int64_col", "numeric_col"]]).to_pandas()
+    pd_result = -scalars_pandas_df[["int64_col", "numeric_col"]]
+
+    assert_pandas_df_equal(pd_result, bf_result)
+
+
+def test_df__abs__(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    bf_result = (
+        abs(scalars_df[["int64_col", "numeric_col", "float64_col"]])
+    ).to_pandas()
+    pd_result = abs(scalars_pandas_df[["int64_col", "numeric_col", "float64_col"]])
+
+    assert_pandas_df_equal(pd_result, bf_result)
+
+
+def test_df_invert(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    columns = ["int64_col", "bool_col"]
+
+    bf_result = (~scalars_df[columns]).to_pandas()
+    pd_result = ~scalars_pandas_df[columns]
+
+    assert_pandas_df_equal(bf_result, pd_result)
+
+
+def test_df_isnull(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    columns = ["int64_col", "int64_too", "string_col", "bool_col"]
+    bf_result = scalars_df[columns].isnull().to_pandas()
+    pd_result = scalars_pandas_df[columns].isnull()
+
+    # One of the dtype mismatches to be documented. Here, the `bf_result.dtype` is
+    # `BooleanDtype` but the `pd_result.dtype` is `bool`.
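+    # Casting the pandas result to the nullable `boolean` dtype below makes the
+    # two frames directly comparable.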
+    pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype())
+    pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype())
+    pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype())
+    pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype())
+
+    assert_pandas_df_equal(bf_result, pd_result)
+
+
+def test_df_notnull(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    columns = ["int64_col", "int64_too", "string_col", "bool_col"]
+    bf_result = scalars_df[columns].notnull().to_pandas()
+    pd_result = scalars_pandas_df[columns].notnull()
+
+    # One of the dtype mismatches to be documented. Here, the `bf_result.dtype` is
+    # `BooleanDtype` but the `pd_result.dtype` is `bool`.
+    pd_result["int64_col"] = pd_result["int64_col"].astype(pd.BooleanDtype())
+    pd_result["int64_too"] = pd_result["int64_too"].astype(pd.BooleanDtype())
+    pd_result["string_col"] = pd_result["string_col"].astype(pd.BooleanDtype())
+    pd_result["bool_col"] = pd_result["bool_col"].astype(pd.BooleanDtype())
+
+    assert_pandas_df_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    ("left_labels", "right_labels", "overwrite", "fill_value"),
+    [
+        (["a", "b", "c"], ["c", "a", "b"], True, None),
+        (["a", "b", "c"], ["c", "a", "b"], False, None),
+        (["a", "b", "c"], ["a", "b", "c"], False, 2),
+    ],
+    ids=[
+        "one_one_match_overwrite",
+        "one_one_match_no_overwrite",
+        "exact_match",
+    ],
+)
+def test_combine(
+    scalars_df_index,
+    scalars_df_2_index,
+    scalars_pandas_df_index,
+    left_labels,
+    right_labels,
+    overwrite,
+    fill_value,
+):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.")
+    columns = ["int64_too", "int64_col", "float64_col"]
+
+    bf_df_a = scalars_df_index[columns]
+    bf_df_a.columns = left_labels
+    bf_df_b = scalars_df_2_index[columns]
+    bf_df_b.columns = right_labels
+    bf_result = bf_df_a.combine(
+        bf_df_b,
+        lambda x, y: x**2 + 2 * x * y + y**2,
+        overwrite=overwrite,
+        fill_value=fill_value,
+    ).to_pandas()
+
+    pd_df_a = scalars_pandas_df_index[columns]
+    pd_df_a.columns = left_labels
+    pd_df_b = scalars_pandas_df_index[columns]
+    pd_df_b.columns = right_labels
+    pd_result = pd_df_a.combine(
+        pd_df_b,
+        lambda x, y: x**2 + 2 * x * y + y**2,
+        overwrite=overwrite,
+        fill_value=fill_value,
+    )
+
+    # Some dtype inconsistency for all-NULL columns
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("overwrite", "filter_func"),
+    [
+        (True, None),
+        (False, None),
+        (True, lambda x: x.isna() | (x % 2 == 0)),
+    ],
+    ids=[
+        "default",
+        "overwritefalse",
+        "customfilter",
+    ],
+)
+def test_df_update(overwrite, filter_func):
+    if pd.__version__.startswith("1."):
+        pytest.skip("dtype handled differently in pandas 1.x.")
+
+    index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64")
+
+    index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64")
+    pd_df1 = pandas.DataFrame(
+        {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
+    )
+    pd_df2 = pandas.DataFrame(
+        {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
+        dtype="Int64",
+        index=index2,
+    )
+
+    bf_df1 = dataframe.DataFrame(pd_df1)
+    bf_df2 = dataframe.DataFrame(pd_df2)
+
+    bf_df1.update(bf_df2, overwrite=overwrite, filter_func=filter_func)
+    pd_df1.update(pd_df2, overwrite=overwrite, filter_func=filter_func)
+
+    pd.testing.assert_frame_equal(bf_df1.to_pandas(), pd_df1)
+
+
+def test_df_idxmin():
+    pd_df = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+
+    bf_result = bf_df.idxmin().to_pandas()
+    pd_result = pd_df.idxmin()
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
+
+
+def test_df_idxmax():
+    pd_df = pd.DataFrame(
+        {"a": [1, 2, 3], "b": [7, None, 3], "c": [4, 4, 4]}, index=["x", "y", "z"]
+    )
+    bf_df = dataframe.DataFrame(pd_df)
+
+    bf_result = bf_df.idxmax().to_pandas()
+    pd_result = pd_df.idxmax()
+
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_index_type=False, check_dtype=False
+    )
+
+
+@pytest.mark.parametrize(
+    ("join", "axis"),
+    [
+        ("outer", None),
+        ("outer", 0),
+        ("outer", 1),
+        ("left", 0),
+        ("right", 1),
+        ("inner", None),
+        ("inner", 1),
+    ],
+)
+def test_df_align(join, axis):
+
+    index1: pandas.Index = pandas.Index([1, 2, 3, 4], dtype="Int64")
+
+    index2: pandas.Index = pandas.Index([1, 2, 4, 5], dtype="Int64")
+    pd_df1 = pandas.DataFrame(
+        {"a": [1, None, 3, 4], "b": [5, 6, None, 8]}, dtype="Int64", index=index1
+    )
+    pd_df2 = pandas.DataFrame(
+        {"a": [None, 20, 30, 40], "c": [90, None, 110, 120]},
+        dtype="Int64",
+        index=index2,
+    )
+
+    bf_df1 = dataframe.DataFrame(pd_df1)
+    bf_df2 = dataframe.DataFrame(pd_df2)
+
+    bf_result1, bf_result2 = bf_df1.align(bf_df2, join=join, axis=axis)
+    pd_result1, pd_result2 = pd_df1.align(pd_df2, join=join, axis=axis)
+
+    # Don't check dtype as pandas does unnecessary float conversion
+    assert isinstance(bf_result1, dataframe.DataFrame) and isinstance(
+        bf_result2, dataframe.DataFrame
+    )
+    pd.testing.assert_frame_equal(bf_result1.to_pandas(), pd_result1, check_dtype=False)
+    pd.testing.assert_frame_equal(bf_result2.to_pandas(), pd_result2, check_dtype=False)
+
+
+def test_combine_first(
+    scalars_df_index,
+    scalars_df_2_index,
+    scalars_pandas_df_index,
+):
+    if pd.__version__.startswith("1."):
+        pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.")
+    columns = ["int64_too", "int64_col", "float64_col"]
+
+    bf_df_a = scalars_df_index[columns].iloc[0:6]
+    bf_df_a.columns = ["a", "b", "c"]
+    bf_df_b = scalars_df_2_index[columns].iloc[2:8]
+    bf_df_b.columns = ["b", "a", "d"]
+    bf_result = bf_df_a.combine_first(bf_df_b).to_pandas()
+
+    pd_df_a = scalars_pandas_df_index[columns].iloc[0:6]
+    pd_df_a.columns = ["a", "b", "c"]
+    pd_df_b = scalars_pandas_df_index[columns].iloc[2:8]
+    pd_df_b.columns = ["b", "a", "d"]
+    pd_result = pd_df_a.combine_first(pd_df_b)
+
+    # Some dtype inconsistency for all-NULL columns
+    pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+@pytest.mark.parametrize(
+    ("columns", "numeric_only"),
+    [
+        (["bool_col", "int64_col", "float64_col"], True),
+        (["bool_col", "int64_col", "float64_col"], False),
+        (["bool_col", "int64_col", "float64_col", "string_col"], True),
+        pytest.param(
+            ["bool_col", "int64_col", "float64_col", "string_col"],
+            False,
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+    ],
+)
+def test_df_corr_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only):
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+    bf_result = scalars_df[columns].corr(numeric_only=numeric_only).to_pandas()
+    pd_result = scalars_pandas_df[columns].corr(numeric_only=numeric_only)
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_index_equal(bf_result.columns, pd_result.columns)
+    # Only check row order in ordered mode.
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+        check_dtype=False,
+        check_index_type=False,
+        check_like=not scalars_df._block.session._strictly_ordered,
+    )
+
+
+def test_df_corr_w_invalid_parameters(scalars_dfs):
+    columns = ["int64_too", "int64_col", "float64_col"]
+    scalars_df, _ = scalars_dfs
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(method="kendall")
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[columns].corr(min_periods=1)
+
+
+@pytest.mark.parametrize(
+    ("columns", "numeric_only"),
+    [
+        (["bool_col", "int64_col", "float64_col"], True),
+        (["bool_col", "int64_col", "float64_col"], False),
+        (["bool_col", "int64_col", "float64_col", "string_col"], True),
+        pytest.param(
+            ["bool_col", "int64_col", "float64_col", "string_col"],
+            False,
+            marks=pytest.mark.xfail(
+                raises=NotImplementedError,
+            ),
+        ),
+    ],
+)
+def test_cov_w_numeric_only(scalars_dfs_maybe_ordered, columns, numeric_only):
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+    bf_result = scalars_df[columns].cov(numeric_only=numeric_only).to_pandas()
+    pd_result = scalars_pandas_df[columns].cov(numeric_only=numeric_only)
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_index_equal(bf_result.columns, pd_result.columns)
+    # Only check row order in ordered mode.
+    pd.testing.assert_frame_equal(
+        bf_result,
+        pd_result,
+        check_dtype=False,
+        check_index_type=False,
+        check_like=not scalars_df._block.session._strictly_ordered,
+    )
+
+
+def test_df_corrwith_df(scalars_dfs_maybe_ordered):
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+    l_cols = ["int64_col", "float64_col", "int64_too"]
+    r_cols = ["int64_too", "float64_col"]
+
+    bf_result = scalars_df[l_cols].corrwith(scalars_df[r_cols]).to_pandas()
+    pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_cols])
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_corrwith_df_numeric_only(scalars_dfs):
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    l_cols = ["int64_col", "float64_col", "int64_too", "string_col"]
+    r_cols = ["int64_too", "float64_col", "bool_col"]
+
+    bf_result = (
+        scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=True).to_pandas()
+    )
+    pd_result = scalars_pandas_df[l_cols].corrwith(
+        scalars_pandas_df[r_cols], numeric_only=True
+    )
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_corrwith_df_non_numeric_error(scalars_dfs):
+    scalars_df, _ = scalars_dfs
+
+    l_cols = ["int64_col", "float64_col", "int64_too", "string_col"]
+    r_cols = ["int64_too", "float64_col", "bool_col"]
+
+    with pytest.raises(NotImplementedError):
+        scalars_df[l_cols].corrwith(scalars_df[r_cols], numeric_only=False)
+
+
+def test_df_corrwith_series(scalars_dfs_maybe_ordered):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+    l_cols = ["int64_col", "float64_col", "int64_too"]
+    r_col = "float64_col"
+
+    bf_result = scalars_df[l_cols].corrwith(scalars_df[r_col]).to_pandas()
+    pd_result = scalars_pandas_df[l_cols].corrwith(scalars_pandas_df[r_col])
+
+    # BigFrames and Pandas differ in their data type handling:
+    # - Column types: BigFrames uses Float64, Pandas uses float64.
+    # - Index types: BigFrames uses string, Pandas uses object.
+    pd.testing.assert_series_equal(
+        bf_result, pd_result, check_dtype=False, check_index_type=False
+    )
+
+
+@pytest.mark.parametrize(
+    ("op"),
+    [
+        operator.add,
+        operator.sub,
+        operator.mul,
+        operator.truediv,
+        operator.floordiv,
+        operator.eq,
+        operator.ne,
+        operator.gt,
+        operator.ge,
+        operator.lt,
+        operator.le,
+    ],
+    ids=[
+        "add",
+        "subtract",
+        "multiply",
+        "true_divide",
+        "floor_divide",
+        "eq",
+        "ne",
+        "gt",
+        "ge",
+        "lt",
+        "le",
+    ],
+)
+# TODO(garrettwu): deal with NA values
+@pytest.mark.parametrize(("other_scalar"), [1, 2.5, 0, 0.0])
+@pytest.mark.parametrize(("reverse_operands"), [True, False])
+def test_scalar_binop(scalars_dfs, op, other_scalar, reverse_operands):
+    scalars_df, scalars_pandas_df = scalars_dfs
+    columns = ["int64_col", "float64_col"]
+
+    maybe_reversed_op = (lambda x, y: op(y, x)) if reverse_operands else op
+
+    bf_result = maybe_reversed_op(scalars_df[columns], other_scalar).to_pandas()
+    pd_result = maybe_reversed_op(scalars_pandas_df[columns], other_scalar)
+
+    assert_pandas_df_equal(bf_result, pd_result)
+
+
+def test_dataframe_string_radd_const(scalars_dfs):
+    pytest.importorskip(
+        "pandas",
+        minversion="2.0.0",
+        reason="PyArrow string addition requires pandas 2.0+",
+    )
+
+    scalars_df, scalars_pandas_df = scalars_dfs
+    columns = ["string_col", "string_col"]
+
+    bf_result = ("prefix" + scalars_df[columns]).to_pandas()
+    pd_result = "prefix" + scalars_pandas_df[columns]
+
+    assert_pandas_df_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(("other_scalar"), [1, -2])
+def test_mod(scalars_dfs, other_scalar):
+    # Zero case excluded as pandas produces a 0 result for Int64 inputs rather than NA/NaN.
+    # This is likely a pandas bug, as mod 0 is undefined in other dtypes and in most programming languages.
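+    # (Illustrative: pd.Series([1], dtype="Int64") % 0 evaluates to 0 in pandas
+    # rather than pd.NA, hence only 1 and -2 are parametrized above.)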
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = (scalars_df[["int64_col", "int64_too"]] % other_scalar).to_pandas()
+ pd_result = scalars_pandas_df[["int64_col", "int64_too"]] % other_scalar
+
+ assert_pandas_df_equal(bf_result, pd_result)
+
+
+def test_scalar_binop_str_exception(scalars_dfs):
+ scalars_df, _ = scalars_dfs
+ columns = ["string_col"]
+ with pytest.raises(TypeError, match="Cannot add dtypes"):
+ (scalars_df[columns] + 1).to_pandas()
+
+
+@pytest.mark.parametrize(
+ ("op"),
+ [
+ (lambda x, y: x.add(y, axis="index")),
+ (lambda x, y: x.radd(y, axis="index")),
+ (lambda x, y: x.sub(y, axis="index")),
+ (lambda x, y: x.rsub(y, axis="index")),
+ (lambda x, y: x.mul(y, axis="index")),
+ (lambda x, y: x.rmul(y, axis="index")),
+ (lambda x, y: x.truediv(y, axis="index")),
+ (lambda x, y: x.rtruediv(y, axis="index")),
+ (lambda x, y: x.floordiv(y, axis="index")),
+ (lambda x, y: x.rfloordiv(y, axis="index")),
+ (lambda x, y: x.gt(y, axis="index")),
+ (lambda x, y: x.ge(y, axis="index")),
+ (lambda x, y: x.lt(y, axis="index")),
+ (lambda x, y: x.le(y, axis="index")),
+ ],
+ ids=[
+ "add",
+ "radd",
+ "sub",
+ "rsub",
+ "mul",
+ "rmul",
+ "truediv",
+ "rtruediv",
+ "floordiv",
+ "rfloordiv",
+ "gt",
+ "ge",
+ "lt",
+ "le",
+ ],
+)
+def test_series_binop_axis_index(
+ scalars_dfs,
+ op,
+):
+ scalars_df, scalars_pandas_df = scalars_dfs
+ df_columns = ["int64_col", "float64_col"]
+ series_column = "int64_too"
+
+ bf_result = op(scalars_df[df_columns], scalars_df[series_column]).to_pandas()
+ pd_result = op(scalars_pandas_df[df_columns], scalars_pandas_df[series_column])
+
+ assert_pandas_df_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+ ("input"),
+ [
+ ((1000, 2000, 3000)),
+ (pd.Index([1000, 2000, 3000])),
+ (pd.Series((1000, 2000), index=["int64_too", "float64_col"])),
+ ],
+ ids=[
+ "tuple",
+ "pd_index",
+ "pd_series",
+ ],
+)
+def test_listlike_binop_axis_1_in_memory_data(scalars_dfs, input):
+ # TODO: supply a reason why this isn't compatible with pandas 1.x
+ pytest.importorskip("pandas", minversion="2.0.0")
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ df_columns = ["int64_col", "float64_col", "int64_too"]
+
+ bf_result = scalars_df[df_columns].add(input, axis=1).to_pandas()
+ if hasattr(input, "to_pandas"):
+ input = input.to_pandas()
+ pd_result = scalars_pandas_df[df_columns].add(input, axis=1)
+
+ assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_df_reverse_binop_pandas(scalars_dfs):
+ # TODO: supply a reason why this isn't compatible with pandas 1.x
+ pytest.importorskip("pandas", minversion="2.0.0")
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ pd_series = pd.Series([100, 200, 300])
+
+ df_columns = ["int64_col", "float64_col", "int64_too"]
+
+ bf_result = (pd_series + scalars_df[df_columns]).to_pandas()
+ pd_result = pd_series + scalars_pandas_df[df_columns]
+
+ assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_listlike_binop_axis_1_bf_index(scalars_dfs):
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ df_columns = ["int64_col", "float64_col", "int64_too"]
+
+ bf_result = (
+ scalars_df[df_columns]
+ .add(bf_indexes.Index([1000, 2000, 3000]), axis=1)
+ .to_pandas()
+ )
+ pd_result = scalars_pandas_df[df_columns].add(pd.Index([1000, 2000, 3000]), axis=1)
+
+ assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_binop_with_self_aggregate(scalars_dfs_maybe_ordered):
+ scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+ df_columns = ["int64_col", "float64_col", "int64_too"]
+
+ # Ensure that this takes the optimized single-query path by counting executions
+ execution_count_before = scalars_df._session._metrics.execution_count
+ bf_df = scalars_df[df_columns]
+ bf_result = (bf_df - bf_df.mean()).to_pandas()
+ execution_count_after = scalars_df._session._metrics.execution_count
+
+ pd_df = scalars_pandas_df[df_columns]
+ pd_result = pd_df - pd_df.mean()
+
+ executions = execution_count_after - execution_count_before
+
+ assert executions == 1
+ assert_pandas_df_equal(bf_result, pd_result, check_dtype=False)
+
+
+def test_binop_with_self_aggregate_w_index_reset(scalars_dfs_maybe_ordered):
+ scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+
+ df_columns = ["int64_col", "float64_col", "int64_too"]
+
+ # Ensure that this takes the optimized single-query path by counting executions
+ execution_count_before = scalars_df._session._metrics.execution_count
+ bf_df = scalars_df[df_columns].reset_index(drop=True)
+ bf_result = (bf_df - bf_df.mean()).to_pandas()
+ execution_count_after = scalars_df._session._metrics.execution_count
+
+ pd_df = scalars_pandas_df[df_columns].reset_index(drop=True)
+ pd_result = pd_df - pd_df.mean()
+
+ executions = execution_count_after - execution_count_before
+
+ assert executions == 1
+ pd_result.index = pd_result.index.astype("Int64")
+ assert_pandas_df_equal(
+ bf_result, pd_result, check_dtype=False, check_index_type=False
+ )
+
+
+@pytest.mark.parametrize(
+ ("left_labels", "right_labels"),
+ [
+ (["a", "a", "b"], ["c", "c", "d"]),
+ (["a", "b", "c"], ["c", "a", "b"]),
+ (["a", "c", "c"], ["c", "a", "c"]),
+ (["a", "b", "c"], ["a", "b", "c"]),
+ ],
+ ids=[
+ "no_overlap",
+ "one_one_match",
+ "multi_match",
+ "exact_match",
+ ],
+)
+def test_binop_df_df_binary_op(
+ scalars_df_index,
+ scalars_df_2_index,
+ scalars_pandas_df_index,
+ left_labels,
+ right_labels,
+):
+ if pd.__version__.startswith("1."):
+ pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.")
+ columns = ["int64_too", "int64_col", "float64_col"]
+
+ bf_df_a = scalars_df_index[columns]
+ bf_df_a.columns = left_labels
+ bf_df_b = scalars_df_2_index[columns]
+ bf_df_b.columns = right_labels
+ bf_result = (bf_df_a - bf_df_b).to_pandas()
+
+ pd_df_a = scalars_pandas_df_index[columns]
+ pd_df_a.columns = left_labels
+ pd_df_b = scalars_pandas_df_index[columns]
+ pd_df_b.columns = right_labels
+ pd_result = pd_df_a - pd_df_b
+
+ # Some dtype inconsistency for all-NULL columns
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False)
+
+
+# Operations across different tables only work with an explicit index, since default index orders are arbitrary.
+@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_series_binop_add_different_table( + scalars_df_index, scalars_pandas_df_index, scalars_df_2_index, ordered +): + df_columns = ["int64_col", "float64_col"] + series_column = "int64_too" + + bf_result = ( + scalars_df_index[df_columns] + .add(scalars_df_2_index[series_column], axis="index") + .to_pandas(ordered=ordered) + ) + pd_result = scalars_pandas_df_index[df_columns].add( + scalars_pandas_df_index[series_column], axis="index" + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +# TODO(garrettwu): Test series binop with different index + +all_joins = pytest.mark.parametrize( + ("how",), + (("outer",), ("left",), ("right",), ("inner",), ("cross",)), +) + + +@all_joins +def test_join_same_table(scalars_dfs_maybe_ordered, how): + bf_df, pd_df = scalars_dfs_maybe_ordered + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("int64_too")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + + pd_df_a = pd_df.set_index("int64_too")[["string_col", "int64_col"]].sort_index() + pd_df_a = pd_df_a.sort_index() + + pd_df_b = pd_df.set_index("int64_too")[["float64_col"]] + pd_df_b = pd_df_b[pd_df_b.float64_col > 0] + pd_df_b = pd_df_b.sort_values("float64_col") + + pd_result = pd_df_a.join(pd_df_b, how=how) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +def test_join_incompatible_key_type_error(scalars_dfs): + bf_df, _ = scalars_dfs + + bf_df_a = bf_df.set_index("int64_too")[["string_col", "int64_col"]] + bf_df_a = bf_df_a.sort_index() + + bf_df_b = bf_df.set_index("date_col")[["float64_col"]] + bf_df_b = bf_df_b[bf_df_b.float64_col > 0] + bf_df_b = bf_df_b.sort_values("float64_col") + + with pytest.raises(TypeError): + # joining incompatible date, int columns + bf_df_a.join(bf_df_b, how="left") + + +@all_joins +def test_join_different_table( + scalars_df_index, scalars_df_2_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col"]] + bf_df_b = scalars_df_2_index.dropna()[["float64_col"]] + bf_result = bf_df_a.join(bf_df_b, how=how).to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col"]] + pd_df_b = scalars_pandas_df_index.dropna()[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_join_different_table_with_duplicate_column_name( + scalars_df_index, scalars_pandas_df_index, how +): + bf_df_a = scalars_df_index[["string_col", "int64_col", "int64_too"]].rename( + columns={"int64_too": "int64_col"} + ) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "int64_col", "int64_too"] + ].rename(columns={"int64_too": "int64_col"}) + bf_result = bf_df_a.join(bf_df_b, how=how, lsuffix="_l", rsuffix="_r").to_pandas() + pd_df_a = scalars_pandas_df_index[["string_col", "int64_col", "int64_too"]].rename( + columns={"int64_too": "int64_col"} + ) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "int64_col", "int64_too"] + ].rename(columns={"int64_too": "int64_col"}) + pd_result = pd_df_a.join(pd_df_b, how=how, lsuffix="_l", rsuffix="_r") + + # Ensure no inplace changes + pd.testing.assert_index_equal(bf_df_a.columns, pd_df_a.columns) + pd.testing.assert_index_equal(bf_df_b.index.to_pandas(), pd_df_b.index) 
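+ # With lsuffix="_l"/rsuffix="_r", every overlapping label (including the
+ # duplicated "int64_col") is disambiguated to "<label>_l"/"<label>_r".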
+ pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +@all_joins +def test_join_param_on_with_duplicate_column_name_not_on_col( + scalars_df_index, scalars_pandas_df_index, how +): + # This test is for duplicate column names, but the 'on' column is not duplicated. + if how == "cross": + return + bf_df_a = scalars_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_result = bf_df_a.join( + bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ).to_pandas() + pd_df_a = scalars_pandas_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_result = pd_df_a.join( + pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ) + pd.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_like=True, + check_index_type=False, + check_names=False, + ) + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@pytest.mark.skipif( + pandas.__version__.startswith("1."), reason="bad left join in pandas 1.x" +) +@all_joins +def test_join_param_on_with_duplicate_column_name_on_col( + scalars_df_index, scalars_pandas_df_index, how +): + # This test is for duplicate column names, and the 'on' column is duplicated. + if how == "cross": + return + bf_df_a = scalars_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_df_b = scalars_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + bf_result = bf_df_a.join( + bf_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ).to_pandas() + pd_df_a = scalars_pandas_df_index[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_df_b = scalars_pandas_df_index.dropna()[ + ["string_col", "datetime_col", "timestamp_col", "int64_too"] + ].rename(columns={"timestamp_col": "datetime_col"}) + pd_result = pd_df_a.join( + pd_df_b, on="int64_too", how=how, lsuffix="_l", rsuffix="_r" + ) + pd.testing.assert_frame_equal( + bf_result.sort_index(), + pd_result.sort_index(), + check_like=True, + check_index_type=False, + check_names=False, + ) + pd.testing.assert_index_equal(bf_result.columns, pd_result.columns) + + +@all_joins +def test_join_param_on(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_df_b = bf_df[["float64_col"]] + + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_df_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_df_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_df_b = pd_df[["float64_col"]] + pd_result = pd_df_a.join(pd_df_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@all_joins +def test_df_join_series(scalars_dfs, how): + bf_df, pd_df = scalars_dfs + + 
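+ # Joining a named Series behaves like joining a one-column DataFrame; as the
+ # cross-join branch below shows, combining `on=` with how="cross" raises.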
bf_df_a = bf_df[["string_col", "int64_col", "rowindex_2"]] + bf_df_a = bf_df_a.assign(rowindex_2=bf_df_a["rowindex_2"] + 2) + bf_series_b = bf_df["float64_col"] + + if how == "cross": + with pytest.raises(ValueError): + bf_df_a.join(bf_series_b, on="rowindex_2", how=how) + else: + bf_result = bf_df_a.join(bf_series_b, on="rowindex_2", how=how).to_pandas() + + pd_df_a = pd_df[["string_col", "int64_col", "rowindex_2"]] + pd_df_a = pd_df_a.assign(rowindex_2=pd_df_a["rowindex_2"] + 2) + pd_series_b = pd_df["float64_col"] + pd_result = pd_df_a.join(pd_series_b, on="rowindex_2", how=how) + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ("int64_col", False, "first"), + (["bool_col", "int64_col"], [False, True], "last"), + (["bool_col", "int64_col"], [True, False], "first"), + ], +) +def test_dataframe_sort_values( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_result = scalars_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ).to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("by", "ascending", "na_position"), + [ + ("int64_col", True, "first"), + (["bool_col", "int64_col"], True, "last"), + ], +) +def test_dataframe_sort_values_inplace( + scalars_df_index, scalars_pandas_df_index, by, ascending, na_position +): + # Test needs values to be unique + bf_sorted = scalars_df_index.copy() + bf_sorted.sort_values( + by, ascending=ascending, na_position=na_position, inplace=True + ) + bf_result = bf_sorted.to_pandas() + pd_result = scalars_pandas_df_index.sort_values( + by, ascending=ascending, na_position=na_position + ) + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_dataframe_sort_values_invalid_input(scalars_df_index): + with pytest.raises(KeyError): + scalars_df_index.sort_values(by=scalars_df_index["int64_col"]) + + +def test_dataframe_sort_values_stable(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.sort_values("int64_col", kind="stable") + .sort_values("bool_col", kind="stable") + .to_pandas() + ) + pd_result = scalars_pandas_df_index.sort_values( + "int64_col", kind="stable" + ).sort_values("bool_col", kind="stable") + + pandas.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("operator", "columns"), + [ + pytest.param(lambda x: x.cumsum(), ["float64_col", "int64_too"]), + pytest.param(lambda x: x.cumprod(), ["float64_col", "int64_too"]), + pytest.param( + lambda x: x.cumprod(), + ["string_col"], + marks=pytest.mark.xfail( + raises=ValueError, + ), + ), + ], + ids=[ + "cumsum", + "cumprod", + "non-numeric", + ], +) +def test_dataframe_numeric_analytic_op( + scalars_df_index, scalars_pandas_df_index, operator, columns +): + # TODO: Add nullable ints (pandas 1.x has poor behavior on these) + bf_series = operator(scalars_df_index[columns]) + pd_series = operator(scalars_pandas_df_index[columns]) + bf_result = bf_series.to_pandas() + pd.testing.assert_frame_equal(pd_series, bf_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("operator"), + [ + (lambda x: x.cummin()), + (lambda x: x.cummax()), + (lambda x: x.shift(2)), + (lambda x: x.shift(-2)), + ], + 
ids=[
+ "cummin",
+ "cummax",
+ "shiftpositive",
+ "shiftnegative",
+ ],
+)
+def test_dataframe_general_analytic_op(
+ scalars_df_index, scalars_pandas_df_index, operator
+):
+ col_names = ["int64_too", "float64_col", "int64_col", "bool_col"]
+ bf_series = operator(scalars_df_index[col_names])
+ pd_series = operator(scalars_pandas_df_index[col_names])
+ bf_result = bf_series.to_pandas()
+ pd.testing.assert_frame_equal(
+ pd_series,
+ bf_result,
+ )
+
+
+@pytest.mark.parametrize(
+ ("periods",),
+ [
+ (1,),
+ (2,),
+ (-1,),
+ ],
+)
+def test_dataframe_diff(scalars_df_index, scalars_pandas_df_index, periods):
+ col_names = ["int64_too", "float64_col", "int64_col"]
+ bf_result = scalars_df_index[col_names].diff(periods=periods).to_pandas()
+ pd_result = scalars_pandas_df_index[col_names].diff(periods=periods)
+ pd.testing.assert_frame_equal(
+ pd_result,
+ bf_result,
+ )
+
+
+@pytest.mark.parametrize(
+ ("periods",),
+ [
+ (1,),
+ (2,),
+ (-1,),
+ ],
+)
+def test_dataframe_pct_change(scalars_df_index, scalars_pandas_df_index, periods):
+ col_names = ["int64_too", "float64_col", "int64_col"]
+ bf_result = scalars_df_index[col_names].pct_change(periods=periods).to_pandas()
+ pd_result = scalars_pandas_df_index[col_names].pct_change(periods=periods)
+ pd.testing.assert_frame_equal(
+ pd_result,
+ bf_result,
+ )
+
+
+def test_dataframe_agg_single_string(scalars_dfs):
+ numeric_cols = ["int64_col", "int64_too", "float64_col"]
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = scalars_df[numeric_cols].agg("sum").to_pandas()
+ pd_result = scalars_pandas_df[numeric_cols].agg("sum")
+
+ assert bf_result.dtype == "Float64"
+ pd.testing.assert_series_equal(
+ pd_result, bf_result, check_dtype=False, check_index_type=False
+ )
+
+
+@pytest.mark.parametrize(
+ ("agg",),
+ (
+ ("sum",),
+ ("size",),
+ ),
+)
+def test_dataframe_agg_int_single_string(scalars_dfs, agg):
+ numeric_cols = ["int64_col", "int64_too", "bool_col"]
+ scalars_df, scalars_pandas_df = scalars_dfs
+
+ bf_result = scalars_df[numeric_cols].agg(agg).to_pandas()
+ pd_result = scalars_pandas_df[numeric_cols].agg(agg)
+
+ assert bf_result.dtype == "Int64"
+ pd.testing.assert_series_equal(
+ pd_result, bf_result, check_dtype=False, check_index_type=False
+ )
+
+
+def test_dataframe_agg_multi_string(scalars_dfs_maybe_ordered):
+ numeric_cols = ["int64_col", "int64_too", "float64_col"]
+ aggregations = [
+ "sum",
+ "mean",
+ "median",
+ "std",
+ "var",
+ "min",
+ "max",
+ "nunique",
+ "count",
+ ]
+ scalars_df, scalars_pandas_df = scalars_dfs_maybe_ordered
+ bf_result = scalars_df[numeric_cols].agg(aggregations)
+ pd_result = scalars_pandas_df[numeric_cols].agg(aggregations)
+
+ # Pandas may produce narrower numeric types, but bigframes always produces Float64
+ pd_result = pd_result.astype("Float64")
+
+ # Drop median, as it's an approximation.
+ bf_median = bf_result.loc["median", :]
+ bf_result = bf_result.drop(labels=["median"])
+ pd_result = pd_result.drop(labels=["median"])
+
+ assert_dfs_equivalent(pd_result, bf_result, check_index_type=False)
+
+ # Double-check that median is at least plausible.
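+ # Since the median is computed approximately, assert only the sandwich
+ # property min <= median <= max rather than exact equality with pandas.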
+ assert ( + (bf_result.loc["min", :] <= bf_median) & (bf_median <= bf_result.loc["max", :]) + ).all() + + +def test_dataframe_agg_int_multi_string(scalars_dfs): + numeric_cols = ["int64_col", "int64_too", "bool_col"] + aggregations = [ + "sum", + "nunique", + "count", + "size", + ] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[numeric_cols].agg(aggregations).to_pandas() + pd_result = scalars_pandas_df[numeric_cols].agg(aggregations) + + for dtype in bf_result.dtypes: + assert dtype == "Int64" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd.testing.assert_frame_equal( + pd_result, bf_result, check_dtype=False, check_index_type=False + ) + + +def test_df_transpose(): + # Include some floats to ensure type coercion + values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]] + # Test complex case of both axes being multi-indices with non-unique elements + + columns: pandas.Index = pd.Index( + ["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow") + ) + columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"]) + + index: pandas.Index = pd.Index( + ["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow") + ) + rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"]) + + pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi) + bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi) + + pd_result = pd_df.T + bf_result = bf_df.T.to_pandas() + + pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False) + + +def test_df_transpose_error(): + with pytest.raises(TypeError, match="Cannot coerce.*to a common type."): + dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose() + + +def test_df_transpose_repeated_uses_cache(): + bf_df = dataframe.DataFrame([[1, 2.5], [2, 3.5]]) + pd_df = pandas.DataFrame([[1, 2.5], [2, 3.5]]) + # Transposing many times so that operation will fail from complexity if not using cache + for i in range(10): + # Cache still works even with simple scalar binop + bf_df = bf_df.transpose() + i + pd_df = pd_df.transpose() + i + + pd.testing.assert_frame_equal( + pd_df, bf_df.to_pandas(), check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_stack(scalars_dfs, ordered): + if pandas.__version__.startswith("1.") or pandas.__version__.startswith("2.0"): + pytest.skip("pandas <2.1 uses different stack implementation") + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].stack().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].stack(future_stack=True) + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) + + +def test_df_melt_default(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].melt().to_pandas() + pd_result = 
scalars_pandas_df[columns].melt() + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + check_index_type=False, + check_dtype=False, + ) + + +def test_df_melt_parameterized(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + + bf_result = scalars_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ).to_pandas() + pd_result = scalars_pandas_df.melt( + var_name="alice", + value_name="bob", + id_vars=["string_col"], + value_vars=["int64_col", "int64_too"], + ) + + # Pandas produces int64 index, Bigframes produces Int64 (nullable) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_index_type=False, check_dtype=False + ) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_unstack(scalars_dfs, ordered): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = [ + "rowindex_2", + "int64_col", + "int64_too", + ] + + # unstack on mono-index produces series + bf_result = scalars_df[columns].unstack().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].unstack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + assert_series_equal( + bf_result, pd_result, check_dtype=False, ignore_order=not ordered + ) + + +@pytest.mark.parametrize( + ("values", "index", "columns"), + [ + ("int64_col", "int64_too", ["string_col"]), + (["int64_col"], "int64_too", ["string_col"]), + (["int64_col", "float64_col"], "int64_too", ["string_col"]), + ], +) +def test_df_pivot(scalars_dfs, values, index, columns): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.pivot( + values=values, index=index, columns=columns + ).to_pandas() + pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns) + + # Pandas produces NaN, where bq dataframes produces pd.NA + bf_result = bf_result.fillna(float("nan")) + pd_result = pd_result.fillna(float("nan")) + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("values", "index", "columns"), + [ + (["goals", "assists"], ["team_name", "season"], ["position"]), + (["goals", "assists"], ["season"], ["team_name", "position"]), + ], +) +def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, columns): + bf_result = ( + hockey_df.reset_index() + .pivot(values=values, index=index, columns=columns) + .to_pandas() + ) + pd_result = hockey_pandas_df.reset_index().pivot( + values=values, index=index, columns=columns + ) + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("values", "index", "columns", "aggfunc"), + [ + (("culmen_length_mm", "body_mass_g"), "species", "sex", "std"), + (["body_mass_g", "culmen_length_mm"], ("species", "island"), "sex", "sum"), + ("body_mass_g", "sex", ["island", "species"], "mean"), + ("culmen_depth_mm", "island", "species", "max"), + ], +) +def test_df_pivot_table( + penguins_df_default_index, + 
penguins_pandas_df_default_index, + values, + index, + columns, + aggfunc, +): + bf_result = penguins_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ).to_pandas() + pd_result = penguins_pandas_df_default_index.pivot_table( + values=values, index=index, columns=columns, aggfunc=aggfunc + ) + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_column_type=False + ) + + +def test_ipython_key_completions_with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert col_names not in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test_ipython_key_completions_with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = bf_dataframe._ipython_key_completions_() + + assert "string_col" not in results + assert "a_renamed_column" in results + assert results == expected + # _ipython_key_completions_ is called with square brackets + # so only column names are relevant with tab completion + assert "to_gbq" not in results + assert "merge" not in results + assert "drop" not in results + + +def test__dir__with_drop(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_names = "string_col" + bf_dataframe = scalars_df.drop(columns=col_names) + pd_dataframe = scalars_pandas_df.drop(columns=col_names) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert col_names not in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. + assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +def test__dir__with_rename(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + col_name_dict = {"string_col": "a_renamed_column"} + bf_dataframe = scalars_df.rename(columns=col_name_dict) + pd_dataframe = scalars_pandas_df.rename(columns=col_name_dict) + expected = pd_dataframe.columns.tolist() + + results = dir(bf_dataframe) + + assert "string_col" not in results + assert "a_renamed_column" in results + assert frozenset(expected) <= frozenset(results) + # __dir__ is called with a '.' and displays all methods, columns names, etc. 
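+ # Unlike _ipython_key_completions_ (used inside df[...]), dir() surfaces
+ # methods alongside column labels, so both kinds of names are asserted here.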
+ assert "to_gbq" in results + assert "merge" in results + assert "drop" in results + + +def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col", "int64_col", "int64_too"]].to_pandas() + pd_result = scalars_pandas_df_index[["int64_col", "int64_col", "int64_too"]] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (0, 0, None), + (None, None, None), + (1, None, None), + (None, 4, None), + (None, None, 2), + (None, 50000000000, 1), + (5, 4, None), + (3, None, 2), + (1, 7, 2), + (1, 7, 50000000000), + ], +) +def test_iloc_slice(scalars_df_index, scalars_pandas_df_index, start, stop, step): + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index.iloc[start:stop:step] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + ("start", "stop", "step"), + [ + (0, 0, None), + ], +) +def test_iloc_slice_after_cache( + scalars_df_index, scalars_pandas_df_index, start, stop, step +): + scalars_df_index.cache() + bf_result = scalars_df_index.iloc[start:stop:step].to_pandas() + pd_result = scalars_pandas_df_index.iloc[start:stop:step] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_iloc_slice_zero_step(scalars_df_index): + with pytest.raises(ValueError): + scalars_df_index.iloc[0:0:0] + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_iloc_slice_nested(scalars_df_index, scalars_pandas_df_index, ordered): + bf_result = scalars_df_index.iloc[1:].iloc[1:].to_pandas(ordered=ordered) + pd_result = scalars_pandas_df_index.iloc[1:].iloc[1:] + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=not ordered) + + +@pytest.mark.parametrize( + "index", + [0, 5, -2, (2,)], +) +def test_iloc_single_integer(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iloc_tuple(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + "index", + [(slice(None), [1, 2, 3]), (slice(1, 7, 2), [2, 5, 3])], +) +def test_iloc_tuple_multi_columns(scalars_df_index, scalars_pandas_df_index, index): + bf_result = scalars_df_index.iloc[index].to_pandas() + pd_result = scalars_pandas_df_index.iloc[index] + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_iloc_tuple_multi_columns_single_row(scalars_df_index, scalars_pandas_df_index): + index = (2, [2, 1, 3, -4]) + bf_result = scalars_df_index.iloc[index] + pd_result = scalars_pandas_df_index.iloc[index] + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("index", "error"), + [ + ((1, 1, 1), pd.errors.IndexingError), + (("asd", "asd", "asd"), pd.errors.IndexingError), + (("asd"), TypeError), + ], +) +def test_iloc_tuple_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_df_index.iloc[index] + with pytest.raises(error): + scalars_pandas_df_index.iloc[index] + + +@pytest.mark.parametrize( + "index", + [(2, 5), (5, 0), (0, 0)], +) +def test_iat(scalars_df_index, scalars_pandas_df_index, index): + 
bf_result = scalars_df_index.iat[index] + pd_result = scalars_pandas_df_index.iat[index] + + assert bf_result == pd_result + + +@pytest.mark.parametrize( + ("index", "error"), + [ + (0, TypeError), + ("asd", ValueError), + ((1, 2, 3), TypeError), + (("asd", "asd"), ValueError), + ], +) +def test_iat_errors(scalars_df_index, scalars_pandas_df_index, index, error): + with pytest.raises(error): + scalars_pandas_df_index.iat[index] + with pytest.raises(error): + scalars_df_index.iat[index] + + +def test_iloc_single_integer_out_of_bound_error(scalars_df_index): + with pytest.raises(IndexError, match="single positional indexer is out-of-bounds"): + scalars_df_index.iloc[99] + + +def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[scalars_df_index.bool_col].to_pandas() + pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.bool_col] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): + idx_list = [0, 3, 5] + bf_result = scalars_df_index.loc[idx_list, ["bool_col", "int64_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[idx_list, ["bool_col", "int64_col"]] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, "int64_col"].to_pandas() + pd_result = scalars_pandas_df_index.loc[:, "int64_col"] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, scalars_df_index.dtypes == "Int64"].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.dtypes == "Int64" + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + # (b/347072677) GEOGRAPH type doesn't support DISTINCT op + columns = [ + item for item in scalars_pandas_df_index.columns if item != "geography_col" + ] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + size_half = len(scalars_pandas_df_index) / 2 + bf_result = scalars_df_index.loc[ + :, scalars_df_index.nunique() > size_half + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + :, scalars_pandas_df_index.nunique() > size_half + ] + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" 
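+ # "Hello, World!" occurs multiple times in string_col, so .loc[scalar]
+ # selects several rows and returns a DataFrame rather than a Series.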
+ bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.loc[index] + pd_result = scalars_pandas_df_index.loc[index] + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_at_with_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("string_col", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index( + "string_col", drop=False + ) + index = "Hello, World!" + bf_result = scalars_df_index.at[index, "int64_too"] + pd_result = scalars_pandas_df_index.at[index, "int64_too"] + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_at_no_duplicate(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.set_index("int64_too", drop=False) + scalars_pandas_df_index = scalars_pandas_df_index.set_index("int64_too", drop=False) + index = -2345 + bf_result = scalars_df_index.at[index, "string_col"] + pd_result = scalars_pandas_df_index.at[index, "string_col"] + assert bf_result == pd_result + + +def test_loc_setitem_bool_series_scalar_new_col(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 0, "new_col"] = 99 + pd_df.loc[pd_df["int64_too"] == 0, "new_col"] = 99 + + # pandas uses float64 instead + pd_df["new_col"] = pd_df["new_col"].astype("Float64") + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +@pytest.mark.parametrize( + ("col", "value"), + [ + ("string_col", "hello"), + ("int64_col", 3), + ("float64_col", 3.5), + ], +) +def test_loc_setitem_bool_series_scalar_existing_col(scalars_dfs, col, value): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["int64_too"] == 1, col] = value + pd_df.loc[pd_df["int64_too"] == 1, col] = value + + pd.testing.assert_frame_equal( + bf_df.to_pandas(), + pd_df, + ) + + +def test_loc_setitem_bool_series_scalar_error(scalars_dfs): + if pd.__version__.startswith("1."): + pytest.skip("this loc overload not supported in pandas 1.x.") + + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(Exception): + bf_df.loc[bf_df["int64_too"] == 1, "string_col"] = 99 + with pytest.raises(Exception): + pd_df.loc[pd_df["int64_too"] == 1, "string_col"] = 99 + + +@pytest.mark.parametrize( + ("col", "op"), + [ + # Int aggregates + pytest.param("int64_col", lambda x: x.sum(), id="int-sum"), + pytest.param("int64_col", lambda x: x.min(), id="int-min"), + pytest.param("int64_col", lambda x: x.max(), id="int-max"), + pytest.param("int64_col", lambda x: x.count(), id="int-count"), + pytest.param("int64_col", lambda x: x.nunique(), id="int-nunique"), + # Float aggregates + pytest.param("float64_col", lambda x: x.count(), id="float-count"), + pytest.param("float64_col", lambda x: x.nunique(), id="float-nunique"), + # Bool aggregates + pytest.param("bool_col", lambda x: 
x.sum(), id="bool-sum"), + pytest.param("bool_col", lambda x: x.count(), id="bool-count"), + pytest.param("bool_col", lambda x: x.nunique(), id="bool-nunique"), + # String aggregates + pytest.param("string_col", lambda x: x.count(), id="string-count"), + pytest.param("string_col", lambda x: x.nunique(), id="string-nunique"), + ], +) +def test_dataframe_aggregate_int(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "Int64" + # Is otherwise "object" dtype + pd_result.index = pd_result.index.astype("string[pyarrow]") + # Pandas may produce narrower numeric types + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("col", "op"), + [ + pytest.param("bool_col", lambda x: x.min(), id="bool-min"), + pytest.param("bool_col", lambda x: x.max(), id="bool-max"), + ], +) +def test_dataframe_aggregate_bool(scalars_df_index, scalars_pandas_df_index, col, op): + bf_result = op(scalars_df_index[[col]]).to_pandas() + pd_result = op(scalars_pandas_df_index[[col]]) + + # Check dtype separately + assert bf_result.dtype == "boolean" + + # Pandas may produce narrower numeric types + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equal(pd_result, bf_result, check_dtype=False, check_index_type=False) + + +@pytest.mark.parametrize( + ("op", "bf_dtype"), + [ + (lambda x: x.sum(numeric_only=True), "Float64"), + (lambda x: x.mean(numeric_only=True), "Float64"), + (lambda x: x.min(numeric_only=True), "Float64"), + (lambda x: x.max(numeric_only=True), "Float64"), + (lambda x: x.std(numeric_only=True), "Float64"), + (lambda x: x.var(numeric_only=True), "Float64"), + (lambda x: x.count(numeric_only=False), "Int64"), + (lambda x: x.nunique(), "Int64"), + ], + ids=["sum", "mean", "min", "max", "std", "var", "count", "nunique"], +) +def test_dataframe_aggregates(scalars_dfs_maybe_ordered, op, bf_dtype): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + col_names = ["int64_too", "float64_col", "string_col", "int64_col", "bool_col"] + bf_series = op(scalars_df_index[col_names]) + bf_result = bf_series + pd_result = op(scalars_pandas_df_index[col_names]) + + # Check dtype separately + assert bf_result.dtype == bf_dtype + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + # Pandas has object index type + pd_result.index = pd_result.index.astype("string[pyarrow]") + assert_series_equivalent( + pd_result, + bf_result, + check_dtype=False, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.sum(axis=1, numeric_only=True)), + (lambda x: x.mean(axis=1, numeric_only=True)), + (lambda x: x.min(axis=1, numeric_only=True)), + (lambda x: x.max(axis=1, numeric_only=True)), + (lambda x: x.std(axis=1, numeric_only=True)), + (lambda x: x.var(axis=1, numeric_only=True)), + ], + ids=["sum", "mean", "min", "max", "std", "var"], +) +def test_dataframe_aggregates_axis_1(scalars_df_index, scalars_pandas_df_index, op): + col_names = ["int64_too", "int64_col", "float64_col", "bool_col", "string_col"] + bf_result = op(scalars_df_index[col_names]).to_pandas() + pd_result = op(scalars_pandas_df_index[col_names]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + # Pandas has object index 
type + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_dataframe_aggregates_median(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col"] + bf_result = scalars_df_index[col_names].median(numeric_only=True).to_pandas() + pd_result = scalars_pandas_df_index[col_names].agg(["min", "max"]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + # Median is an approximation, but double-check that median is plausible. + for col in col_names: + assert (pd_result.loc["min", col] <= bf_result[col]) and ( + bf_result[col] <= pd_result.loc["max", col] + ) + + +def test_dataframe_aggregates_quantile_mono(scalars_df_index, scalars_pandas_df_index): + q = 0.45 + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(bf_result, pd_result, check_index_type=False) + + +def test_dataframe_aggregates_quantile_multi(scalars_df_index, scalars_pandas_df_index): + q = [0, 0.33, 0.67, 1.0] + col_names = ["int64_too", "int64_col", "float64_col"] + bf_result = scalars_df_index[col_names].quantile(q=q).to_pandas() + pd_result = scalars_pandas_df_index[col_names].quantile(q=q) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pd_result.index = pd_result.index.astype("Float64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("op"), + [ + (lambda x: x.all(bool_only=True)), + (lambda x: x.any(bool_only=True)), + (lambda x: x.all(axis=1, bool_only=True)), + (lambda x: x.any(axis=1, bool_only=True)), + ], + ids=["all_axis0", "any_axis0", "all_axis1", "any_axis1"], +) +def test_dataframe_bool_aggregates(scalars_df_index, scalars_pandas_df_index, op): + # Pandas will drop nullable 'boolean' dtype so we convert first to bool, then cast back later + scalars_df_index = scalars_df_index.assign( + bool_col=scalars_df_index.bool_col.fillna(False) + ) + scalars_pandas_df_index = scalars_pandas_df_index.assign( + bool_col=scalars_pandas_df_index.bool_col.fillna(False).astype("bool") + ) + bf_series = op(scalars_df_index) + pd_series = op(scalars_pandas_df_index).astype("boolean") + bf_result = bf_series.to_pandas() + + pd_series.index = pd_series.index.astype(bf_result.index.dtype) + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_dataframe_prod(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col"] + bf_series = scalars_df_index[col_names].prod() + pd_series = scalars_pandas_df_index[col_names].prod() + bf_result = bf_series.to_pandas() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_series = pd_series.astype("Float64") + # Pandas has object index type + pd.testing.assert_series_equal(pd_series, bf_result, check_index_type=False) + + +def test_df_skew_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).skew().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).skew() + + # Pandas may produce narrower numeric types, but 
bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("ordered"), + [ + (True), + (False), + ], +) +def test_df_skew(scalars_dfs, ordered): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].skew().to_pandas(ordered=ordered) + pd_result = scalars_pandas_df[columns].skew() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + assert_series_equal( + pd_result, bf_result, check_index_type=False, ignore_order=not ordered + ) + + +def test_df_kurt_too_few_values(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].head(2).kurt().to_pandas() + pd_result = scalars_pandas_df[columns].head(2).kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +def test_df_kurt(scalars_dfs): + columns = ["float64_col", "int64_col"] + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df[columns].kurt().to_pandas() + pd_result = scalars_pandas_df[columns].kurt() + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + +@pytest.mark.parametrize( + ("frac", "n", "random_state"), + [ + (None, 4, None), + (0.5, None, None), + (None, 4, 10), + (0.5, None, 10), + (None, None, None), + ], + ids=[ + "n_wo_random_state", + "frac_wo_random_state", + "n_w_random_state", + "frac_w_random_state", + "n_default", + ], +) +def test_sample(scalars_dfs, frac, n, random_state): + scalars_df, _ = scalars_dfs + df = scalars_df.sample(frac=frac, n=n, random_state=random_state) + bf_result = df.to_pandas() + + n = 1 if n is None else n + expected_sample_size = round(frac * scalars_df.shape[0]) if frac is not None else n + assert bf_result.shape[0] == expected_sample_size + assert bf_result.shape[1] == scalars_df.shape[1] + + +def test_sample_determinism(penguins_df_default_index): + df = penguins_df_default_index.sample(n=100, random_state=12345).head(15) + bf_result = df.to_pandas() + bf_result2 = df.to_pandas() + + pandas.testing.assert_frame_equal(bf_result, bf_result2) + + +def test_sample_raises_value_error(scalars_dfs): + scalars_df, _ = scalars_dfs + with pytest.raises( + ValueError, match="Only one of 'n' or 'frac' parameter can be specified." 
+ ):
+ scalars_df.sample(frac=0.5, n=4)
+
+
+def test_sample_args_sort(scalars_dfs):
+ scalars_df, _ = scalars_dfs
+ index = [4, 3, 2, 5, 1, 0]
+ scalars_df = scalars_df.iloc[index]
+
+ kwargs = {"frac": 1.0, "random_state": 333}
+
+ df = scalars_df.sample(**kwargs).to_pandas()
+ assert df.index.tolist() != index
+ assert df.index.tolist() != sorted(index)
+
+ df = scalars_df.sample(sort="random", **kwargs).to_pandas()
+ assert df.index.tolist() != index
+ assert df.index.tolist() != sorted(index)
+
+ df = scalars_df.sample(sort=True, **kwargs).to_pandas()
+ assert df.index.tolist() == sorted(index)
+
+ df = scalars_df.sample(sort=False, **kwargs).to_pandas()
+ assert df.index.tolist() == index
+
+
+@pytest.mark.parametrize(
+ ("axis",),
+ [
+ (None,),
+ (0,),
+ (1,),
+ ],
+)
+def test_df_add_prefix(scalars_df_index, scalars_pandas_df_index, axis):
+ if pd.__version__.startswith("1."):
+ pytest.skip("add_prefix axis parameter not supported in pandas 1.x.")
+ bf_result = scalars_df_index.add_prefix("prefix_", axis).to_pandas()
+
+ pd_result = scalars_pandas_df_index.add_prefix("prefix_", axis)
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ check_index_type=False,
+ )
+
+
+@pytest.mark.parametrize(
+ ("axis",),
+ [
+ (0,),
+ (1,),
+ ],
+)
+def test_df_add_suffix(scalars_df_index, scalars_pandas_df_index, axis):
+ if pd.__version__.startswith("1."):
+ pytest.skip("add_suffix axis parameter not supported in pandas 1.x.")
+ bf_result = scalars_df_index.add_suffix("_suffix", axis).to_pandas()
+
+ pd_result = scalars_pandas_df_index.add_suffix("_suffix", axis)
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ check_index_type=False,
+ )
+
+
+def test_df_astype_error_error(session):
+ input = pd.DataFrame(["hello", "world", "3.11", "4000"])
+ with pytest.raises(ValueError):
+ session.read_pandas(input).astype("Float64", errors="bad_value")
+
+
+def test_df_columns_filter_items(scalars_df_index, scalars_pandas_df_index):
+ if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
+ pytest.skip("pandas filter items behavior different pre-2.1")
+ bf_result = scalars_df_index.filter(items=["string_col", "int64_col"]).to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(items=["string_col", "int64_col"])
+ # Ignore column ordering as pandas orders columns differently depending on version
+ pd.testing.assert_frame_equal(
+ bf_result.sort_index(axis=1),
+ pd_result.sort_index(axis=1),
+ )
+
+
+def test_df_columns_filter_like(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.filter(like="64_col").to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(like="64_col")
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_columns_filter_regex(scalars_df_index, scalars_pandas_df_index):
+ bf_result = scalars_df_index.filter(regex="^[^_]+$").to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(regex="^[^_]+$")
+
+ pd.testing.assert_frame_equal(
+ bf_result,
+ pd_result,
+ )
+
+
+def test_df_rows_filter_items(scalars_df_index, scalars_pandas_df_index):
+ if pd.__version__.startswith("2.0") or pd.__version__.startswith("1."):
+ pytest.skip("pandas filter items behavior different pre-2.1")
+ bf_result = scalars_df_index.filter(items=[5, 1, 3], axis=0).to_pandas()
+
+ pd_result = scalars_pandas_df_index.filter(items=[5, 1, 3], axis=0)
+
+ # Pandas uses int64 instead of Int64 (nullable) dtype.
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Ignore ordering as pandas order differently depending on version + assert_pandas_df_equal( + bf_result, + pd_result, + ignore_order=True, + check_names=False, + ) + + +def test_df_rows_filter_like(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index.filter(like="ello", axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(like="ello", axis=0) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_rows_filter_regex(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy().set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.copy().set_index("string_col") + + bf_result = scalars_df_index.filter(regex="^[GH].*", axis=0).to_pandas() + + pd_result = scalars_pandas_df_index.filter(regex="^[GH].*", axis=0) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_rows_list(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + bf_result = scalars_df_index.reindex(index=[5, 1, 3, 99, 1]) + + pd_result = scalars_pandas_df_index.reindex(index=[5, 1, 3, 99, 1]) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +def test_df_reindex_rows_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + index=pd.Index([5, 1, 3, 99, 1], name="newname") + ) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_nonunique(scalars_df_index): + with pytest.raises(ValueError): + # int64_too is non-unique + scalars_df_index.set_index("int64_too").reindex( + index=[5, 1, 3, 99, 1], validate=True + ) + + +def test_df_reindex_columns(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ).to_pandas() + + pd_result = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"] + ) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_reindex_columns_with_same_order(scalars_df_index, scalars_pandas_df_index): + # First, make sure the two dataframes have the same columns in order. 
+ columns = ["int64_col", "int64_too"] + bf = scalars_df_index[columns] + pd_df = scalars_pandas_df_index[columns] + + bf_result = bf.reindex(columns=columns).to_pandas() + pd_result = pd_df.reindex(columns=columns) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_equals_identical(scalars_df_index, scalars_pandas_df_index): + unsupported = [ + "geography_col", + ] + scalars_df_index = scalars_df_index.drop(columns=unsupported) + scalars_pandas_df_index = scalars_pandas_df_index.drop(columns=unsupported) + + bf_result = scalars_df_index.equals(scalars_df_index) + pd_result = scalars_pandas_df_index.equals(scalars_pandas_df_index) + + assert pd_result == bf_result + + +def test_df_equals_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index[["int64_col"]].equals(scalars_df_index["int64_col"]) + pd_result = scalars_pandas_df_index[["int64_col"]].equals( + scalars_pandas_df_index["int64_col"] + ) + + assert pd_result == bf_result + + +def test_df_equals_different_dtype(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified = bf_modified.astype("Float64") + + pd_modified = scalars_pandas_df_index.copy() + pd_modified = pd_modified.astype("Float64") + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_different_values(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + scalars_df_index = scalars_df_index[columns] + scalars_pandas_df_index = scalars_pandas_df_index[columns] + + bf_modified = scalars_df_index.copy() + bf_modified["int64_col"] = bf_modified.int64_col + 1 + + pd_modified = scalars_pandas_df_index.copy() + pd_modified["int64_col"] = pd_modified.int64_col + 1 + + bf_result = scalars_df_index.equals(bf_modified) + pd_result = scalars_pandas_df_index.equals(pd_modified) + + assert pd_result == bf_result + + +def test_df_equals_extra_column(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_col", "int64_too"] + more_columns = ["int64_col", "int64_too", "float64_col"] + + bf_result = scalars_df_index[columns].equals(scalars_df_index[more_columns]) + pd_result = scalars_pandas_df_index[columns].equals( + scalars_pandas_df_index[more_columns] + ) + + assert pd_result == bf_result + + +def test_df_reindex_like(scalars_df_index, scalars_pandas_df_index): + reindex_target_bf = scalars_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + bf_result = scalars_df_index.reindex_like(reindex_target_bf).to_pandas() + + reindex_target_pd = scalars_pandas_df_index.reindex( + columns=["not_a_col", "int64_col", "int64_too"], index=[5, 1, 3, 99, 1] + ) + pd_result = scalars_pandas_df_index.reindex_like(reindex_target_pd) + + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + # Pandas uses int64 instead of Int64 (nullable) dtype. 
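+ # reindex_like adopts both axes of the target: the row labels
+ # [5, 1, 3, 99, 1] and the columns, including the empty "not_a_col".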
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + # Pandas uses float64 as default for newly created empty column, bf uses Float64 + pd_result.not_a_col = pd_result.not_a_col.astype(pandas.Float64Dtype()) + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_values(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.values + + pd_result = scalars_pandas_df_index.values + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df_to_numpy(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.to_numpy() + + pd_result = scalars_pandas_df_index.to_numpy() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +def test_df___array__(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.__array__() + + pd_result = scalars_pandas_df_index.__array__() + # Numpy isn't equipped to compare non-numeric objects, so convert back to dataframe + pd.testing.assert_frame_equal( + pd.DataFrame(bf_result), pd.DataFrame(pd_result), check_dtype=False + ) + + +@pytest.mark.parametrize( + ("key",), + [ + ("hello",), + (2,), + ("int64_col",), + (None,), + ], +) +def test_df_contains(scalars_df_index, scalars_pandas_df_index, key): + bf_result = key in scalars_df_index + pd_result = key in scalars_pandas_df_index + + assert bf_result == pd_result + + +def test_df_getattr_attribute_error_when_pandas_has(scalars_df_index): + # swapaxes is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): + scalars_df_index.swapaxes() + + +def test_df_getattr_attribute_error(scalars_df_index): + with pytest.raises(AttributeError): + scalars_df_index.not_a_method() + + +def test_df_getattr_axes(): + df = dataframe.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + assert isinstance(df.index, bigframes.core.indexes.Index) + assert isinstance(df.columns, pandas.Index) + assert isinstance(df.my_column, series.Series) + + +def test_df_setattr_index(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + + pd_df.index = pandas.Index([4, 5]) + bf_df.index = [4, 5] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_columns(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + + pd_df.columns = typing.cast(pandas.Index, pandas.Index([4, 5, 6])) + + bf_df.columns = pandas.Index([4, 5, 6]) + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_df_setattr_modify_column(): + pd_df = pandas.DataFrame( + [[1, 1, 1], [1, 1, 1]], columns=["index", "columns", "my_column"] + ) + bf_df = dataframe.DataFrame(pd_df) + pd_df.my_column = [4, 5] + bf_df.my_column = [4, 5] + + assert_pandas_df_equal( + pd_df, bf_df.to_pandas(), check_index_type=False, check_dtype=False + ) + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + index_list = scalars_pandas_df_index.string_col.iloc[[0, 1, 1, 5]].values + + scalars_df_index = scalars_df_index.set_index("string_col") + 
scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[index_list].to_pandas() + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + index_list = [3, 2, 1, 3, 2, 1] + + bf_result = scalars_df_index.loc[index_list] + pd_result = scalars_pandas_df_index.loc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_list_multiindex(scalars_dfs_maybe_ordered): + scalars_df_index, scalars_pandas_df_index = scalars_dfs_maybe_ordered + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + index_list = [("Hello, World!", -234892), ("Hello, World!", 123456789)] + + bf_result = scalars_df_multiindex.loc[index_list] + pd_result = scalars_pandas_df_multiindex.loc[index_list] + + assert_dfs_equivalent( + pd_result, + bf_result, + ) + + +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list(scalars_df_index, scalars_pandas_df_index, index_list): + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + "index_list", + [ + [0, 1, 2, 3, 4, 4], + [0, 0, 0, 5, 4, 7, -2, -5, 3], + [-1, -2, -3, -4, -5, -5], + ], +) +def test_iloc_list_partial_ordering( + scalars_df_partial_ordering, scalars_pandas_df_index, index_list +): + bf_result = scalars_df_partial_ordering.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_list_multiindex(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + scalars_df = scalars_df.copy() + scalars_pandas_df = scalars_pandas_df.copy() + scalars_df = scalars_df.set_index(["bytes_col", "numeric_col"]) + scalars_pandas_df = scalars_pandas_df.set_index(["bytes_col", "numeric_col"]) + + index_list = [0, 0, 0, 5, 4, 7] + + bf_result = scalars_df.iloc[index_list] + pd_result = scalars_pandas_df.iloc[index_list] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_iloc_empty_list(scalars_df_index, scalars_pandas_df_index): + + index_list: List[int] = [] + + bf_result = scalars_df_index.iloc[index_list] + pd_result = scalars_pandas_df_index.iloc[index_list] + + bf_result = bf_result.to_pandas() + assert bf_result.shape == pd_result.shape # types are known to be different + + +def test_rename_axis(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis("newindexname") + pd_result = scalars_pandas_df_index.rename_axis("newindexname") + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_rename_axis_nonstring(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.rename_axis((4,)) + pd_result = scalars_pandas_df_index.rename_axis((4,)) + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = 
scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_index = scalars_df_index.set_index("string_col") + scalars_pandas_df_index = scalars_pandas_df_index.set_index("string_col") + + bf_result = scalars_df_index.loc[bf_string_series] + pd_result = scalars_pandas_df_index.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + pd_string_series = scalars_pandas_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + bf_string_series = scalars_df_index.string_col.iloc[[0, 5, 1, 1, 5]] + + scalars_df_multiindex = scalars_df_index.set_index(["string_col", "int64_col"]) + scalars_pandas_df_multiindex = scalars_pandas_df_index.set_index( + ["string_col", "int64_col"] + ) + + bf_result = scalars_df_multiindex.loc[bf_string_series] + pd_result = scalars_pandas_df_multiindex.loc[pd_string_series] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_loc_bf_index_integer_index_renamed_col( + scalars_df_index, scalars_pandas_df_index +): + scalars_df_index = scalars_df_index.rename(columns={"int64_col": "rename"}) + scalars_pandas_df_index = scalars_pandas_df_index.rename( + columns={"int64_col": "rename"} + ) + + pd_index = scalars_pandas_df_index.iloc[[0, 5, 1, 1, 5]].index + bf_index = scalars_df_index.iloc[[0, 5, 1, 1, 5]].index + + bf_result = scalars_df_index.loc[bf_index] + pd_result = scalars_pandas_df_index.loc[pd_index] + + pd.testing.assert_frame_equal( + bf_result.to_pandas(), + pd_result, + ) + + +@pytest.mark.parametrize( + ("subset"), + [ + None, + "bool_col", + ["bool_col", "int64_too"], + ], +) +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_df_drop_duplicates(scalars_df_index, scalars_pandas_df_index, keep, subset): + columns = ["bool_col", "int64_too", "int64_col"] + bf_df = scalars_df_index[columns].drop_duplicates(subset, keep=keep).to_pandas() + pd_df = scalars_pandas_df_index[columns].drop_duplicates(subset, keep=keep) + pd.testing.assert_frame_equal( + pd_df, + bf_df, + ) + + +@pytest.mark.parametrize( + ("keep",), + [ + ("first",), + ("last",), + (False,), + ], +) +def test_df_drop_duplicates_w_json(json_df, keep): + bf_df = json_df.drop_duplicates(keep=keep).to_pandas() + + # drop_duplicates relies on pa.compute.dictionary_encode, which is incompatible + # with Arrow string extension types. Temporary conversion to standard Pandas + # strings is required. 
+    json_pandas_df = json_df.to_pandas()
+    json_pandas_df["json_col"] = json_pandas_df["json_col"].astype(
+        pd.StringDtype(storage="pyarrow")
+    )
+
+    pd_df = json_pandas_df.drop_duplicates(keep=keep)
+    pd_df["json_col"] = pd_df["json_col"].astype(dtypes.JSON_DTYPE)
+    pd.testing.assert_frame_equal(
+        pd_df,
+        bf_df,
+    )
+
+
+@pytest.mark.parametrize(
+    ("subset"),
+    [
+        None,
+        ["bool_col"],
+    ],
+)
+@pytest.mark.parametrize(
+    ("keep",),
+    [
+        ("first",),
+        ("last",),
+        (False,),
+    ],
+)
+def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset):
+    columns = ["bool_col", "int64_too", "int64_col"]
+    bf_series = scalars_df_index[columns].duplicated(subset, keep=keep).to_pandas()
+    pd_series = scalars_pandas_df_index[columns].duplicated(subset, keep=keep)
+    pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False)
+
+
+def test_df_from_dict_columns_orient():
+    data = {"a": [1, 2], "b": [3.3, 2.4]}
+    bf_result = dataframe.DataFrame.from_dict(data, orient="columns").to_pandas()
+    pd_result = pd.DataFrame.from_dict(data, orient="columns")
+    assert_pandas_df_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_from_dict_index_orient():
+    data = {"a": [1, 2], "b": [3.3, 2.4]}
+    bf_result = dataframe.DataFrame.from_dict(
+        data, orient="index", columns=["col1", "col2"]
+    ).to_pandas()
+    pd_result = pd.DataFrame.from_dict(data, orient="index", columns=["col1", "col2"])
+    assert_pandas_df_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_from_dict_tight_orient():
+    data = {
+        "index": [("i1", "i2"), ("i3", "i4")],
+        "columns": ["col1", "col2"],
+        "data": [[1, 2.6], [3, 4.5]],
+        "index_names": ["in1", "in2"],
+        "column_names": ["column_axis"],
+    }
+
+    bf_result = dataframe.DataFrame.from_dict(data, orient="tight").to_pandas()
+    pd_result = pd.DataFrame.from_dict(data, orient="tight")
+    assert_pandas_df_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_from_records():
+    records = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d"))
+
+    bf_result = dataframe.DataFrame.from_records(
+        records, columns=["c1", "c2"]
+    ).to_pandas()
+    pd_result = pd.DataFrame.from_records(records, columns=["c1", "c2"])
+    assert_pandas_df_equal(
+        pd_result, bf_result, check_dtype=False, check_index_type=False
+    )
+
+
+def test_df_to_dict(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+    bf_result = scalars_df_index.drop(columns=unsupported).to_dict()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_dict()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_excel(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["timestamp_col"]
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.drop(columns=unsupported).to_excel(bf_result_file)
+        scalars_pandas_df_index.drop(columns=unsupported).to_excel(pd_result_file)
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_latex(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+    bf_result = scalars_df_index.drop(columns=unsupported).to_latex()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_latex()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_json_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_json()
+    # default_handler for arrow types that have no default conversion
+    pd_result = scalars_pandas_df_index.to_json(default_handler=str)
+
+    assert bf_result == pd_result
+
+
+def test_df_to_json_local_file(scalars_df_index, scalars_pandas_df_index):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    # duration not fully supported at pandas level
+    scalars_df_index = scalars_df_index.drop(columns="duration_col")
+    scalars_pandas_df_index = scalars_pandas_df_index.drop(columns="duration_col")
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_json(bf_result_file, orient="table")
+        # default_handler for arrow types that have no default conversion
+        scalars_pandas_df_index.to_json(
+            pd_result_file, orient="table", default_handler=str
+        )
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_csv_local_str(scalars_df_index, scalars_pandas_df_index):
+    bf_result = scalars_df_index.to_csv()
+    pd_result = scalars_pandas_df_index.to_csv()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_csv_local_file(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_csv(bf_result_file)
+        scalars_pandas_df_index.to_csv(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_parquet_local_bytes(scalars_df_index, scalars_pandas_df_index):
+    # GEOGRAPHY not supported in parquet export.
+    unsupported = ["geography_col"]
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_parquet()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_parquet()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_parquet_local_file(scalars_df_index, scalars_pandas_df_index):
+    # GEOGRAPHY not supported in parquet export.
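+    # (Dropping the geography column keeps the bigframes and pandas parquet
+    # outputs directly comparable.)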
+    unsupported = ["geography_col"]
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.drop(columns=unsupported).to_parquet(bf_result_file)
+        scalars_pandas_df_index.drop(columns=unsupported).to_parquet(pd_result_file)
+
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_records(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]
+    bf_result = scalars_df_index.drop(columns=unsupported).to_records()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_records()
+
+    for bfi, pdi in zip(bf_result, pd_result):
+        for bfj, pdj in zip(bfi, pdi):
+            assert (pd.isna(bfj) and pd.isna(pdj)) or bfj == pdj
+
+
+def test_df_to_string(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_string()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_string()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_html(scalars_df_index, scalars_pandas_df_index):
+    unsupported = ["numeric_col"]  # formatted differently
+
+    bf_result = scalars_df_index.drop(columns=unsupported).to_html()
+    pd_result = scalars_pandas_df_index.drop(columns=unsupported).to_html()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_markdown(scalars_df_index, scalars_pandas_df_index):
+    # Nulls trigger a tabulate bug: https://github.com/astanin/python-tabulate/issues/231
+    bf_result = scalars_df_index.dropna().to_markdown()
+    pd_result = scalars_pandas_df_index.dropna().to_markdown()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_pickle(scalars_df_index, scalars_pandas_df_index):
+    with tempfile.TemporaryFile() as bf_result_file, tempfile.TemporaryFile() as pd_result_file:
+        scalars_df_index.to_pickle(bf_result_file)
+        scalars_pandas_df_index.to_pickle(pd_result_file)
+        bf_result = bf_result_file.read()
+        pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+def test_df_to_orc(scalars_df_index, scalars_pandas_df_index):
+    unsupported = [
+        "numeric_col",
+        "bytes_col",
+        "date_col",
+        "datetime_col",
+        "time_col",
+        "timestamp_col",
+        "geography_col",
+        "duration_col",
+    ]
+
+    bf_result_file = tempfile.TemporaryFile()
+    pd_result_file = tempfile.TemporaryFile()
+    scalars_df_index.drop(columns=unsupported).to_orc(bf_result_file)
+    scalars_pandas_df_index.drop(columns=unsupported).reset_index().to_orc(
+        pd_result_file
+    )
+    bf_result = bf_result_file.read()
+    pd_result = pd_result_file.read()
+
+    assert bf_result == pd_result
+
+
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("new_col = int64_col + int64_too",),
+        ("new_col = (rowindex > 3) | bool_col",),
+        ("int64_too = bool_col\nnew_col2 = rowindex",),
+    ],
+)
+def test_df_eval(scalars_dfs, expr):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    scalars_df, scalars_pandas_df = scalars_dfs
+
+    bf_result = scalars_df.eval(expr).to_pandas()
+    pd_result = scalars_pandas_df.eval(expr)
+
+    pd.testing.assert_frame_equal(bf_result, pd_result)
+
+
+@pytest.mark.parametrize(
+    ("expr",),
+    [
+        ("int64_col > int64_too",),
+        ("bool_col",),
+        ("((int64_col - int64_too) % @local_var) == 0",),
+    ],
+)
+def test_df_query(scalars_dfs, expr):
+    # TODO: supply a reason why this isn't compatible with pandas 1.x
+    pytest.importorskip("pandas", minversion="2.0.0")
+    # local_var is referenced in expressions
+    local_var 
= 3 # NOQA + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.query(expr).to_pandas() + pd_result = scalars_pandas_df.query(expr) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("subset", "normalize", "ascending", "dropna"), + [ + (None, False, False, False), + (None, True, True, True), + ("bool_col", True, False, True), + ], +) +def test_df_value_counts(scalars_dfs, subset, normalize, ascending, dropna): + if pd.__version__.startswith("1."): + pytest.skip("pandas 1.x produces different column labels.") + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = ( + scalars_df[["string_col", "bool_col"]] + .value_counts(subset, normalize=normalize, ascending=ascending, dropna=dropna) + .to_pandas() + ) + pd_result = scalars_pandas_df[["string_col", "bool_col"]].value_counts( + subset, normalize=normalize, ascending=ascending, dropna=dropna + ) + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("na_option", "method", "ascending", "numeric_only", "pct"), + [ + ("keep", "average", True, True, True), + ("top", "min", False, False, False), + ("bottom", "max", False, False, True), + ("top", "first", False, False, False), + ("bottom", "dense", False, False, True), + ], +) +def test_df_rank_with_nulls( + scalars_df_index, + scalars_pandas_df_index, + na_option, + method, + ascending, + numeric_only, + pct, +): + unsupported_columns = ["geography_col"] + bf_result = ( + scalars_df_index.drop(columns=unsupported_columns) + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index.drop(columns=unsupported_columns) + .rank( + na_option=na_option, + method=method, + ascending=ascending, + numeric_only=numeric_only, + pct=pct, + ) + .astype(pd.Float64Dtype()) + ) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_bool_interpretation_error(scalars_df_index): + with pytest.raises(ValueError): + True if scalars_df_index else False + + +def test_query_job_setters(scalars_df_default_index: dataframe.DataFrame): + # if allow_large_results=False, might not create query job + with bigframes.option_context("compute.allow_large_results", True): + job_ids = set() + repr(scalars_df_default_index) + assert scalars_df_default_index.query_job is not None + job_ids.add(scalars_df_default_index.query_job.job_id) + scalars_df_default_index.to_pandas(allow_large_results=True) + job_ids.add(scalars_df_default_index.query_job.job_id) + + assert len(job_ids) == 2 + + +def test_df_cached(scalars_df_index): + df = scalars_df_index.set_index(["int64_too", "int64_col"]).sort_values( + "string_col" + ) + df = df[df["rowindex_2"] % 2 == 0] + + df_cached_copy = df.cache() + pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) + + +def test_df_cached_many_index_cols(scalars_df_index): + index_cols = [ + "int64_too", + "time_col", + "int64_col", + "bool_col", + "date_col", + "timestamp_col", + "string_col", + ] + df = scalars_df_index.set_index(index_cols) + df = df[df["rowindex_2"] % 2 == 0] + + df_cached_copy = df.cache() + pandas.testing.assert_frame_equal(df.to_pandas(), df_cached_copy.to_pandas()) + + +def test_assign_after_binop_row_joins(): + pd_df = pd.DataFrame( + { + "idx1": [1, 1, 1, 1, 2, 2, 2, 2], + "idx2": [10, 10, 20, 20, 10, 10, 20, 20], + "metric1": [10, 14, 2, 13, 6, 2, 9, 5], + "metric2": [25, 
-3, 8, 2, -1, 0, 0, -4], + }, + dtype=pd.Int64Dtype(), + ).set_index(["idx1", "idx2"]) + bf_df = dataframe.DataFrame(pd_df) + + # Expect implicit joiner to be used, preserving input cardinality rather than getting relational join + bf_df["metric_diff"] = bf_df.metric1 - bf_df.metric2 + pd_df["metric_diff"] = pd_df.metric1 - pd_df.metric2 + + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_df_cache_with_implicit_join(scalars_df_index): + """expectation is that cache will be used, but no explicit join will be performed""" + df = scalars_df_index[["int64_col", "int64_too"]].sort_index().reset_index() + 3 + df.cache() + bf_result = df + (df * 2) + sql = bf_result.sql + + # Very crude asserts, want sql to not use join and not use base table, only reference cached table + assert "JOIN" not in sql + assert "bigframes_testing" not in sql + + +def test_df_dot_inline(session): + df1 = pd.DataFrame([[1, 2, 3], [2, 5, 7]]) + df2 = pd.DataFrame([[2, 4, 8], [1, 5, 10], [3, 6, 9]]) + + bf1 = session.read_pandas(df1) + bf2 = session.read_pandas(df2) + bf_result = bf1.dot(bf2).to_pandas() + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas uses int64 instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = matrix_2by3_df.dot(matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df) + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_operator( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = (matrix_2by3_df @ matrix_3by4_df).to_pandas() + pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. + for name in pd_result.columns: + pd_result[name] = pd_result[name].astype(pd.Int64Dtype()) + + pd.testing.assert_frame_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_series_inline(): + left = [[1, 2, 3], [2, 5, 7]] + right = [2, 1, 3] + + bf1 = dataframe.DataFrame(left) + bf2 = series.Series(right) + bf_result = bf1.dot(bf2).to_pandas() + + df1 = pd.DataFrame(left) + df2 = pd.Series(right) + pd_result = df1.dot(df2) + + # Patch pandas dtypes for testing parity + # Pandas result is int64 instead of Int64 (nullable) dtype. + pd_result = pd_result.astype(pd.Int64Dtype()) + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + +def test_df_dot_series( + matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df +): + bf_result = matrix_2by3_df.dot(matrix_3by4_df["x"]).to_pandas() + pd_result = matrix_2by3_pandas_df.dot(matrix_3by4_pandas_df["x"]) + + # Patch pandas dtypes for testing parity + # Pandas result is object instead of Int64 (nullable) dtype. 
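+    # (dot() with a Series produces a Series, so only the values need the cast;
+    # there are no columns to loop over as in the DataFrame cases above.)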
+    pd_result = pd_result.astype(pd.Int64Dtype())
+
+    pd.testing.assert_series_equal(
+        bf_result,
+        pd_result,
+    )
+
+
+def test_df_dot_operator_series(
+    matrix_2by3_df, matrix_2by3_pandas_df, matrix_3by4_df, matrix_3by4_pandas_df
+):
+    bf_result = (matrix_2by3_df @ matrix_3by4_df["x"]).to_pandas()
+    pd_result = matrix_2by3_pandas_df @ matrix_3by4_pandas_df["x"]
+
+    # Patch pandas dtypes for testing parity
+    # Pandas result is object instead of Int64 (nullable) dtype.
+    pd_result = pd_result.astype(pd.Int64Dtype())
+
+    pd.testing.assert_series_equal(
+        bf_result,
+        pd_result,
+    )
+
+
+# TODO(tswast): We may be able to re-enable this test after we break large
+# queries up in https://github.com/googleapis/python-bigquery-dataframes/pull/427
+@pytest.mark.skipif(
+    sys.version_info >= (3, 12),
+    # See: https://github.com/python/cpython/issues/112282
+    reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.",
+)
+def test_recursion_limit(scalars_df_index):
+    scalars_df_index = scalars_df_index[["int64_too", "int64_col", "float64_col"]]
+    for i in range(400):
+        scalars_df_index = scalars_df_index + 4
+    scalars_df_index.to_pandas()
+
+
+@pytest.mark.skipif(
+    reason="b/366477265: Skip until query complexity error can be reliably triggered."
+)
+def test_query_complexity_error(scalars_df_index):
+    # This test requires automatic caching/query decomposition to be turned off
+    bf_df = scalars_df_index
+    for _ in range(8):
+        bf_df = bf_df.merge(bf_df, on="int64_col").head(30)
+        bf_df = bf_df[bf_df.columns[:20]]
+
+    with pytest.raises(
+        bigframes.exceptions.QueryComplexityError, match=r"Try using DataFrame\.cache"
+    ):
+        bf_df.to_pandas()
+
+
+def test_query_complexity_repeated_joins(
+    scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
+):
+    pd_df = scalars_pandas_df_index
+    bf_df = scalars_df_index
+    for _ in range(8):
+        # recursively join, resulting in 2^8 - 1 = 255 joins
+        pd_df = pd_df.merge(pd_df, on="int64_col").head(30)
+        pd_df = pd_df[pd_df.columns[:20]]
+        bf_df = bf_df.merge(bf_df, on="int64_col").head(30)
+        bf_df = bf_df[bf_df.columns[:20]]
+
+    bf_result = bf_df.to_pandas()
+    pd_result = pd_df
+    assert_pandas_df_equal(bf_result, pd_result, check_index_type=False)
+
+
+def test_query_complexity_repeated_subtrees(
+    scalars_df_index, scalars_pandas_df_index, with_multiquery_execution
+):
+    # Recursively union the data; if fully inlined, this has 10^5 identical root tables.
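+    # (Each pass concatenates 10 copies and keeps head(5), so 5 passes would
+    # inline 10^5 identical leaf scans if nothing were cached or decomposed.)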
+ pd_df = scalars_pandas_df_index + bf_df = scalars_df_index + for _ in range(5): + pd_df = pd.concat(10 * [pd_df]).head(5) + bf_df = bpd.concat(10 * [bf_df]).head(5) + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.skipif( + sys.version_info >= (3, 12), + # See: https://github.com/python/cpython/issues/112282 + reason="setrecursionlimit has no effect on the Python C stack since Python 3.12.", +) +def test_query_complexity_repeated_analytic(scalars_df_index, scalars_pandas_df_index): + bf_df = scalars_df_index[["int64_col", "int64_too"]] + pd_df = scalars_pandas_df_index[["int64_col", "int64_too"]] + # Uses LAG analytic operator, each in a new SELECT + for _ in range(50): + bf_df = bf_df.diff() + pd_df = pd_df.diff() + bf_result = bf_df.to_pandas() + pd_result = pd_df + assert_pandas_df_equal(bf_result, pd_result) + + +def test_to_gbq_and_create_dataset(session, scalars_df_index, dataset_id_not_created): + dataset_id = dataset_id_not_created + destination_table = f"{dataset_id}.scalars_df" + + result_table = scalars_df_index.to_gbq(destination_table) + assert ( + result_table == destination_table + if destination_table + else result_table is not None + ) + + loaded_scalars_df_index = session.read_gbq(result_table) + assert not loaded_scalars_df_index.empty + + +def test_read_gbq_to_pandas_no_exec(unordered_session: bigframes.Session): + metrics = unordered_session._metrics + execs_pre = metrics.execution_count + df = unordered_session.read_gbq("bigquery-public-data.ml_datasets.penguins") + df.to_pandas() + execs_post = metrics.execution_count + assert df.shape == (344, 7) + assert execs_pre == execs_post + + +def test_to_gbq_table_labels(scalars_df_index): + destination_table = "bigframes-dev.bigframes_tests_sys.table_labels" + result_table = scalars_df_index.to_gbq( + destination_table, labels={"test": "labels"}, if_exists="replace" + ) + client = scalars_df_index._session.bqclient + table = client.get_table(result_table) + assert table.labels + assert table.labels["test"] == "labels" + + +@pytest.mark.parametrize( + ("col_names", "ignore_index"), + [ + pytest.param(["A"], False, id="one_array_false"), + pytest.param(["A"], True, id="one_array_true"), + pytest.param(["B"], False, id="one_float_false"), + pytest.param(["B"], True, id="one_float_true"), + pytest.param(["A", "C"], False, id="two_arrays_false"), + pytest.param(["A", "C"], True, id="two_arrays_true"), + ], +) +def test_dataframe_explode(col_names, ignore_index, session): + data = { + "A": [[0, 1, 2], [], [3, 4]], + "B": 3, + "C": [["a", "b", "c"], np.nan, ["d", "e"]], + } + + metrics = session._metrics + df = bpd.DataFrame(data, session=session) + pd_df = df.to_pandas() + pd_result = pd_df.explode(col_names, ignore_index=ignore_index) + bf_result = df.explode(col_names, ignore_index=ignore_index) + + # Check that to_pandas() results in at most a single query execution + execs_pre = metrics.execution_count + bf_materialized = bf_result.to_pandas() + execs_post = metrics.execution_count + + pd.testing.assert_frame_equal( + bf_materialized, + pd_result, + check_index_type=False, + check_dtype=False, + ) + # we test this property on this method in particular as compilation + # is non-deterministic and won't use the query cache as implemented + assert execs_post - execs_pre <= 1 + + +@pytest.mark.parametrize( + ("ignore_index", "ordered"), + [ + pytest.param(True, True, id="include_index_ordered"), + pytest.param(True, False, id="include_index_unordered"), + 
pytest.param(False, True, id="ignore_index_ordered"), + ], +) +def test_dataframe_explode_reserve_order(ignore_index, ordered): + data = { + "a": [np.random.randint(0, 10, 10) for _ in range(10)], + "b": [np.random.randint(0, 10, 10) for _ in range(10)], + } + df = bpd.DataFrame(data) + pd_df = pd.DataFrame(data) + + res = df.explode(["a", "b"], ignore_index=ignore_index).to_pandas(ordered=ordered) + pd_res = pd_df.explode(["a", "b"], ignore_index=ignore_index).astype( + pd.Int64Dtype() + ) + pd.testing.assert_frame_equal( + res if ordered else res.sort_index(), + pd_res, + check_index_type=False, + ) + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param([], id="empty", marks=pytest.mark.xfail(raises=ValueError)), + pytest.param( + ["A", "A"], id="duplicate", marks=pytest.mark.xfail(raises=ValueError) + ), + pytest.param("unknown", id="unknown", marks=pytest.mark.xfail(raises=KeyError)), + ], +) +def test_dataframe_explode_xfail(col_names): + df = bpd.DataFrame({"A": [[0, 1, 2], [], [3, 4]]}) + df.explode(col_names) + + +@pytest.mark.parametrize( + ("on", "rule", "origin"), + [ + pytest.param("datetime_col", "100D", "start"), + pytest.param("datetime_col", "30W", "start"), + pytest.param("datetime_col", "5M", "epoch"), + pytest.param("datetime_col", "3Q", "start_day"), + pytest.param("datetime_col", "3YE", "start"), + pytest.param( + "int64_col", "100D", "start", marks=pytest.mark.xfail(raises=TypeError) + ), + pytest.param( + "datetime_col", "100D", "end", marks=pytest.mark.xfail(raises=ValueError) + ), + ], +) +def test__resample_with_column( + scalars_df_index, scalars_pandas_df_index, on, rule, origin +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + bf_result = ( + scalars_df_index._resample(rule=rule, on=on, origin=origin)[ + ["int64_col", "int64_too"] + ] + .max() + .to_pandas() + ) + pd_result = scalars_pandas_df_index.resample(rule=rule, on=on, origin=origin)[ + ["int64_col", "int64_too"] + ].max() + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + ("append", "level", "col", "rule"), + [ + pytest.param(False, None, "timestamp_col", "100d"), + pytest.param(True, 1, "timestamp_col", "1200h"), + pytest.param(False, None, "datetime_col", "100d"), + ], +) +def test__resample_with_index( + scalars_df_index, scalars_pandas_df_index, append, level, col, rule +): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df_index = scalars_df_index.set_index(col, append=append) + scalars_pandas_df_index = scalars_pandas_df_index.set_index(col, append=append) + bf_result = ( + scalars_df_index[["int64_col", "int64_too"]] + ._resample(rule=rule, level=level) + .min() + .to_pandas() + ) + pd_result = ( + scalars_pandas_df_index[["int64_col", "int64_too"]] + .resample(rule=rule, level=level) + .min() + ) + assert_pandas_df_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("rule", "origin", "data"), + [ + ( + "5h", + "epoch", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="1h" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "75min", + "start_day", + { + "timestamp_col": pd.date_range( + start="2021-01-01 13:00:00", periods=30, freq="10min" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ( + "7s", + "epoch", + { + "timestamp_col": pd.date_range( + 
start="2021-01-01 13:00:00", periods=30, freq="1s" + ), + "int64_col": range(30), + "int64_too": range(10, 40), + }, + ), + ], +) +def test__resample_start_time(rule, origin, data): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + col = "timestamp_col" + scalars_df_index = bpd.DataFrame(data).set_index(col) + scalars_pandas_df_index = pd.DataFrame(data).set_index(col) + scalars_pandas_df_index.index.name = None + + bf_result = scalars_df_index._resample(rule=rule, origin=origin).min().to_pandas() + + pd_result = scalars_pandas_df_index.resample(rule=rule, origin=origin).min() + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("string[pyarrow]", id="type-string"), + pytest.param(pd.StringDtype(storage="pyarrow"), id="type-literal"), + pytest.param( + {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()}, + id="multiple-types", + ), + ], +) +def test_df_astype(scalars_dfs, dtype): + bf_df, pd_df = scalars_dfs + target_cols = ["bool_col", "int64_col"] + bf_df = bf_df[target_cols] + pd_df = pd_df[target_cols] + + bf_result = bf_df.astype(dtype).to_pandas() + pd_result = pd_df.astype(dtype) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_df_astype_python_types(scalars_dfs): + bf_df, pd_df = scalars_dfs + target_cols = ["bool_col", "int64_col"] + bf_df = bf_df[target_cols] + pd_df = pd_df[target_cols] + + bf_result = bf_df.astype({"bool_col": str, "int64_col": float}).to_pandas() + pd_result = pd_df.astype( + {"bool_col": "string[pyarrow]", "int64_col": pd.Float64Dtype()} + ) + + pd.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_astype_invalid_type_fail(scalars_dfs): + bf_df, _ = scalars_dfs + + with pytest.raises(TypeError, match=r".*Share your use case with.*"): + bf_df.astype(123) + + +def test_agg_with_dict_lists_strings(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": ["min", "count"], + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_lists_callables(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": [np.min, np.max], + "int64_col": [np.min, np.var], + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_list_and_str(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "int64_col": "sum", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + + pd.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def test_agg_with_dict_strs(scalars_dfs): + bf_df, pd_df = scalars_dfs + agg_funcs = { + "int64_too": "min", + "int64_col": "sum", + "float64_col": "max", + } + + bf_result = bf_df.agg(agg_funcs).to_pandas() + pd_result = pd_df.agg(agg_funcs) + pd_result.index = pd_result.index.astype("string[pyarrow]") + + pd.testing.assert_series_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + +def 
test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): + bf_df, _ = scalars_dfs + agg_funcs = { + "int64_too": ["min", "max"], + "nonexisting_col": ["count"], + } + + with pytest.raises(KeyError): + bf_df.agg(agg_funcs) From 6801ca4dfef8928e8a056df46dcade5e55859f4c Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 27 Oct 2025 23:49:05 +0000 Subject: [PATCH 33/53] notebook update --- notebooks/dataframes/anywidget_mode.ipynb | 66 +++++++---------------- 1 file changed, 19 insertions(+), 47 deletions(-) diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 154afea7e1..62caa4c7ee 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "d10bfca4", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "ca22f059", "metadata": {}, "outputs": [], @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "1bc5aaf3", "metadata": {}, "outputs": [], @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "f289d250", "metadata": {}, "outputs": [ @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "42bb02ab", "metadata": {}, "outputs": [ @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "ce250157", "metadata": {}, "outputs": [ @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6e46f6d1352043a4baee57fa089f2b0c", + "model_id": "1d718cdbafcb42898120637cdb3fa267", "version_major": 2, "version_minor": 0 }, @@ -160,7 +160,7 @@ "Computation deferred. Computation will process 171.4 MB" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -179,22 +179,10 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "6920d49b", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -217,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "88d370b617b545809eb7bb8e5c66ea0e", + "model_id": "519297c3ad19403aa844cbeabcd5eb44", "version_major": 2, "version_minor": 0 }, @@ -251,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "12b68f15", "metadata": {}, "outputs": [ @@ -288,24 +276,10 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "a9d5d13a", "metadata": {}, "outputs": [ - { - "data": { - "text/html": [ - "✅ Completed. \n", - " Query processed 171.4 MB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ @@ -330,7 +304,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dec19e8788b74219b88bccfc65e3b9c0", + "model_id": "37ba207603aa40a38c9786a210e712fd", "version_major": 2, "version_minor": 0 }, @@ -361,7 +335,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "added-cell-1", "metadata": {}, "outputs": [ @@ -369,7 +343,7 @@ "data": { "text/html": [ "✅ Completed. 
\n", - " Query processed 85.9 kB in 21 seconds of slot time.\n", + " Query processed 85.9 kB in 23 seconds of slot time.\n", " " ], "text/plain": [ @@ -383,11 +357,11 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:869: UserWarning: Converting JSON columns to strings for display. This is temporary and will be removed when the frontend supports JSON types.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dataframe.py:867: UserWarning: Converting JSON columns to strings for display. This is temporary and will be removed when the frontend supports JSON types.\n", " warnings.warn(\n" ] }, @@ -408,7 +382,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "774357b4083c47c8a5e1fd33bb6af188", + "model_id": "379998ea9a744e7b8afd9c1bcb36548d", "version_major": 2, "version_minor": 0 }, @@ -426,7 +400,7 @@ "Computation deferred. Computation will process 0 Bytes" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -447,7 +421,6 @@ ], "metadata": { "kernelspec": { - "display_name": "3.10.18", "display_name": "3.10.18", "language": "python", "name": "python3" @@ -463,7 +436,6 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.18" - "version": "3.10.18" } }, "nbformat": 4, From 6c3567b7d573dc36e136841c0a2fac6453a3fa76 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 28 Oct 2025 00:07:05 +0000 Subject: [PATCH 34/53] call API on local data for complier.py --- bigframes/core/compile/polars/compiler.py | 31 ++++------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 0a6605b222..754294ec2f 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -622,32 +622,11 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): for scan_item in node.scan_list.items } - # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 - # Convert JSON columns to strings before Polars processing - arrow_data = node.local_data_source.data - schema = arrow_data.schema - - # Check if any columns are JSON type - json_field_indices = [ - i - for i, field in enumerate(schema) - if pa.types.is_extension_type(field.type) - and field.type.extension_name == "google:sqlType:json" - ] - - if json_field_indices: - # Convert JSON columns to string columns - new_arrays = [] - new_fields = [] - for i, field in enumerate(schema): - if i in json_field_indices: - # Cast JSON to string - new_arrays.append(arrow_data.column(i).cast(pa.string())) - new_fields.append(pa.field(field.name, pa.string())) - else: - new_arrays.append(arrow_data.column(i)) - new_fields.append(field) - 
arrow_data = pa.table(new_arrays, schema=pa.schema(new_fields)) + if hasattr(node.local_data_source, "to_arrow"): + schema, batches = node.local_data_source.to_arrow(json_type="string") + arrow_data = pa.Table.from_batches(batches, schema) + else: + arrow_data = node.local_data_source.data lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_data)).lazy() lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) From dba9051306312ced3b05ca253f189e73ad688021 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Tue, 28 Oct 2025 00:32:06 +0000 Subject: [PATCH 35/53] add more testcase --- bigframes/display/anywidget.py | 2 + notebooks/dataframes/anywidget_mode.ipynb | 18 ++- tests/system/small/test_anywidget.py | 131 ++++++++++++++++++++++ tests/unit/test_dataframe.py | 23 ++++ tests/unit/test_polars_compiler.py | 86 ++++++++++++++ 5 files changed, 255 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_polars_compiler.py diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index cf5d4e6310..8930c611e9 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -231,6 +231,8 @@ def _set_table_html(self) -> None: cached_data = self._cached_data else: break + + # Get the data for the current page page_data = cached_data.iloc[start:end] # Generate HTML table diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 62caa4c7ee..744971f69e 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -142,7 +142,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "1d718cdbafcb42898120637cdb3fa267", + "model_id": "93dd10072d564a02a0278817d14855a9", "version_major": 2, "version_minor": 0 }, @@ -205,7 +205,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "519297c3ad19403aa844cbeabcd5eb44", + "model_id": "6e2538d446e344ac8505e4706730243e", "version_major": 2, "version_minor": 0 }, @@ -304,7 +304,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "37ba207603aa40a38c9786a210e712fd", + "model_id": "d6faf367ea5d44ad9d275506d870557a", "version_major": 2, "version_minor": 0 }, @@ -333,6 +333,14 @@ "The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly." ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fdadcad6", + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 10, @@ -343,7 +351,7 @@ "data": { "text/html": [ "✅ Completed. 
\n", - " Query processed 85.9 kB in 23 seconds of slot time.\n", + " Query processed 85.9 kB in 24 seconds of slot time.\n", " " ], "text/plain": [ @@ -382,7 +390,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "379998ea9a744e7b8afd9c1bcb36548d", + "model_id": "b6d6f3bacc2c43fc9a335e6039db12a5", "version_major": 2, "version_minor": 0 }, diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py index 890d591de5..0587e13916 100644 --- a/tests/system/small/test_anywidget.py +++ b/tests/system/small/test_anywidget.py @@ -527,6 +527,137 @@ def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFram assert result == "" +def mock_execute_result_with_params( + self, schema, total_rows_val, arrow_batches_val, *args, **kwargs +): + """ + Mocks an execution result with configurable total_rows and arrow_batches. + """ + from bigframes.session.executor import ExecuteResult + + return ExecuteResult( + iter(arrow_batches_val), + schema=schema, + query_job=None, + total_bytes=None, + total_rows=total_rows_val, + ) + + +def test_widget_row_count_should_be_immutable_after_creation( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Given a widget created with a specific configuration when global display + options are changed later, the widget's original row_count should remain + unchanged. + """ + from bigframes.display.anywidget import TableWidget + + # Use a context manager to ensure the option is reset + with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): + widget = TableWidget(paginated_bf_df) + initial_row_count = widget.row_count + + # Change a global option that could influence row count + bf.options.display.max_rows = 10 + + # Verify the row count remains immutable. + assert widget.row_count == initial_row_count + + +class FaultyIterator: + def __iter__(self): + return self + + def __next__(self): + raise ValueError("Simulated read error") + + +def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows( + paginated_bf_df: bf.dataframe.DataFrame, + monkeypatch: pytest.MonkeyPatch, +): + """ + Given an internal component fails to return valid execution data, + when the TableWidget is created, its error_message should be set and displayed. + """ + # Patch the executor's 'execute' method to simulate an error. + monkeypatch.setattr( + "bigframes.session.bq_caching_executor.BigQueryCachingExecutor.execute", + lambda self, *args, **kwargs: mock_execute_result_with_params( + self, paginated_bf_df._block.expr.schema, None, [], *args, **kwargs + ), + ) + + # Create the TableWidget under the error condition. + with bf.option_context("display.repr_mode", "anywidget"): + from bigframes.display.anywidget import TableWidget + + # The widget should handle the faulty data from the mock without crashing. + widget = TableWidget(paginated_bf_df) + + # The widget should have an error message and display it in the HTML. + assert widget.row_count == 0 + assert widget._error_message is not None + assert "Could not determine total row count" in widget._error_message + assert widget._error_message in widget.table_html + + +def test_widget_row_count_reflects_actual_data_available( + paginated_bf_df: bf.dataframe.DataFrame, +): + """ + Test that widget row_count reflects the actual data available, + regardless of theoretical limits. + """ + from bigframes.display.anywidget import TableWidget + + # Set up display options that define a page size. 
+    with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
+        widget = TableWidget(paginated_bf_df)
+
+        # The widget should report the total rows in the DataFrame,
+        # not limited by page_size (which only affects pagination)
+        assert widget.row_count == EXPECTED_ROW_COUNT
+        assert widget.page_size == 2  # Respects the display option
+
+
+# TODO(shuowei): Add tests for custom index and multiindex
+# This may not be necessary for the SQL Cell use case but should be
+# considered for completeness.
+
+
+@pytest.fixture(scope="module")
+def empty_json_df(session: bf.Session) -> bf.dataframe.DataFrame:
+    """Create an empty DataFrame with a JSON column for testing."""
+    import bigframes.dtypes
+
+    pandas_df = pd.DataFrame(
+        {
+            "a": pd.Series(dtype="int64"),
+            "b": pd.Series(dtype=bigframes.dtypes.JSON_DTYPE),
+        }
+    )
+    return session.read_pandas(pandas_df)
+
+
+def test_empty_widget_with_json_column(empty_json_df: bf.dataframe.DataFrame):
+    """Given an empty DataFrame with a JSON column, the widget should render table headers."""
+    with bf.option_context("display.repr_mode", "anywidget"):
+        from bigframes.display.anywidget import TableWidget
+
+        widget = TableWidget(empty_json_df)
+        html = widget.table_html
+
+        assert widget.row_count == 0
+        assert "<table" in html

diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py
--- a/tests/unit/test_dataframe.py
+++ b/tests/unit/test_dataframe.py
+@pytest.fixture
+def json_df(polars_session) -> bigframes.dataframe.DataFrame:
+    """Create a DataFrame with a JSON column for testing."""
+    import bigframes.dtypes
+
+    pandas_df = pd.DataFrame(
+        {
+            "a": [1],
+            "b": ['{"c": 2, "d": 3}'],
+        }
+    )
+    pandas_df["b"] = pandas_df["b"].astype(bigframes.dtypes.JSON_DTYPE)
+    return polars_session.read_pandas(pandas_df)
+
+
+def test_to_pandas_batches_with_json_column(json_df: bigframes.dataframe.DataFrame):
+    """Test that JSON columns are converted to strings in to_pandas_batches."""
+    batches = list(json_df._to_pandas_batches(page_size=10))
+    assert len(batches) > 0
+    # Verify the JSON column is now string type
+    assert batches[0]["b"].dtype == pd.StringDtype(storage="pyarrow")
diff --git a/tests/unit/test_polars_compiler.py b/tests/unit/test_polars_compiler.py
new file mode 100644
index 0000000000..fd620825cc
--- /dev/null
+++ b/tests/unit/test_polars_compiler.py
@@ -0,0 +1,86 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pandas as pd
+import polars as pl
+import pytest
+
+import bigframes as bf
+import bigframes.core.compile.polars.compiler as polars_compiler
+import bigframes.core.nodes as nodes
+import bigframes.operations.json_ops as json_ops
+
+
+def test_polars_to_json_string():
+    """Test ToJSONString operation in Polars compiler."""
+    compiler = polars_compiler.PolarsExpressionCompiler()
+    op = json_ops.ToJSONString()
+    # Polars doesn't have a native JSON type, it uses strings.
+    # The operation is a cast to string.
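+    # (The expression compiler is exercised directly on a literal expression; no
+    # session or DataFrame plan is required for this unit-level check.)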
+    input_expr = pl.lit('{"b": 2}', dtype=pl.String)
+    result = compiler.compile_op(op, input_expr)
+
+    df = pl.DataFrame({"a": ['{"b": 2}']}).lazy()
+    result_df = df.with_columns(result.alias("b")).collect()
+    assert result_df["b"][0] == '{"b": 2}'
+    assert result_df["b"].dtype == pl.String
+
+
+def test_polars_parse_json():
+    """Test ParseJSON operation in Polars compiler."""
+    compiler = polars_compiler.PolarsExpressionCompiler()
+    op = json_ops.ParseJSON()
+    input_expr = pl.lit('{"b": 2}', dtype=pl.String)
+    result = compiler.compile_op(op, input_expr)
+
+    df = pl.DataFrame({"a": ['{"b": 2}']}).lazy()
+    result_df = df.with_columns(result.alias("b")).collect()
+    # The result of json_decode is a struct
+    assert isinstance(result_df["b"][0], dict)
+    assert result_df["b"][0] == {"b": 2}
+
+
+@pytest.mark.skip(reason="Polars does not have json_extract on string expressions")
+def test_polars_json_extract():
+    """Test JSONExtract operation in Polars compiler."""
+    compiler = polars_compiler.PolarsExpressionCompiler()
+    op = json_ops.JSONExtract(json_path="$.b")
+    input_expr = pl.lit('{"a": 1, "b": "hello"}', dtype=pl.String)
+    result = compiler.compile_op(op, input_expr)
+
+    df = pl.DataFrame({"a": ['{"b": "world"}']}).lazy()
+    result_df = df.with_columns(result.alias("b")).collect()
+    # json_extract returns a JSON encoded string
+    assert result_df["b"][0] == '"world"'
+
+
+def test_readlocal_with_json_column(polars_session):
+    """Test ReadLocalNode compilation with JSON columns."""
+    pandas_df = pd.DataFrame({"data": ['{"key": "value"}']})
+    pandas_df["data"] = pandas_df["data"].astype(bf.dtypes.JSON_DTYPE)
+    bf_df = polars_session.read_pandas(pandas_df)
+
+    node = bf_df._block.expr.node
+    # Traverse the node tree to find the ReadLocalNode
+    while not isinstance(node, nodes.ReadLocalNode):
+        node = node.child
+    assert isinstance(node, nodes.ReadLocalNode)
+
+    compiler = polars_compiler.PolarsCompiler()
+    lazy_frame = compiler.compile_node(node)
+    result_df = lazy_frame.collect()
+
+    # The compiler should have converted the JSON column to string.
+    assert result_df.schema["column_0"] == pl.String
+    assert result_df["column_0"][0] == '{"key":"value"}'

From 0420c64a9020ab4f97fc8c471176507e93b7173b Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 28 Oct 2025 00:51:33 +0000
Subject: [PATCH 36/53] modify polars import

---
 tests/unit/test_polars_compiler.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/tests/unit/test_polars_compiler.py b/tests/unit/test_polars_compiler.py
index fd620825cc..95be7d5d00 100644
--- a/tests/unit/test_polars_compiler.py
+++ b/tests/unit/test_polars_compiler.py
@@ -13,9 +13,19 @@
 # limitations under the License.
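+# polars is an optional test dependency; guard its import so this module skips
+# cleanly when polars is not installed.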
import pandas as pd -import polars as pl import pytest +try: + import polars as pl + + POLARS_INSTALLED = True +except ImportError: + POLARS_INSTALLED = False + +if not POLARS_INSTALLED: + pytest.skip("polars is not installed", allow_module_level=True) + + import bigframes as bf import bigframes.core.compile.polars.compiler as polars_compiler import bigframes.core.nodes as nodes @@ -48,10 +58,9 @@ def test_polars_parse_json(): result_df = df.with_columns(result.alias("b")).collect() # The result of json_decode is a struct assert isinstance(result_df["b"][0], dict) - assert result_df["b"][0] == {"b": 2} + assert result_df["b"][0]["b"] == 2 -@pytest.mark.skip(reason="Polars does not have json_extract on string expressions") def test_polars_json_extract(): """Test JSONExtract operation in Polars compiler.""" compiler = polars_compiler.PolarsExpressionCompiler() @@ -59,10 +68,10 @@ def test_polars_json_extract(): input_expr = pl.lit('{"a": 1, "b": "hello"}', dtype=pl.String) result = compiler.compile_op(op, input_expr) - df = pl.DataFrame({"a": ['{"b": "world"}']}).lazy() + df = pl.DataFrame({"a": ['{"a": 1, "b": "hello"}']}).lazy() result_df = df.with_columns(result.alias("b")).collect() - # json_extract returns a JSON encoded string - assert result_df["b"][0] == '"world"' + # json_path_match returns the raw string value + assert result_df["b"][0] == "hello" def test_readlocal_with_json_column(polars_session): From 907cf2c1728a95ddf3dd5b05e2b7917dbbd21ff1 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 29 Oct 2025 07:07:46 +0000 Subject: [PATCH 37/53] fix failed tests --- bigframes/bigquery/_operations/ai.py | 7 +++++++ bigframes/core/compile/polars/compiler.py | 6 +++--- bigframes/ml/llm.py | 11 ++++++++++- bigframes/series.py | 8 -------- tests/system/small/test_dataframe.py | 12 ++++++++++++ 5 files changed, 32 insertions(+), 12 deletions(-) diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 8579f7f298..07f81d87f5 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -123,6 +123,13 @@ def generate( if output_schema is None: output_schema_str = None else: + # Validate output schema types + for col_name, col_type in output_schema.items(): + if col_type.upper() == "JSON": + raise ValueError( + "JSON type is not supported in output_schema. " + "Supported types are: STRING, INT64, FLOAT64, BOOL, ARRAY, and STRUCT." 
+ ) output_schema_str = ", ".join( [f"{name} {sql_type}" for name, sql_type in output_schema.items()] ) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 754294ec2f..e939f80120 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -434,13 +434,13 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: @compile_op.register(json_ops.ParseJSON) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - # Parse string as JSON - this should decode, not encode - return input.str.json_decode() + # In Polars, JSON is stored as string, so no decoding needed + return input @compile_op.register(json_ops.JSONExtract) def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONExtract) - return input.str.json_extract(json_path=op.json_path) + return input.str.json_path_match(op.json_path) @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 531a043c45..edede34e8f 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -731,8 +731,17 @@ def predict( "ground_with_google_search": ground_with_google_search, } if output_schema: + supported_dtypes = ( + "int64", + "float64", + "bool", + "string", + "array", + "struct", + ) output_schema = { - k: utils.standardize_type(v) for k, v in output_schema.items() + k: utils.standardize_type(v, supported_dtypes=supported_dtypes) + for k, v in output_schema.items() } options["output_schema"] = output_schema return self._predict_and_retry( diff --git a/bigframes/series.py b/bigframes/series.py index 5448045092..5177bd0f33 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -611,14 +611,6 @@ def astype( raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") dtype = bigframes.dtypes.bigframes_type(dtype) - # BigQuery doesn't support CAST(json_col AS STRING), but it does support - # TO_JSON_STRING(json_col). 
- if ( - self.dtype == bigframes.dtypes.JSON_DTYPE - and dtype == bigframes.dtypes.STRING_DTYPE - ): - return self._apply_unary_op(ops.json_ops.ToJSONString()) - return self._apply_unary_op( bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 79f8efd00f..a0c0e41a1b 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -6142,3 +6142,15 @@ def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): with pytest.raises(KeyError): bf_df.agg(agg_funcs) + + +def test_to_pandas_batches_with_json_columns(session): + """Test that JSON columns are properly handled in to_pandas_batches.""" + # Create a DataFrame with JSON column + df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') + + # This should not raise an error + batches = df._to_pandas_batches(page_size=10) + next(batches) + + # TODO From 2459aa4a2479cc4bd19ba8fbfc0159aa903b38b2 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Wed, 29 Oct 2025 10:47:43 -0700 Subject: [PATCH 38/53] chore: Migrate minimum_op operator to SQLGlot (#2205) --- .../compile/sqlglot/expressions/comparison_ops.py | 5 +++++ .../test_comparison_ops/test_minimum_op/out.sql | 14 ++++++++++++++ .../sqlglot/expressions/test_comparison_ops.py | 7 +++++++ 3 files changed, 26 insertions(+) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_minimum_op/out.sql diff --git a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py index eb08144b8a..e77b8b50a5 100644 --- a/bigframes/core/compile/sqlglot/expressions/comparison_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/comparison_ops.py @@ -109,6 +109,11 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.LTE(this=left_expr, expression=right_expr) +@register_binary_op(ops.minimum_op) +def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.Least(this=left.expr, expressions=right.expr) + + @register_binary_op(ops.ne_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: left_expr = _coerce_bool_to_int(left) diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_minimum_op/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_minimum_op/out.sql new file mode 100644 index 0000000000..429c3d2861 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_comparison_ops/test_minimum_op/out.sql @@ -0,0 +1,14 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `float64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + LEAST(`bfcol_0`, `bfcol_1`) AS `bfcol_2` + FROM `bfcte_0` +) +SELECT + `bfcol_2` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py index 6c3eb64414..f278a15f3c 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_comparison_ops.py @@ -110,6 +110,13 @@ def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(bf_df.sql, "out.sql") +def test_minimum_op(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "float64_col"]] + 
sql = utils._apply_binary_op(bf_df, ops.minimum_op, "int64_col", "float64_col") + + snapshot.assert_match(sql, "out.sql") + + def test_ne_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] From 3dbee0760516863f4dfc455dcbd777ea26e83da7 Mon Sep 17 00:00:00 2001 From: jialuoo Date: Wed, 29 Oct 2025 10:50:49 -0700 Subject: [PATCH 39/53] chore: Migrate round_op operator to SQLGlot (#2204) This commit migrates the `round_op` operator from the Ibis compiler to the SQLGlot compiler. --- .../sqlglot/expressions/numeric_ops.py | 8 ++ .../test_numeric_ops/test_round/out.sql | 81 +++++++++++++++++++ .../sqlglot/expressions/test_numeric_ops.py | 14 ++++ 3 files changed, 103 insertions(+) create mode 100644 tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_round/out.sql diff --git a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py index 8ca884b900..afc0d9d01c 100644 --- a/bigframes/core/compile/sqlglot/expressions/numeric_ops.py +++ b/bigframes/core/compile/sqlglot/expressions/numeric_ops.py @@ -377,6 +377,14 @@ def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: return result +@register_binary_op(ops.round_op) +def _(expr: TypedExpr, n_digits: TypedExpr) -> sge.Expression: + rounded = sge.Round(this=expr.expr, decimals=n_digits.expr) + if expr.dtype == dtypes.INT_DTYPE: + return sge.Cast(this=rounded, to="INT64") + return rounded + + @register_binary_op(ops.sub_op) def _(left: TypedExpr, right: TypedExpr) -> sge.Expression: if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_round/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_round/out.sql new file mode 100644 index 0000000000..8513c8d63f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_numeric_ops/test_round/out.sql @@ -0,0 +1,81 @@ +WITH `bfcte_0` AS ( + SELECT + `int64_col` AS `bfcol_0`, + `float64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_0` AS `bfcol_7`, + `bfcol_1` AS `bfcol_8`, + CAST(ROUND(`bfcol_0`, 0) AS INT64) AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + CAST(ROUND(`bfcol_7`, 1) AS INT64) AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + CAST(ROUND(`bfcol_15`, -1) AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + ROUND(`bfcol_26`, 0) AS `bfcol_42` + FROM `bfcte_3` +), `bfcte_5` AS ( + SELECT + *, + `bfcol_36` AS `bfcol_50`, + `bfcol_37` AS `bfcol_51`, + `bfcol_38` AS `bfcol_52`, + `bfcol_39` AS `bfcol_53`, + `bfcol_40` AS `bfcol_54`, + `bfcol_41` AS `bfcol_55`, + `bfcol_42` AS `bfcol_56`, + ROUND(`bfcol_38`, 1) AS `bfcol_57` + FROM `bfcte_4` +), `bfcte_6` AS ( + SELECT + *, + `bfcol_50` AS `bfcol_66`, + `bfcol_51` AS `bfcol_67`, + `bfcol_52` AS `bfcol_68`, + `bfcol_53` AS `bfcol_69`, + `bfcol_54` AS `bfcol_70`, + `bfcol_55` AS 
`bfcol_71`, + `bfcol_56` AS `bfcol_72`, + `bfcol_57` AS `bfcol_73`, + ROUND(`bfcol_52`, -1) AS `bfcol_74` + FROM `bfcte_5` +) +SELECT + `bfcol_66` AS `rowindex`, + `bfcol_67` AS `int64_col`, + `bfcol_68` AS `float64_col`, + `bfcol_69` AS `int_round_0`, + `bfcol_70` AS `int_round_1`, + `bfcol_71` AS `int_round_m1`, + `bfcol_72` AS `float_round_0`, + `bfcol_73` AS `float_round_1`, + `bfcol_74` AS `float_round_m1` +FROM `bfcte_6` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py index fe9a53a558..ab9fe53092 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_numeric_ops.py @@ -167,6 +167,20 @@ def test_pos(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_round(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "float64_col"]] + + bf_df["int_round_0"] = bf_df["int64_col"].round(0) + bf_df["int_round_1"] = bf_df["int64_col"].round(1) + bf_df["int_round_m1"] = bf_df["int64_col"].round(-1) + + bf_df["float_round_0"] = bf_df["float64_col"].round(0) + bf_df["float_round_1"] = bf_df["float64_col"].round(1) + bf_df["float_round_m1"] = bf_df["float64_col"].round(-1) + + snapshot.assert_match(bf_df.sql, "out.sql") + + def test_sqrt(scalar_types_df: bpd.DataFrame, snapshot): col_name = "float64_col" bf_df = scalar_types_df[[col_name]] From d99f1ef9fac916edf9f3a3113a2da7a156a7c147 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Wed, 29 Oct 2025 12:03:45 -0700 Subject: [PATCH 40/53] fix: Improve error handling in blob operations (#2194) * add error handling for audio_transcribe * add error handling for pdf functions * add eror handling for image functions * final touch * restore rename * update notebook to better reflect our new code change * return None on error with verbose=False for image functions * define typing module in udf * only use local variable * Refactor code --- bigframes/blob/_functions.py | 285 ++++++++---- bigframes/operations/blob.py | 137 ++++-- .../multimodal/multimodal_dataframe.ipynb | 426 ++++++++++++++---- 3 files changed, 625 insertions(+), 223 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 2a11974b8d..3dfe38811b 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -14,6 +14,7 @@ from dataclasses import dataclass import inspect +import typing from typing import Callable, Iterable, Union import google.cloud.bigquery as bigquery @@ -70,6 +71,12 @@ def _input_bq_signature(self): def _output_bq_type(self): sig = inspect.signature(self._func) + return_annotation = sig.return_annotation + origin = typing.get_origin(return_annotation) + if origin is Union: + args = typing.get_args(return_annotation) + if len(args) == 2 and args[1] is type(None): + return _PYTHON_TO_BQ_TYPES[args[0]] return _PYTHON_TO_BQ_TYPES[sig.return_annotation] def _create_udf(self): @@ -78,7 +85,7 @@ def _create_udf(self): self._session._anon_dataset_manager.generate_unique_resource_id() ) - func_body = inspect.getsource(self._func) + func_body = "import typing\n" + inspect.getsource(self._func) func_name = self._func.__name__ packages = str(list(self._requirements)) @@ -120,43 +127,50 @@ def udf(self): def exif_func(src_obj_ref_rt: str, verbose: bool) -> str: - import io - import json + try: + import io + import json - from PIL import ExifTags, Image - import requests - from 
requests import adapters + from PIL import ExifTags, Image + import requests + from requests import adapters - result_dict = {"status": "", "content": "{}"} - try: session = requests.Session() session.mount("https://", adapters.HTTPAdapter(max_retries=3)) src_obj_ref_rt_json = json.loads(src_obj_ref_rt) - src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content image = Image.open(io.BytesIO(bts)) exif_data = image.getexif() exif_dict = {} + if exif_data: for tag, value in exif_data.items(): tag_name = ExifTags.TAGS.get(tag, tag) - # Pillow might return bytes, which are not serializable. - if isinstance(value, bytes): - value = value.decode("utf-8", "replace") - exif_dict[tag_name] = value - result_dict["content"] = json.dumps(exif_dict) - except Exception as e: - result_dict["status"] = str(e) + # Convert non-serializable types to strings + try: + json.dumps(value) + exif_dict[tag_name] = value + except (TypeError, ValueError): + exif_dict[tag_name] = str(value) + + if verbose: + return json.dumps({"status": "", "content": json.dumps(exif_dict)}) + else: + return json.dumps(exif_dict) - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + # Return error as JSON with error field + error_result = {"status": f"{type(e).__name__}: {str(e)}", "content": "{}"} + if verbose: + return json.dumps(error_result) + else: + return "{}" exif_func_def = FunctionDef(exif_func, ["pillow", "requests"]) @@ -170,12 +184,10 @@ def image_blur_func( ksize_y: int, ext: str, verbose: bool, -) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - +) -> typing.Optional[str]: try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -193,35 +205,52 @@ def image_blur_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() # Raise exception for HTTP errors bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) + img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - bts = cv.imencode(ext, img_blurred)[1].tobytes() + success, encoded = cv.imencode(ext, img_blurred) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, - headers={ - "Content-Type": content_type, - }, + headers={"Content-Type": content_type}, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } + return json.dumps(error_result) + else: + return None image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) @@ -233,9 +262,6 @@ def image_blur_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # 
type: ignore import numpy as np @@ -251,22 +277,36 @@ def image_blur_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_blurred = cv.blur(img, ksize=(ksize_x, ksize_y)) - content = cv.imencode(ext, img_blurred)[1].tobytes() + success, encoded = cv.imencode(ext, img_blurred) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() + + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] except Exception as e: - status = str(e) - - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_blur_to_bytes_def = FunctionDef( @@ -283,12 +323,10 @@ def image_resize_func( fy: float, ext: str, verbose: bool, -) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - +) -> typing.Optional[str]: try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -306,20 +344,28 @@ def image_resize_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - bts = cv.imencode(ext, img_resized)[1].tobytes() + success, encoded = cv.imencode(ext, img_resized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, headers={ @@ -327,14 +373,22 @@ def image_resize_func( }, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } + return json.dumps(error_result) + else: + return None image_resize_def = FunctionDef( @@ -354,9 +408,6 @@ def image_resize_to_bytes_func( import base64 import json - status = "" - content = b"" - try: import cv2 as cv # type: ignore import numpy as np @@ -372,22 +423,36 @@ def image_resize_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + 
response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_resized = cv.resize(img, dsize=(dsize_x, dsize_y), fx=fx, fy=fy) - content = cv.imencode(".jpeg", img_resized)[1].tobytes() + success, encoded = cv.imencode(ext, img_resized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() + + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] except Exception as e: - status = str(e) - - encoded_content = base64.b64encode(content).decode("utf-8") - result_dict = {"status": status, "content": encoded_content} - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_resize_to_bytes_def = FunctionDef( @@ -403,12 +468,10 @@ def image_normalize_func( norm_type: str, ext: str, verbose: bool, -) -> str: - import json - - result_dict = {"status": "", "content": dst_obj_ref_rt} - +) -> typing.Optional[str]: try: + import json + import cv2 as cv # type: ignore import numpy as np import requests @@ -433,22 +496,30 @@ def image_normalize_func( dst_url = dst_obj_ref_rt_json["access_urls"]["write_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(ext, img_normalized)[1].tobytes() + success, encoded = cv.imencode(ext, img_normalized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + bts = encoded.tobytes() ext = ext.replace(".", "") ext_mappings = {"jpg": "jpeg", "tif": "tiff"} ext = ext_mappings.get(ext, ext) content_type = "image/" + ext - session.put( + put_response = session.put( url=dst_url, data=bts, headers={ @@ -456,14 +527,22 @@ def image_normalize_func( }, timeout=30, ) + put_response.raise_for_status() - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps({"status": "", "content": dst_obj_ref_rt}) + else: + return dst_obj_ref_rt - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + if verbose: + error_result = { + "status": f"Error: {type(e).__name__}: {str(e)}", + "content": "", + } + return json.dumps(error_result) + else: + return None image_normalize_def = FunctionDef( @@ -482,8 +561,6 @@ def image_normalize_to_bytes_func( import base64 import json - result_dict = {"status": "", "content": ""} - try: import cv2 as cv # type: ignore import numpy as np @@ -506,25 +583,39 @@ def image_normalize_to_bytes_func( src_url = src_obj_ref_rt_json["access_urls"]["read_url"] response = session.get(src_url, timeout=30) + response.raise_for_status() bts = response.content nparr = np.frombuffer(bts, np.uint8) img = 
cv.imdecode(nparr, cv.IMREAD_UNCHANGED) + if img is None: + raise ValueError( + "Failed to decode image - possibly corrupted or unsupported format" + ) img_normalized = cv.normalize( img, None, alpha=alpha, beta=beta, norm_type=norm_type_mapping[norm_type] ) - bts = cv.imencode(".jpeg", img_normalized)[1].tobytes() + success, encoded = cv.imencode(ext, img_normalized) + if not success: + raise ValueError(f"Failed to encode image with extension {ext}") + content = encoded.tobytes() - content_b64 = base64.b64encode(bts).decode("utf-8") - result_dict["content"] = content_b64 + encoded_content = base64.b64encode(content).decode("utf-8") + result_dict = {"status": "", "content": encoded_content} - except Exception as e: - result_dict["status"] = str(e) + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] - if verbose: - return json.dumps(result_dict) - else: - return result_dict["content"] + except Exception as e: + status = f"Error: {type(e).__name__}: {str(e)}" + encoded_content = base64.b64encode(b"").decode("utf-8") + result_dict = {"status": status, "content": encoded_content} + if verbose: + return json.dumps(result_dict) + else: + return result_dict["content"] image_normalize_to_bytes_def = FunctionDef( diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 1f6b75a8f5..577de458f4 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -193,6 +193,20 @@ def _df_apply_udf( return s + def _apply_udf_or_raise_error( + self, df: bigframes.dataframe.DataFrame, udf, operation_name: str + ) -> bigframes.series.Series: + """Helper to apply UDF with consistent error handling.""" + try: + res = self._df_apply_udf(df, udf) + except Exception as e: + raise RuntimeError(f"{operation_name} UDF execution failed: {e}") from e + + if res is None: + raise RuntimeError(f"{operation_name} returned None result") + + return res + def read_url(self) -> bigframes.series.Series: """Retrieve the read URL of the Blob. @@ -343,6 +357,10 @@ def exif( Returns: bigframes.series.Series: JSON series of key-value pairs if verbose=False, or struct with status and content if verbose=True. + + Raises: + ValueError: If engine is not 'pillow'. + RuntimeError: If EXIF extraction fails or returns invalid structure. 
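+
+        Example (an illustrative sketch; assumes ``s`` is a Series of image blobs):
+            ``s.blob.exif(engine="pillow", verbose=True)``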
""" if engine is None or engine.casefold() != "pillow": raise ValueError("Must specify the engine, supported value is 'pillow'.") @@ -364,22 +382,28 @@ def exif( container_memory=container_memory, ).udf() - res = self._df_apply_udf(df, exif_udf) + res = self._apply_udf_or_raise_error(df, exif_udf, "EXIF extraction") if verbose: - exif_content_series = bbq.parse_json( - res._apply_unary_op(ops.JSONValue(json_path="$.content")) - ).rename("exif_content") - exif_status_series = res._apply_unary_op( - ops.JSONValue(json_path="$.status") - ) + try: + exif_content_series = bbq.parse_json( + res._apply_unary_op(ops.JSONValue(json_path="$.content")) + ).rename("exif_content") + exif_status_series = res._apply_unary_op( + ops.JSONValue(json_path="$.status") + ) + except Exception as e: + raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e results_df = bpd.DataFrame( {"status": exif_status_series, "content": exif_content_series} ) results_struct = bbq.struct(results_df).rename("exif_results") return results_struct else: - return bbq.parse_json(res) + try: + return bbq.parse_json(res) + except Exception as e: + raise RuntimeError(f"Failed to parse EXIF JSON result: {e}") from e def image_blur( self, @@ -411,6 +435,10 @@ def image_blur( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image blur operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -437,7 +465,7 @@ def image_blur( df["ksize_x"], df["ksize_y"] = ksize df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_blur_udf) + res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur") if verbose: blurred_content_b64_series = res._apply_unary_op( @@ -486,7 +514,7 @@ def image_blur( df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_blur_udf) + res = self._apply_udf_or_raise_error(df, image_blur_udf, "Image blur") res.cache() # to execute the udf if verbose: @@ -540,6 +568,10 @@ def image_resize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image resize operation fails. 
""" if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -570,11 +602,11 @@ def image_resize( container_memory=container_memory, ).udf() - df["dsize_x"], df["dsizye_y"] = dsize + df["dsize_x"], df["dsize_y"] = dsize df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_resize_udf) + res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize") if verbose: resized_content_b64_series = res._apply_unary_op( @@ -620,12 +652,12 @@ def image_resize( dst_rt = dst.blob.get_runtime_json_str(mode="RW") df = df.join(dst_rt, how="outer") - df["dsize_x"], df["dsizye_y"] = dsize + df["dsize_x"], df["dsize_y"] = dsize df["fx"], df["fy"] = fx, fy df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_resize_udf) + res = self._apply_udf_or_raise_error(df, image_resize_udf, "Image resize") res.cache() # to execute the udf if verbose: @@ -679,6 +711,10 @@ def image_normalize( Returns: bigframes.series.Series: blob Series if destination is GCS. Or bytes Series if destination is BQ. If verbose=True, returns struct with status and content. + + Raises: + ValueError: If engine is not 'opencv' or parameters are invalid. + RuntimeError: If image normalize operation fails. """ if engine is None or engine.casefold() != "opencv": raise ValueError("Must specify the engine, supported value is 'opencv'.") @@ -707,7 +743,9 @@ def image_normalize( df["norm_type"] = norm_type df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_normalize_udf) + res = self._apply_udf_or_raise_error( + df, image_normalize_udf, "Image normalize" + ) if verbose: normalized_content_b64_series = res._apply_unary_op( @@ -758,7 +796,7 @@ def image_normalize( df["ext"] = ext # type: ignore df["verbose"] = verbose - res = self._df_apply_udf(df, image_normalize_udf) + res = self._apply_udf_or_raise_error(df, image_normalize_udf, "Image normalize") res.cache() # to execute the udf if verbose: @@ -809,6 +847,10 @@ def pdf_extract( depend on the "verbose" parameter. Contains the extracted text from the PDF file. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'pypdf'. + RuntimeError: If PDF extraction fails or returns invalid structure. 
""" if engine is None or engine.casefold() != "pypdf": raise ValueError("Must specify the engine, supported value is 'pypdf'.") @@ -830,18 +872,29 @@ def pdf_extract( df = self.get_runtime_json_str(mode="R").to_frame() df["verbose"] = verbose - res = self._df_apply_udf(df, pdf_extract_udf) + + res = self._apply_udf_or_raise_error(df, pdf_extract_udf, "PDF extraction") if verbose: - extracted_content_series = res._apply_unary_op( - ops.JSONValue(json_path="$.content") - ) - status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) - results_df = bpd.DataFrame( - {"status": status_series, "content": extracted_content_series} - ) - results_struct = bbq.struct(results_df).rename("extracted_results") - return results_struct + # Extract content with error handling + try: + content_series = res._apply_unary_op( + ops.JSONValue(json_path="$.content") + ) + except Exception as e: + raise RuntimeError( + f"Failed to extract content field from PDF result: {e}" + ) from e + try: + status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + except Exception as e: + raise RuntimeError( + f"Failed to extract status field from PDF result: {e}" + ) from e + + res_df = bpd.DataFrame({"status": status_series, "content": content_series}) + struct_series = bbq.struct(res_df).rename("extracted_results") + return struct_series else: return res.rename("extracted_content") @@ -884,6 +937,10 @@ def pdf_chunk( depend on the "verbose" parameter. where each string is a chunk of text extracted from PDF. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'pypdf'. + RuntimeError: If PDF chunking fails or returns invalid structure. """ if engine is None or engine.casefold() != "pypdf": raise ValueError("Must specify the engine, supported value is 'pypdf'.") @@ -915,13 +972,25 @@ def pdf_chunk( df["overlap_size"] = overlap_size df["verbose"] = verbose - res = self._df_apply_udf(df, pdf_chunk_udf) + res = self._apply_udf_or_raise_error(df, pdf_chunk_udf, "PDF chunking") + + try: + content_series = bbq.json_extract_string_array(res, "$.content") + except Exception as e: + raise RuntimeError( + f"Failed to extract content array from PDF chunk result: {e}" + ) from e if verbose: - chunked_content_series = bbq.json_extract_string_array(res, "$.content") - status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + try: + status_series = res._apply_unary_op(ops.JSONValue(json_path="$.status")) + except Exception as e: + raise RuntimeError( + f"Failed to extract status field from PDF chunk result: {e}" + ) from e + results_df = bpd.DataFrame( - {"status": status_series, "content": chunked_content_series} + {"status": status_series, "content": content_series} ) resultes_struct = bbq.struct(results_df).rename("chunked_results") return resultes_struct @@ -962,6 +1031,10 @@ def audio_transcribe( depend on the "verbose" parameter. Contains the transcribed text from the audio file. Includes error messages if verbosity is enabled. + + Raises: + ValueError: If engine is not 'bigquery'. + RuntimeError: If the transcription result structure is invalid. 
""" if engine.casefold() != "bigquery": raise ValueError("Must specify the engine, supported value is 'bigquery'.") @@ -984,6 +1057,10 @@ def audio_transcribe( model_params={"generationConfig": {"temperature": 0.0}}, ) + # Validate that the result is not None + if transcribed_results is None: + raise RuntimeError("Transcription returned None result") + transcribed_content_series = transcribed_results.struct.field("result").rename( "transcribed_content" ) diff --git a/notebooks/multimodal/multimodal_dataframe.ipynb b/notebooks/multimodal/multimodal_dataframe.ipynb index c04463fc4c..0822ee4c2d 100644 --- a/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/notebooks/multimodal/multimodal_dataframe.ipynb @@ -60,7 +60,8 @@ "2. Combine unstructured data with structured data\n", "3. Conduct image transformations\n", "4. Use LLM models to ask questions and generate embeddings on images\n", - "5. PDF chunking function" + "5. PDF chunking function\n", + "6. Transcribe audio" ] }, { @@ -215,23 +216,23 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
0
1
2
3
4
\n", @@ -297,21 +298,21 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. 
Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", @@ -351,7 +352,7 @@ " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", @@ -359,7 +360,7 @@ " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", @@ -367,7 +368,7 @@ " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", @@ -375,7 +376,7 @@ " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", @@ -383,7 +384,7 @@ " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", @@ -463,7 +464,7 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:121: UserWarning: The `json_extract` is deprecated and will be removed in a future\n", "version. Use `json_query` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" ] @@ -471,7 +472,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -483,7 +484,7 @@ { "data": { "text/html": [ - "" + "" ], "text/plain": [ "" @@ -527,19 +528,19 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", 
"/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n" ] } @@ -579,7 +580,7 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n" ] } @@ -589,9 +590,119 @@ "df_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", engine=\"opencv\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using `verbose` mode for detailed output\\n\n", + "\\n\n", + "All multimodal functions support a `verbose` parameter, which defaults to `False`.\\n\n", + "\\n\n", + "* When `verbose=False` (the default), the function will only return the main content of the result (e.g., the transformed image, the extracted text).\\n\n", + "* When `verbose=True`, the function returns a `STRUCT` containing two fields:\\n\n", + " * `content`: The main result of the operation.\\n\n", + " * `status`: An informational field. If the operation is successful, this will be empty. If an error occurs during the processing of a specific row, this field will contain the error message, allowing the overall job to complete without failing.\\n\n", + "\\n\n", + "Using `verbose=True` is highly recommended for debugging and for workflows where you need to handle potential failures on a row-by-row basis. Let's see it in action with the `image_blur` function." 
+ ] + }, { "cell_type": "code", "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
blurred_verbose
0{'status': '', 'content': {'uri': 'gs://bigfra...
1{'status': '', 'content': {'uri': 'gs://bigfra...
2{'status': '', 'content': {'uri': 'gs://bigfra...
3{'status': '', 'content': {'uri': 'gs://bigfra...
4{'status': '', 'content': {'uri': 'gs://bigfra...
\n", + "

5 rows × 1 columns

\n", + "
[5 rows x 1 columns in total]" + ], + "text/plain": [ + " blurred_verbose\n", + "0 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "1 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "2 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "3 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "4 {'status': '', 'content': {'uri': 'gs://bigfra...\n", + "\n", + "[5 rows x 1 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_image[\"blurred_verbose\"] = df_image[\"image\"].blob.image_blur(\n", + " (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed_verbose/\", engine=\"opencv\", verbose=True\n", + ")\n", + "df_image[[\"blurred_verbose\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -657,73 +768,79 @@ " resized\n", " normalized\n", " blur_resized\n", + " blurred_verbose\n", " \n", " \n", " \n", " \n", " 0\n", - " \n", + " \n", " alice\n", " image/png\n", " 1591240\n", " 2025-03-20 17:45:04+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-paw-balm.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 1\n", - " \n", + " \n", " bob\n", " image/png\n", " 1182951\n", " 2025-03-20 17:45:02+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/k9-guard-dog-hot-spot-spray.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 2\n", - " \n", + " \n", " bob\n", " image/png\n", " 1520884\n", " 2025-03-20 17:44:55+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/fluffy-buns-chinchilla-food-variety-pack.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 3\n", - " \n", + " \n", " alice\n", " image/png\n", " 1235401\n", " 2025-03-20 17:45:19+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/purrfect-perch-cat-scratcher.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", " 4\n", - " \n", + " \n", " bob\n", " image/png\n", " 1591923\n", " 2025-03-20 17:44:47+00:00\n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " {'status': '', 'content': {'uri': 'gs://bigframes_blob_test/image_blur_transformed_verbose/chirpy-seed-deluxe-bird-food.png', 'version': None, 'authorizer': 'bigframes-dev.us.bigframes-default-connection', 'details': None}}\n", " \n", " \n", "\n", - "

5 rows × 9 columns

\n", - "[5 rows x 9 columns in total]" + "

5 rows × 10 columns

\n", + "[5 rows x 10 columns in total]" ], "text/plain": [ " image author content_type \\\n", @@ -761,17 +878,24 @@ "3 {'uri': 'gs://bigframes_blob_test/image_normal... \n", "4 {'uri': 'gs://bigframes_blob_test/image_normal... \n", "\n", - " blur_resized \n", - "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", - "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + " blur_resized \\\n", + "0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", + "4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n", "\n", - "[5 rows x 9 columns]" + " blurred_verbose \n", + "0 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "1 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "2 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "3 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "4 {'status': '', 'content': {'uri': 'gs://bigfra... \n", + "\n", + "[5 rows x 10 columns]" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "id": "mRUGfcaFVW-3" }, @@ -800,7 +924,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n" @@ -814,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -874,13 +998,13 @@ " \n", " \n", " 0\n", - " The item is a tin of K9Guard Dog Paw Balm.\n", - " \n", + " The item is a tin of K9 Guard dog paw balm.\n", + " \n", " \n", " \n", " 1\n", - " The item is a bottle of K9 Guard Dog Hot Spot Spray.\n", - " \n", + " The item is K9 Guard Dog Hot Spot Spray.\n", + " \n", " \n", " \n", "\n", @@ -888,9 +1012,9 @@ "[2 rows x 2 columns in total]" ], "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 The item is a tin of K9Guard Dog Paw Balm. \n", - "1 The item is a bottle of K9 Guard Dog Hot Spot ... \n", + " ml_generate_text_llm_result \\\n", + "0 The item is a tin of K9 Guard dog paw balm. \n", + "1 The item is K9 Guard Dog Hot Spot Spray. \n", "\n", " image \n", "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... 
\n", @@ -899,7 +1023,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -913,7 +1037,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "id": "IG3J3HsKhyBY" }, @@ -936,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -996,13 +1120,13 @@ " \n", " \n", " 0\n", - " The item is dog paw balm.\n", - " \n", + " The item is a tin of K9Guard Dog Paw Balm.\n", + " \n", " \n", " \n", " 1\n", - " The picture features a white bottle with a light blue spray nozzle and accents. The background is a neutral gray.\\n\n", - " \n", + " The bottle is mostly white, with a light blue accents. The background is a light gray. There are also black and green elements on the bottle's label.\n", + " \n", " \n", " \n", "\n", @@ -1011,8 +1135,8 @@ ], "text/plain": [ " ml_generate_text_llm_result \\\n", - "0 The item is dog paw balm. \n", - "1 The picture features a white bottle with a lig... \n", + "0 The item is a tin of K9Guard Dog Paw Balm. \n", + "1 The bottle is mostly white, with a light blue ... \n", "\n", " image \n", "0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n", @@ -1021,7 +1145,7 @@ "[2 rows x 2 columns]" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1033,7 +1157,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -1047,7 +1171,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", "default model will be removed in BigFrames 3.0. Please supply an\n", "explicit model to avoid this message.\n", " return method(*args, **kwargs)\n", @@ -1096,19 +1220,19 @@ " \n", " \n", " 0\n", - " [ 0.00638846 0.01666372 0.00451786 ... -0.02...\n", + " [ 0.00638842 0.01666344 0.00451782 ... -0.02...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", " \n", " \n", " 1\n", - " [ 0.0097399 0.0214815 0.00244266 ... 0.00...\n", + " [ 0.00973689 0.02148374 0.00244311 ... 0.00...\n", " \n", " <NA>\n", " <NA>\n", - " {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2...\n", + " {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2...\n", " \n", " \n", "\n", @@ -1117,8 +1241,8 @@ ], "text/plain": [ " ml_generate_embedding_result \\\n", - "0 [ 0.00638846 0.01666372 0.00451786 ... -0.02... \n", - "1 [ 0.0097399 0.0214815 0.00244266 ... 0.00... \n", + "0 [ 0.00638842 0.01666344 0.00451782 ... -0.02... \n", + "1 [ 0.00973689 0.02148374 0.00244311 ... 0.00... \n", "\n", " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", "0 \n", @@ -1129,13 +1253,13 @@ "1 \n", "\n", " content \n", - "0 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2025-10-09T12:2... \n", + "0 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... 
\n", + "1 {\"access_urls\":{\"expiry_time\":\"2025-10-25T00:2... \n", "\n", "[2 rows x 5 columns]" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1158,7 +1282,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "id": "oDDuYtUm5Yiy" }, @@ -1180,7 +1304,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -1197,9 +1321,12 @@ "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:180: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:244: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", "future version. Use `json_value_array` instead.\n", " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" ] @@ -1211,7 +1338,78 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/core/log_adapter.py:182: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/bigquery/_operations/json.py:239: UserWarning: The `json_extract_string_array` is deprecated and will be removed in a\n", + "future version. Use `json_value_array` instead.\n", + " warnings.warn(bfe.format_message(msg), category=UserWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
chunked_verbose
0{'status': '', 'content': array([\"CritterCuisi...
\n", + "

1 rows × 1 columns

\n", + "
[1 rows x 1 columns in total]" + ], + "text/plain": [ + " chunked_verbose\n", + "0 {'status': '', 'content': array([\"CritterCuisi...\n", + "\n", + "[1 rows x 1 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pdf[\"chunked_verbose\"] = df_pdf[\"pdf\"].blob.pdf_chunk(engine=\"pypdf\", verbose=True)\n", + "df_pdf[[\"chunked_verbose\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "id": "kaPvJATN7zlw" }, @@ -1239,7 +1437,7 @@ "Name: chunked, dtype: string" ] }, - "execution_count": 18, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1258,7 +1456,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1279,7 +1477,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1303,7 +1501,7 @@ "Name: transcribed_content, dtype: string" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1312,6 +1510,42 @@ "transcribed_series = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=False)\n", "transcribed_series" ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/github.com/googleapis/python-bigquery-dataframes/bigframes/dtypes.py:959: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/plain": [ + "0 {'status': '', 'content': 'Now, as all books, ...\n", + "Name: transcription_results, dtype: struct[pyarrow]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "transcribed_series_verbose = df['audio'].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", + "transcribed_series_verbose" + ] } ], "metadata": { From e0ac827874a1c22092154680ccf1fc20dc5d6904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Wed, 29 Oct 2025 16:52:45 -0500 Subject: [PATCH 41/53] refactor: update geo "spec" and split geo ops in ibis compiler (#2208) --- .../core/compile/ibis_compiler/__init__.py | 1 + .../ibis_compiler/operations/geo_ops.py | 159 ++++++++++++++++++ .../ibis_compiler/scalar_op_registry.py | 134 --------------- specs/2025-08-04-geoseries-scalars.md | 13 +- 4 files changed, 168 insertions(+), 139 deletions(-) create mode 100644 bigframes/core/compile/ibis_compiler/operations/geo_ops.py diff --git a/bigframes/core/compile/ibis_compiler/__init__.py b/bigframes/core/compile/ibis_compiler/__init__.py index aef0ed9267..6b9d284c53 100644 --- a/bigframes/core/compile/ibis_compiler/__init__.py +++ b/bigframes/core/compile/ibis_compiler/__init__.py 
@@ -21,4 +21,5 @@ from __future__ import annotations import bigframes.core.compile.ibis_compiler.operations.generic_ops # noqa: F401 +import bigframes.core.compile.ibis_compiler.operations.geo_ops # noqa: F401 import bigframes.core.compile.ibis_compiler.scalar_op_registry # noqa: F401 diff --git a/bigframes/core/compile/ibis_compiler/operations/geo_ops.py b/bigframes/core/compile/ibis_compiler/operations/geo_ops.py new file mode 100644 index 0000000000..f9155fed5a --- /dev/null +++ b/bigframes/core/compile/ibis_compiler/operations/geo_ops.py @@ -0,0 +1,159 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from typing import cast + +from bigframes_vendored.ibis.expr import types as ibis_types +import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes +import bigframes_vendored.ibis.expr.operations.udf as ibis_udf + +from bigframes.core.compile.ibis_compiler import scalar_op_compiler +from bigframes.operations import geo_ops as ops + +register_unary_op = scalar_op_compiler.scalar_op_compiler.register_unary_op +register_binary_op = scalar_op_compiler.scalar_op_compiler.register_binary_op + + +# Geo Ops +@register_unary_op(ops.geo_area_op) +def geo_area_op_impl(x: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).area() + + +@register_unary_op(ops.geo_st_astext_op) +def geo_st_astext_op_impl(x: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).as_text() + + +@register_unary_op(ops.geo_st_boundary_op, pass_op=False) +def geo_st_boundary_op_impl(x: ibis_types.Value): + return st_boundary(x) + + +@register_unary_op(ops.GeoStBufferOp, pass_op=True) +def geo_st_buffer_op_impl(x: ibis_types.Value, op: ops.GeoStBufferOp): + return st_buffer( + x, + op.buffer_radius, + op.num_seg_quarter_circle, + op.use_spheroid, + ) + + +@register_unary_op(ops.geo_st_centroid_op, pass_op=False) +def geo_st_centroid_op_impl(x: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).centroid() + + +@register_unary_op(ops.geo_st_convexhull_op, pass_op=False) +def geo_st_convexhull_op_impl(x: ibis_types.Value): + return st_convexhull(x) + + +@register_binary_op(ops.geo_st_difference_op, pass_op=False) +def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).difference( + cast(ibis_types.GeoSpatialValue, y) + ) + + +@register_binary_op(ops.GeoStDistanceOp, pass_op=True) +def geo_st_distance_op_impl( + x: ibis_types.Value, y: ibis_types.Value, op: ops.GeoStDistanceOp +): + return st_distance(x, y, op.use_spheroid) + + +@register_unary_op(ops.geo_st_geogfromtext_op) +def geo_st_geogfromtext_op_impl(x: ibis_types.Value): + # Ibis doesn't seem to provide a dedicated method to cast from string to geography, + # so we use a BigQuery scalar function, st_geogfromtext(), directly. 
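+    # st_geogfromtext is declared later in this file with
+    # @ibis_udf.scalar.builtin, so this call compiles directly to the
+    # BigQuery ST_GEOGFROMTEXT builtin.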
+ return st_geogfromtext(x) + + +@register_binary_op(ops.geo_st_geogpoint_op, pass_op=False) +def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return cast(ibis_types.NumericValue, x).point(cast(ibis_types.NumericValue, y)) + + +@register_binary_op(ops.geo_st_intersection_op, pass_op=False) +def geo_st_intersection_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).intersection( + cast(ibis_types.GeoSpatialValue, y) + ) + + +@register_unary_op(ops.geo_st_isclosed_op, pass_op=False) +def geo_st_isclosed_op_impl(x: ibis_types.Value): + return st_isclosed(x) + + +@register_unary_op(ops.geo_x_op) +def geo_x_op_impl(x: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).x() + + +@register_unary_op(ops.GeoStLengthOp, pass_op=True) +def geo_length_op_impl(x: ibis_types.Value, op: ops.GeoStLengthOp): + # Call the st_length UDF defined in this file (or imported) + return st_length(x, op.use_spheroid) + + +@register_unary_op(ops.geo_y_op) +def geo_y_op_impl(x: ibis_types.Value): + return cast(ibis_types.GeoSpatialValue, x).y() + + +@ibis_udf.scalar.builtin +def st_convexhull(x: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore + """ST_CONVEXHULL""" + ... + + +@ibis_udf.scalar.builtin +def st_geogfromtext(a: str) -> ibis_dtypes.geography: # type: ignore + """Convert string to geography.""" + + +@ibis_udf.scalar.builtin +def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore + """Find the boundary of a geography.""" + + +@ibis_udf.scalar.builtin +def st_buffer( + geography: ibis_dtypes.geography, # type: ignore + buffer_radius: ibis_dtypes.Float64, + num_seg_quarter_circle: ibis_dtypes.Float64, + use_spheroid: ibis_dtypes.Boolean, +) -> ibis_dtypes.geography: # type: ignore + ... + + +@ibis_udf.scalar.builtin +def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore + """Convert string to geography.""" + + +@ibis_udf.scalar.builtin +def st_length(geog: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore + """ST_LENGTH BQ builtin. 
This body is never executed.""" + pass + + +@ibis_udf.scalar.builtin +def st_isclosed(a: ibis_dtypes.geography) -> ibis_dtypes.boolean: # type: ignore + """Checks if a geography is closed.""" diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index e983fc7e21..0876722990 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -837,98 +837,6 @@ def normalize_op_impl(x: ibis_types.Value): return result.cast(result_type) -# Geo Ops -@scalar_op_compiler.register_unary_op(ops.geo_area_op) -def geo_area_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).area() - - -@scalar_op_compiler.register_unary_op(ops.geo_st_astext_op) -def geo_st_astext_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).as_text() - - -@scalar_op_compiler.register_unary_op(ops.geo_st_boundary_op, pass_op=False) -def geo_st_boundary_op_impl(x: ibis_types.Value): - return st_boundary(x) - - -@scalar_op_compiler.register_unary_op(ops.GeoStBufferOp, pass_op=True) -def geo_st_buffer_op_impl(x: ibis_types.Value, op: ops.GeoStBufferOp): - return st_buffer( - x, - op.buffer_radius, - op.num_seg_quarter_circle, - op.use_spheroid, - ) - - -@scalar_op_compiler.register_unary_op(ops.geo_st_centroid_op, pass_op=False) -def geo_st_centroid_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).centroid() - - -@scalar_op_compiler.register_unary_op(ops.geo_st_convexhull_op, pass_op=False) -def geo_st_convexhull_op_impl(x: ibis_types.Value): - return st_convexhull(x) - - -@scalar_op_compiler.register_binary_op(ops.geo_st_difference_op, pass_op=False) -def geo_st_difference_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).difference( - typing.cast(ibis_types.GeoSpatialValue, y) - ) - - -@scalar_op_compiler.register_binary_op(ops.GeoStDistanceOp, pass_op=True) -def geo_st_distance_op_impl( - x: ibis_types.Value, y: ibis_types.Value, op: ops.GeoStDistanceOp -): - return st_distance(x, y, op.use_spheroid) - - -@scalar_op_compiler.register_unary_op(ops.geo_st_geogfromtext_op) -def geo_st_geogfromtext_op_impl(x: ibis_types.Value): - # Ibis doesn't seem to provide a dedicated method to cast from string to geography, - # so we use a BigQuery scalar function, st_geogfromtext(), directly. 
- return st_geogfromtext(x) - - -@scalar_op_compiler.register_binary_op(ops.geo_st_geogpoint_op, pass_op=False) -def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.NumericValue, x).point( - typing.cast(ibis_types.NumericValue, y) - ) - - -@scalar_op_compiler.register_binary_op(ops.geo_st_intersection_op, pass_op=False) -def geo_st_intersection_op_impl(x: ibis_types.Value, y: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).intersection( - typing.cast(ibis_types.GeoSpatialValue, y) - ) - - -@scalar_op_compiler.register_unary_op(ops.geo_st_isclosed_op, pass_op=False) -def geo_st_isclosed_op_impl(x: ibis_types.Value): - return st_isclosed(x) - - -@scalar_op_compiler.register_unary_op(ops.geo_x_op) -def geo_x_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).x() - - -@scalar_op_compiler.register_unary_op(ops.GeoStLengthOp, pass_op=True) -def geo_length_op_impl(x: ibis_types.Value, op: ops.GeoStLengthOp): - # Call the st_length UDF defined in this file (or imported) - return st_length(x, op.use_spheroid) - - -@scalar_op_compiler.register_unary_op(ops.geo_y_op) -def geo_y_op_impl(x: ibis_types.Value): - return typing.cast(ibis_types.GeoSpatialValue, x).y() - - # Parameterized ops @scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): @@ -2092,17 +2000,6 @@ def _ibis_num(number: float): return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) -@ibis_udf.scalar.builtin -def st_convexhull(x: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore - """ST_CONVEXHULL""" - ... - - -@ibis_udf.scalar.builtin -def st_geogfromtext(a: str) -> ibis_dtypes.geography: # type: ignore - """Convert string to geography.""" - - @ibis_udf.scalar.builtin def timestamp(a: str) -> ibis_dtypes.timestamp: # type: ignore """Convert string to timestamp.""" @@ -2113,32 +2010,6 @@ def unix_millis(a: ibis_dtypes.timestamp) -> int: # type: ignore """Convert a timestamp to milliseconds""" -@ibis_udf.scalar.builtin -def st_boundary(a: ibis_dtypes.geography) -> ibis_dtypes.geography: # type: ignore - """Find the boundary of a geography.""" - - -@ibis_udf.scalar.builtin -def st_buffer( - geography: ibis_dtypes.geography, # type: ignore - buffer_radius: ibis_dtypes.Float64, - num_seg_quarter_circle: ibis_dtypes.Float64, - use_spheroid: ibis_dtypes.Boolean, -) -> ibis_dtypes.geography: # type: ignore - ... - - -@ibis_udf.scalar.builtin -def st_distance(a: ibis_dtypes.geography, b: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore - """Convert string to geography.""" - - -@ibis_udf.scalar.builtin -def st_length(geog: ibis_dtypes.geography, use_spheroid: bool) -> ibis_dtypes.float: # type: ignore - """ST_LENGTH BQ builtin. 
This body is never executed.""" - pass - - @ibis_udf.scalar.builtin def unix_micros(a: ibis_dtypes.timestamp) -> int: # type: ignore """Convert a timestamp to microseconds""" @@ -2272,11 +2143,6 @@ def str_lstrip_op( # type: ignore[empty-body] """Remove leading and trailing characters.""" -@ibis_udf.scalar.builtin -def st_isclosed(a: ibis_dtypes.geography) -> ibis_dtypes.boolean: # type: ignore - """Checks if a geography is closed.""" - - @ibis_udf.scalar.builtin(name="rtrim") def str_rstrip_op( # type: ignore[empty-body] x: ibis_dtypes.String, to_strip: ibis_dtypes.String diff --git a/specs/2025-08-04-geoseries-scalars.md b/specs/2025-08-04-geoseries-scalars.md index 38dc77c4cf..66ed77d0dd 100644 --- a/specs/2025-08-04-geoseries-scalars.md +++ b/specs/2025-08-04-geoseries-scalars.md @@ -267,11 +267,14 @@ Raster functions: Functions for analyzing geospatial rasters using geographies. - [ ] **Export the new operation:** - [ ] In `bigframes/operations/__init__.py`, import your new operation dataclass and add it to the `__all__` list. - [ ] **Implement the compilation logic:** - - [ ] In `bigframes/core/compile/scalar_op_compiler.py`: - - [ ] If the BigQuery function has a direct equivalent in Ibis, you can often reuse an existing Ibis method. - - [ ] If not, define a new Ibis UDF using `@ibis_udf.scalar.builtin` to map to the specific BigQuery function signature. - - [ ] Create a new compiler implementation function (e.g., `geo_length_op_impl`). - - [ ] Register this function to your operation dataclass using `@scalar_op_compiler.register_unary_op` or `@scalar_op_compiler.register_binary_op`. + - [ ] In `bigframes/core/compile/ibis_compiler/operations/geo_ops.py`: + - [ ] If the BigQuery function has a direct equivalent in Ibis, you can often reuse an existing Ibis method. + - [ ] If not, define a new Ibis UDF using `@ibis_udf.scalar.builtin` to map to the specific BigQuery function signature. + - [ ] Create a new compiler implementation function (e.g., `geo_length_op_impl`). + - [ ] Register this function to your operation dataclass using `@register_unary_op` or `@register_binary_op`. + - [ ] In `bigframes/core/compile/sqlglot/expressions/geo_ops.py`: + - [ ] Create a new compiler implementation function that generates the appropriate `sqlglot.exp` expression. + - [ ] Register this function to your operation dataclass using `@register_unary_op` or `@register_binary_op`. - [ ] **Implement the user-facing function or property:** - [ ] For a `bigframes.bigquery` function: - [ ] In `bigframes/bigquery/_operations/geo.py`, create the user-facing function (e.g., `st_length`). 
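
For orientation, the ibis-compiler steps in the checklist above reduce to a small, repeatable pattern. The sketch below is illustrative only: `geo_st_npoints_op` and the `ST_NPOINTS` wrapper are hypothetical names, and the sketch assumes the op dataclass has already been defined and exported from `bigframes/operations/geo_ops.py`, following the conventions shown in the new `ibis_compiler/operations/geo_ops.py` above.

```python
# Minimal sketch of registering a hypothetical geo op in the ibis compiler.
# `geo_st_npoints_op` and `st_npoints` are illustrative names only, not part
# of the actual codebase.
from bigframes_vendored.ibis.expr import types as ibis_types
import bigframes_vendored.ibis.expr.datatypes as ibis_dtypes
import bigframes_vendored.ibis.expr.operations.udf as ibis_udf

from bigframes.core.compile.ibis_compiler import scalar_op_compiler
from bigframes.operations import geo_ops as ops

register_unary_op = scalar_op_compiler.scalar_op_compiler.register_unary_op


# Map a BigQuery builtin that has no direct ibis equivalent.
@ibis_udf.scalar.builtin
def st_npoints(a: ibis_dtypes.geography) -> ibis_dtypes.int64:  # type: ignore
    """ST_NPOINTS BQ builtin. This body is never executed."""


# Register the compiler implementation for the (assumed) op dataclass.
@register_unary_op(ops.geo_st_npoints_op)
def geo_st_npoints_op_impl(x: ibis_types.Value):
    return st_npoints(x)
```

The SQLGlot side of the checklist follows the same shape: a small implementation function that emits the corresponding `sqlglot.exp` expression, registered with the analogous decorator in `bigframes/core/compile/sqlglot/expressions/geo_ops.py`.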
From a538c694499050d8797a77350c387b474d2059fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Thu, 30 Oct 2025 09:36:05 -0500 Subject: [PATCH 42/53] feat: support INFORMATION_SCHEMA views in `read_gbq` (#1895) * feat: support INFORMATION_SCHEMA tables in read_gbq * avoid storage semi executor * use faster tables for peek tests * more tests * fix mypy * Update bigframes/session/_io/bigquery/read_gbq_table.py * immediately query for information_schema tables * Fix mypy errors and temporarily update python version * snapshot * snapshot again --- .../session/_io/bigquery/read_gbq_table.py | 96 +++++++++++++++++-- bigframes/session/loader.py | 30 +++--- bigframes/session/read_api_execution.py | 3 + .../test_read_gbq_information_schema.py | 50 ++++++++++ tests/unit/session/test_session.py | 4 +- 5 files changed, 161 insertions(+), 22 deletions(-) create mode 100644 tests/system/small/pandas/test_read_gbq_information_schema.py diff --git a/bigframes/session/_io/bigquery/read_gbq_table.py b/bigframes/session/_io/bigquery/read_gbq_table.py index f8a379aee9..465fa08187 100644 --- a/bigframes/session/_io/bigquery/read_gbq_table.py +++ b/bigframes/session/_io/bigquery/read_gbq_table.py @@ -28,6 +28,7 @@ import google.cloud.bigquery as bigquery import google.cloud.bigquery.table +import bigframes.core import bigframes.core.events import bigframes.exceptions as bfe import bigframes.session._io.bigquery @@ -37,18 +38,79 @@ import bigframes.session +def _convert_information_schema_table_id_to_table_reference( + table_id: str, + default_project: Optional[str], +) -> bigquery.TableReference: + """Squeeze an INFORMATION_SCHEMA reference into a TableReference. + This is kind-of a hack. INFORMATION_SCHEMA is a view that isn't available + via the tables.get REST API. + """ + parts = table_id.split(".") + parts_casefold = [part.casefold() for part in parts] + dataset_index = parts_casefold.index("INFORMATION_SCHEMA".casefold()) + + if dataset_index == 0: + project = default_project + else: + project = ".".join(parts[:dataset_index]) + + if project is None: + message = ( + "Could not determine project ID. " + "Please provide a project or region in your INFORMATION_SCHEMA table ID, " + "For example, 'region-REGION_NAME.INFORMATION_SCHEMA.JOBS'." + ) + raise ValueError(message) + + dataset = "INFORMATION_SCHEMA" + table_id_short = ".".join(parts[dataset_index + 1 :]) + return bigquery.TableReference( + bigquery.DatasetReference(project, dataset), + table_id_short, + ) + + +def get_information_schema_metadata( + bqclient: bigquery.Client, + table_id: str, + default_project: Optional[str], +) -> bigquery.Table: + job_config = bigquery.QueryJobConfig(dry_run=True) + job = bqclient.query( + f"SELECT * FROM `{table_id}`", + job_config=job_config, + ) + table_ref = _convert_information_schema_table_id_to_table_reference( + table_id=table_id, + default_project=default_project, + ) + table = bigquery.Table.from_api_repr( + { + "tableReference": table_ref.to_api_repr(), + "location": job.location, + # Prevent ourselves from trying to read the table with the BQ + # Storage API. 
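+            # (read_api_execution skips sources that are not physically
+            # stored, so views like this fall back to query execution.)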
+ "type": "VIEW", + } + ) + table.schema = job.schema + return table + + def get_table_metadata( bqclient: bigquery.Client, - table_ref: google.cloud.bigquery.table.TableReference, - bq_time: datetime.datetime, *, - cache: Dict[bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table]], + table_id: str, + default_project: Optional[str], + bq_time: datetime.datetime, + cache: Dict[str, Tuple[datetime.datetime, bigquery.Table]], use_cache: bool = True, publisher: bigframes.core.events.Publisher, ) -> Tuple[datetime.datetime, google.cloud.bigquery.table.Table]: """Get the table metadata, either from cache or via REST API.""" - cached_table = cache.get(table_ref) + cached_table = cache.get(table_id) if use_cache and cached_table is not None: snapshot_timestamp, table = cached_table @@ -90,7 +152,16 @@ def get_table_metadata( return cached_table - table = bqclient.get_table(table_ref) + if is_information_schema(table_id): + table = get_information_schema_metadata( + bqclient=bqclient, table_id=table_id, default_project=default_project + ) + else: + table_ref = google.cloud.bigquery.table.TableReference.from_string( + table_id, default_project=default_project + ) + table = bqclient.get_table(table_ref) + # local time will lag a little bit do to network latency # make sure it is at least table creation time. # This is relevant if the table was created immediately before loading it here. @@ -98,10 +169,21 @@ def get_table_metadata( bq_time = table.created cached_table = (bq_time, table) - cache[table_ref] = cached_table + cache[table_id] = cached_table return cached_table +def is_information_schema(table_id: str): + table_id_casefold = table_id.casefold() + # Include the "."s to ensure we don't have false positives for some user + # defined dataset like MY_INFORMATION_SCHEMA or tables called + # INFORMATION_SCHEMA. 
+ return ( + ".INFORMATION_SCHEMA.".casefold() in table_id_casefold + or table_id_casefold.startswith("INFORMATION_SCHEMA.".casefold()) + ) + + def is_time_travel_eligible( bqclient: bigquery.Client, table: google.cloud.bigquery.table.Table, @@ -168,6 +250,8 @@ def is_time_travel_eligible( msg, category=bfe.TimeTravelDisabledWarning, stacklevel=stacklevel ) return False + elif table.table_type == "VIEW": + return False # table might support time travel, lets do a dry-run query with time travel if should_dry_run: diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 940fdc1352..2d5dec13e6 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -47,6 +47,8 @@ import pandas import pyarrow as pa +import bigframes._tools +import bigframes._tools.strings from bigframes.core import guid, identifiers, local_data, nodes, ordering, utils import bigframes.core as core import bigframes.core.blocks as blocks @@ -272,9 +274,7 @@ def __init__( self._default_index_type = default_index_type self._scan_index_uniqueness = scan_index_uniqueness self._force_total_order = force_total_order - self._df_snapshot: Dict[ - bigquery.TableReference, Tuple[datetime.datetime, bigquery.Table] - ] = {} + self._df_snapshot: Dict[str, Tuple[datetime.datetime, bigquery.Table]] = {} self._metrics = metrics self._publisher = publisher # Unfortunate circular reference, but need to pass reference when constructing objects @@ -629,10 +629,6 @@ def read_gbq_table( _check_duplicates("columns", columns) - table_ref = google.cloud.bigquery.table.TableReference.from_string( - table_id, default_project=self._bqclient.project - ) - columns = list(columns) include_all_columns = columns is None or len(columns) == 0 filters = typing.cast(list, list(filters)) @@ -643,7 +639,8 @@ def read_gbq_table( time_travel_timestamp, table = bf_read_gbq_table.get_table_metadata( self._bqclient, - table_ref=table_ref, + table_id=table_id, + default_project=self._bqclient.project, bq_time=self._clock.get_time(), cache=self._df_snapshot, use_cache=use_cache, @@ -706,18 +703,23 @@ def read_gbq_table( # Optionally, execute the query # ----------------------------- - # max_results introduces non-determinism and limits the cost on - # clustered tables, so fallback to a query. We do this here so that - # the index is consistent with tables that have primary keys, even - # when max_results is set. - if max_results is not None: + if ( + # max_results introduces non-determinism and limits the cost on + # clustered tables, so fallback to a query. We do this here so that + # the index is consistent with tables that have primary keys, even + # when max_results is set. + max_results is not None + # Views such as INFORMATION_SCHEMA can introduce non-determinism. + # They can update frequently and don't support time travel. + or bf_read_gbq_table.is_information_schema(table_id) + ): # TODO(b/338111344): If we are running a query anyway, we might as # well generate ROW_NUMBER() at the same time. 
all_columns: Iterable[str] = ( itertools.chain(index_cols, columns) if columns else () ) query = bf_io_bigquery.to_query( - table_id, + f"{table.project}.{table.dataset_id}.{table.table_id}", columns=all_columns, sql_predicate=bf_io_bigquery.compile_filters(filters) if filters diff --git a/bigframes/session/read_api_execution.py b/bigframes/session/read_api_execution.py index 2530a1dc8d..136c279c08 100644 --- a/bigframes/session/read_api_execution.py +++ b/bigframes/session/read_api_execution.py @@ -46,6 +46,9 @@ def execute( if node.explicitly_ordered and ordered: return None + if not node.source.table.is_physically_stored: + return None + if limit is not None: if peek is None or limit < peek: peek = limit diff --git a/tests/system/small/pandas/test_read_gbq_information_schema.py b/tests/system/small/pandas/test_read_gbq_information_schema.py new file mode 100644 index 0000000000..32e2dc4712 --- /dev/null +++ b/tests/system/small/pandas/test_read_gbq_information_schema.py @@ -0,0 +1,50 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +@pytest.mark.parametrize("include_project", [True, False]) +@pytest.mark.parametrize( + "view_id", + [ + # https://cloud.google.com/bigquery/docs/information-schema-intro + "region-US.INFORMATION_SCHEMA.SESSIONS_BY_USER", + "region-US.INFORMATION_SCHEMA.SCHEMATA", + ], +) +def test_read_gbq_jobs_by_user_returns_schema( + unordered_session, view_id: str, include_project: bool +): + if include_project: + table_id = unordered_session.bqclient.project + "." 
+ view_id + else: + table_id = view_id + + df = unordered_session.read_gbq(table_id, max_results=10) + assert df.dtypes is not None + + +def test_read_gbq_schemata_can_be_peeked(unordered_session): + df = unordered_session.read_gbq("region-US.INFORMATION_SCHEMA.SCHEMATA") + result = df.peek() + assert result is not None + + +def test_read_gbq_schemata_four_parts_can_be_peeked(unordered_session): + df = unordered_session.read_gbq( + f"{unordered_session.bqclient.project}.region-US.INFORMATION_SCHEMA.SCHEMATA" + ) + result = df.peek() + assert result is not None diff --git a/tests/unit/session/test_session.py b/tests/unit/session/test_session.py index d05957b941..f003398706 100644 --- a/tests/unit/session/test_session.py +++ b/tests/unit/session/test_session.py @@ -242,7 +242,7 @@ def test_read_gbq_cached_table(): table._properties["numRows"] = "1000000000" table._properties["location"] = session._location table._properties["type"] = "TABLE" - session._loader._df_snapshot[table_ref] = ( + session._loader._df_snapshot[str(table_ref)] = ( datetime.datetime(1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc), table, ) @@ -273,7 +273,7 @@ def test_read_gbq_cached_table_doesnt_warn_for_anonymous_tables_and_doesnt_inclu table._properties["numRows"] = "1000000000" table._properties["location"] = session._location table._properties["type"] = "TABLE" - session._loader._df_snapshot[table_ref] = ( + session._loader._df_snapshot[str(table_ref)] = ( datetime.datetime(1999, 1, 2, 3, 4, 5, 678901, tzinfo=datetime.timezone.utc), table, ) From db5d8ea04ee3e8a6382ac546764aff0f6880f66b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 20:18:34 +0000 Subject: [PATCH 43/53] Revert: Unwanted code changes --- =3.4.0 | 13 + =3.4.0, | 0 emp | 0 .../dataframes/anywidget_mode.nbconvert.ipynb | 622 +++++++++++++++ notebooks/e2e_RAG_bk.ipynb | 624 +++++++++++++++ notebooks/e2e_RAG_debug.ipynb | 305 ++++++++ notebooks/e2e_RAG_prod.ipynb | 571 ++++++++++++++ notebooks/e2e_RAG_prod_1M.ipynb | 661 ++++++++++++++++ notebooks/e2e_RAG_test.ipynb | 712 ++++++++++++++++++ notebooks/google_sql_notebook.ipynb | 54 ++ .../multimodal/transcribe_partial_mode.ipynb | 153 ++++ notebooks/test.ipynb | 225 ++++++ notebooks/test_blob_trancription.ipynb | 310 ++++++++ notebooks/test_blob_trans_blur_image.ipynb | 200 +++++ notebooks/test_blob_trans_pdf_extract.ipynb | 445 +++++++++++ notebooks/test_blob_transcribe.ipynb | 157 ++++ .../test_blob_transcribe_1M_short_audio.ipynb | 450 +++++++++++ ...st_blob_transcribe_1M_short_audio_v1.ipynb | 345 +++++++++ .../test_blob_transcribe_long_audio.ipynb | 315 ++++++++ .../test_blob_transcribe_long_audio_2p5.ipynb | 271 +++++++ notebooks/test_notebook.ipynb | 58 ++ tests/system/small/test_loc.py | 222 ++++++ tests/unit/ml/test_utils.py | 34 + 23 files changed, 6747 insertions(+) create mode 100644 =3.4.0 create mode 100644 =3.4.0, create mode 100644 emp create mode 100644 notebooks/dataframes/anywidget_mode.nbconvert.ipynb create mode 100644 notebooks/e2e_RAG_bk.ipynb create mode 100644 notebooks/e2e_RAG_debug.ipynb create mode 100644 notebooks/e2e_RAG_prod.ipynb create mode 100644 notebooks/e2e_RAG_prod_1M.ipynb create mode 100644 notebooks/e2e_RAG_test.ipynb create mode 100644 notebooks/google_sql_notebook.ipynb create mode 100644 notebooks/multimodal/transcribe_partial_mode.ipynb create mode 100644 notebooks/test.ipynb create mode 100644 notebooks/test_blob_trancription.ipynb create mode 100644 notebooks/test_blob_trans_blur_image.ipynb create mode 100644 
notebooks/test_blob_trans_pdf_extract.ipynb create mode 100644 notebooks/test_blob_transcribe.ipynb create mode 100644 notebooks/test_blob_transcribe_1M_short_audio.ipynb create mode 100644 notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb create mode 100644 notebooks/test_blob_transcribe_long_audio.ipynb create mode 100644 notebooks/test_blob_transcribe_long_audio_2p5.ipynb create mode 100644 notebooks/test_notebook.ipynb create mode 100644 tests/system/small/test_loc.py create mode 100644 tests/unit/ml/test_utils.py diff --git a/=3.4.0 b/=3.4.0 new file mode 100644 index 0000000000..51e648aef4 --- /dev/null +++ b/=3.4.0 @@ -0,0 +1,13 @@ +Collecting pypdf[crypto] + Downloading pypdf-6.0.0-py3-none-any.whl (310 kB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.5/310.5 kB 11.5 MB/s eta 0:00:00 +Requirement already satisfied: typing_extensions>=4.0 in ./venv/lib/python3.10/site-packages (from pypdf[crypto]) (4.14.1) +Collecting cryptography + Downloading cryptography-45.0.6-cp37-abi3-manylinux_2_34_x86_64.whl (4.4 MB) + ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.4/4.4 MB 78.9 MB/s eta 0:00:00 +Collecting cffi>=1.14 + Using cached cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (446 kB) +Collecting pycparser + Using cached pycparser-2.22-py3-none-any.whl (117 kB) +Installing collected packages: pypdf, pycparser, cffi, cryptography +Successfully installed cffi-1.17.1 cryptography-45.0.6 pycparser-2.22 pypdf-6.0.0 diff --git a/=3.4.0, b/=3.4.0, new file mode 100644 index 0000000000..e69de29bb2 diff --git a/emp b/emp new file mode 100644 index 0000000000..e69de29bb2 diff --git a/notebooks/dataframes/anywidget_mode.nbconvert.ipynb b/notebooks/dataframes/anywidget_mode.nbconvert.ipynb new file mode 100644 index 0000000000..32a4b432a2 --- /dev/null +++ b/notebooks/dataframes/anywidget_mode.nbconvert.ipynb @@ -0,0 +1,622 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d10bfca4", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:47.689257Z", + "iopub.status.busy": "2025-08-19T18:07:47.688863Z", + "iopub.status.idle": "2025-08-19T18:07:47.694257Z", + "shell.execute_reply": "2025-08-19T18:07:47.693398Z" + } + }, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." 
+ ] + }, + { + "cell_type": "markdown", + "id": "acca43ae", + "metadata": {}, + "source": [ + "# Demo to Show Anywidget mode" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ca22f059", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:47.697344Z", + "iopub.status.busy": "2025-08-19T18:07:47.697049Z", + "iopub.status.idle": "2025-08-19T18:07:49.528371Z", + "shell.execute_reply": "2025-08-19T18:07:49.527605Z" + } + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "id": "04406a4d", + "metadata": {}, + "source": [ + "Set the display option to use anywidget" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1bc5aaf3", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:49.531182Z", + "iopub.status.busy": "2025-08-19T18:07:49.530928Z", + "iopub.status.idle": "2025-08-19T18:07:49.535337Z", + "shell.execute_reply": "2025-08-19T18:07:49.534613Z" + } + }, + "outputs": [], + "source": [ + "bpd.options.bigquery.ordering_mode = \"partial\"\n", + "bpd.options.display.repr_mode = \"anywidget\"" + ] + }, + { + "cell_type": "markdown", + "id": "0a354c69", + "metadata": {}, + "source": [ + "Load Sample Data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f289d250", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:49.538687Z", + "iopub.status.busy": "2025-08-19T18:07:49.538398Z", + "iopub.status.idle": "2025-08-19T18:07:53.574536Z", + "shell.execute_reply": "2025-08-19T18:07:53.573718Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computation deferred. Computation will process 171.4 MB\n" + ] + } + ], + "source": [ + "df = bpd.read_gbq(\"bigquery-public-data.usa_names.usa_1910_2013\")\n", + "print(df)" + ] + }, + { + "cell_type": "markdown", + "id": "3a73e472", + "metadata": {}, + "source": [ + "Display Series in anywidget mode" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "42bb02ab", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:53.577575Z", + "iopub.status.busy": "2025-08-19T18:07:53.577219Z", + "iopub.status.idle": "2025-08-19T18:07:53.997894Z", + "shell.execute_reply": "2025-08-19T18:07:53.996854Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computation deferred. Computation will process 44.4 MB\n" + ] + } + ], + "source": [ + "test_series = df[\"year\"]\n", + "print(test_series)" + ] + }, + { + "cell_type": "markdown", + "id": "7bcf1bb7", + "metadata": {}, + "source": [ + "Display with Pagination" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ce250157", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:54.001504Z", + "iopub.status.busy": "2025-08-19T18:07:54.000991Z", + "iopub.status.idle": "2025-08-19T18:07:56.279608Z", + "shell.execute_reply": "2025-08-19T18:07:56.278922Z" + } + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d2ab83b1e9f24674a73a12094be1e831", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [], + "text/plain": [ + "Computation deferred. 
Computation will process 171.4 MB" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "bb15bab6", + "metadata": {}, + "source": [ + "Programmatic Navigation Demo" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6920d49b", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:56.282008Z", + "iopub.status.busy": "2025-08-19T18:07:56.281778Z", + "iopub.status.idle": "2025-08-19T18:07:56.959938Z", + "shell.execute_reply": "2025-08-19T18:07:56.959205Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total pages: 555246\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e7c10bb6833b4f649a26d5f33b00897b", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget()" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from bigframes.display.anywidget import TableWidget\n", + "import math\n", + " \n", + "# Create widget programmatically \n", + "widget = TableWidget(df)\n", + "print(f\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\")\n", + " \n", + "# Display the widget\n", + "widget" + ] + }, + { + "cell_type": "markdown", + "id": "02cbd1be", + "metadata": {}, + "source": [ + "Test Navigation Programmatically" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "12b68f15", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:56.962194Z", + "iopub.status.busy": "2025-08-19T18:07:56.961974Z", + "iopub.status.idle": "2025-08-19T18:07:56.965782Z", + "shell.execute_reply": "2025-08-19T18:07:56.965121Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current page: 0\n", + "After next: 1\n", + "After prev: 0\n" + ] + } + ], + "source": [ + "# Simulate button clicks programmatically\n", + "print(\"Current page:\", widget.page)\n", + "\n", + "# Go to next page\n", + "widget.page = 1\n", + "print(\"After next:\", widget.page)\n", + "\n", + "# Go to previous page\n", + "widget.page = 0\n", + "print(\"After prev:\", widget.page)" + ] + }, + { + "cell_type": "markdown", + "id": "9d310138", + "metadata": {}, + "source": [ + "Edge Case Demonstration" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a9d5d13a", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-19T18:07:56.968276Z", + "iopub.status.busy": "2025-08-19T18:07:56.968023Z", + "iopub.status.idle": "2025-08-19T18:08:12.463471Z", + "shell.execute_reply": "2025-08-19T18:08:12.462652Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Small dataset pages: 1\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a670ee71d58c47babab171f4f229db62", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget()" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Test with very small dataset\n", + "small_df = df.sort_values([\"name\", \"year\", 
\"state\"]).head(5)\n", + "small_widget = TableWidget(small_df)\n", + "print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n", + "small_widget" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4e5836b-c872-4a9c-b9ec-14f6f338176d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "2ac7d45b9bce40f196823982403f3bf3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "45d720d8fd954a529cc657457f681ee1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "548b9bae022d4dc5a38a6a8740276387": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": { + "_model_module": 
"@jupyter-widgets/base", + "_model_module_version": "2.0.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "2.0.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border_bottom": null, + "border_left": null, + "border_right": null, + "border_top": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a670ee71d58c47babab171f4f229db62": { + "model_module": "anywidget", + "model_module_version": "~0.9.*", + "model_name": "AnyModel", + "state": { + "_anywidget_id": "bigframes.display.anywidget.TableWidget", + "_css": "/**\n * Copyright 2025 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n.bigframes-widget {\n\tdisplay: inline-block;\n}\n\n.bigframes-widget .table-container {\n\tmax-height: 620px;\n\toverflow: auto;\n}\n\n.bigframes-widget .footer {\n\talign-items: center;\n\tdisplay: flex;\n\tfont-size: 0.8rem;\n\tpadding-top: 8px;\n}\n\n.bigframes-widget .footer > * {\n\tflex: 1;\n}\n\n.bigframes-widget .pagination {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: center;\n\tpadding: 4px;\n}\n\n.bigframes-widget .page-size {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: end;\n}\n\n.bigframes-widget table {\n\tborder-collapse: collapse;\n\ttext-align: left;\n}\n\n.bigframes-widget th {\n\tbackground-color: var(--colab-primary-surface-color, var(--jp-layout-color0));\n\t/* Uncomment once we support sorting: cursor: pointer; */\n\tposition: sticky;\n\ttop: 0;\n\tz-index: 1;\n}\n\n.bigframes-widget button {\n\tcursor: pointer;\n\tdisplay: inline-block;\n\ttext-align: center;\n\ttext-decoration: none;\n\tuser-select: none;\n\tvertical-align: middle;\n}\n\n.bigframes-widget button:disabled {\n\topacity: 0.65;\n\tpointer-events: none;\n}\n", + "_dom_classes": [], + "_esm": "\nfunction render({ model, el }) {\n\tconsole.log(\"render called\");\n\t// Main container with a unique class for CSS scoping\n\tel.classList.add(\"bigframes-widget\");\n\n\t// State\n\tlet page = 0;\n\tlet pageSize = 10;\n\tlet rowCount = 0;\n\tlet tableHtml = \"\";\n\n\t// Structure\n\tconst tableContainer = document.createElement(\"div\");\n\tconst footer = document.createElement(\"div\");\n\n\t// Footer: Total 
rows label\n\tconst rowCountLabel = document.createElement(\"div\");\n\n\t// Footer: Pagination controls\n\tconst paginationContainer = document.createElement(\"div\");\n\tconst prevPage = document.createElement(\"button\");\n\tconst paginationLabel = document.createElement(\"span\");\n\tconst nextPage = document.createElement(\"button\");\n\n\t// Footer: Page size controls\n\tconst pageSizeContainer = document.createElement(\"div\");\n\tconst pageSizeLabel = document.createElement(\"label\");\n\tconst pageSizeSelect = document.createElement(\"select\");\n\n\t// Add CSS classes\n\ttableContainer.classList.add(\"table-container\");\n\tfooter.classList.add(\"footer\");\n\tpaginationContainer.classList.add(\"pagination\");\n\tpageSizeContainer.classList.add(\"page-size\");\n\n\t// Configure pagination buttons\n\tprevPage.type = \"button\";\n\tnextPage.type = \"button\";\n\tprevPage.textContent = \"Prev\";\n\tnextPage.textContent = \"Next\";\n\n\t// Configure page size selector\n\tpageSizeLabel.textContent = \"Page Size\";\n\tfor (const size of [10, 25, 50, 100]) {\n\t\tconst option = document.createElement(\"option\");\n\t\toption.value = size;\n\t\toption.textContent = size;\n\t\tpageSizeSelect.appendChild(option);\n\t}\n\n\t// Add event listeners\n\tprevPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page - 1 });\n\t});\n\tnextPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page + 1 });\n\t});\n\tpageSizeSelect.addEventListener(\"change\", (e) => {\n\t\tconst newSize = Number(e.target.value);\n\t\tif (newSize) {\n\t\t\tmodel.send({ type: \"page_size_change\", page_size: newSize });\n\t\t}\n\t});\n\n\tfunction updateUI() {\n\t\tconst totalPages = Math.ceil(rowCount / pageSize);\n\t\trowCountLabel.textContent = `${rowCount.toLocaleString()} total rows`;\n\t\tpaginationLabel.textContent = `Page ${page + 1} of ${totalPages || 1}`;\n\t\tprevPage.disabled = page === 0;\n\t\tnextPage.disabled = page >= totalPages - 1;\n\t\tpageSizeSelect.value = pageSize;\n\t\ttableContainer.innerHTML = tableHtml;\n\t}\n\n\tmodel.onMsg((msg) => {\n\t\tconsole.log(\"message received\", msg);\n\t\tif (msg.type === \"update\") {\n\t\t\tpage = msg.page;\n\t\t\tpageSize = msg.page_size;\n\t\t\trowCount = msg.row_count;\n\t\t\ttableHtml = msg.table_html;\n\t\t\tupdateUI();\n\t\t}\n\t});\n\n\t// Assemble the DOM\n\tpaginationContainer.appendChild(prevPage);\n\tpaginationContainer.appendChild(paginationLabel);\n\tpaginationContainer.appendChild(nextPage);\n\n\tpageSizeContainer.appendChild(pageSizeLabel);\n\tpageSizeContainer.appendChild(pageSizeSelect);\n\n\tfooter.appendChild(rowCountLabel);\n\tfooter.appendChild(paginationContainer);\n\tfooter.appendChild(pageSizeContainer);\n\n\tel.appendChild(tableContainer);\n\tel.appendChild(footer);\n\n\t// Initial UI state\n\tupdateUI();\n}\n\nexport default { render };\n", + "_model_module": "anywidget", + "_model_module_version": "~0.9.*", + "_model_name": "AnyModel", + "_view_count": null, + "_view_module": "anywidget", + "_view_module_version": "~0.9.*", + "_view_name": "AnyView", + "layout": "IPY_MODEL_548b9bae022d4dc5a38a6a8740276387", + "tabbable": null, + "tooltip": null + } + }, + "d2ab83b1e9f24674a73a12094be1e831": { + "model_module": "anywidget", + "model_module_version": "~0.9.*", + "model_name": "AnyModel", + "state": { + "_anywidget_id": "bigframes.display.anywidget.TableWidget", + "_css": "/**\n * Copyright 2025 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the 
\"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n.bigframes-widget {\n\tdisplay: inline-block;\n}\n\n.bigframes-widget .table-container {\n\tmax-height: 620px;\n\toverflow: auto;\n}\n\n.bigframes-widget .footer {\n\talign-items: center;\n\tdisplay: flex;\n\tfont-size: 0.8rem;\n\tpadding-top: 8px;\n}\n\n.bigframes-widget .footer > * {\n\tflex: 1;\n}\n\n.bigframes-widget .pagination {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: center;\n\tpadding: 4px;\n}\n\n.bigframes-widget .page-size {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: end;\n}\n\n.bigframes-widget table {\n\tborder-collapse: collapse;\n\ttext-align: left;\n}\n\n.bigframes-widget th {\n\tbackground-color: var(--colab-primary-surface-color, var(--jp-layout-color0));\n\t/* Uncomment once we support sorting: cursor: pointer; */\n\tposition: sticky;\n\ttop: 0;\n\tz-index: 1;\n}\n\n.bigframes-widget button {\n\tcursor: pointer;\n\tdisplay: inline-block;\n\ttext-align: center;\n\ttext-decoration: none;\n\tuser-select: none;\n\tvertical-align: middle;\n}\n\n.bigframes-widget button:disabled {\n\topacity: 0.65;\n\tpointer-events: none;\n}\n", + "_dom_classes": [], + "_esm": "\nfunction render({ model, el }) {\n\tconsole.log(\"render called\");\n\t// Main container with a unique class for CSS scoping\n\tel.classList.add(\"bigframes-widget\");\n\n\t// State\n\tlet page = 0;\n\tlet pageSize = 10;\n\tlet rowCount = 0;\n\tlet tableHtml = \"\";\n\n\t// Structure\n\tconst tableContainer = document.createElement(\"div\");\n\tconst footer = document.createElement(\"div\");\n\n\t// Footer: Total rows label\n\tconst rowCountLabel = document.createElement(\"div\");\n\n\t// Footer: Pagination controls\n\tconst paginationContainer = document.createElement(\"div\");\n\tconst prevPage = document.createElement(\"button\");\n\tconst paginationLabel = document.createElement(\"span\");\n\tconst nextPage = document.createElement(\"button\");\n\n\t// Footer: Page size controls\n\tconst pageSizeContainer = document.createElement(\"div\");\n\tconst pageSizeLabel = document.createElement(\"label\");\n\tconst pageSizeSelect = document.createElement(\"select\");\n\n\t// Add CSS classes\n\ttableContainer.classList.add(\"table-container\");\n\tfooter.classList.add(\"footer\");\n\tpaginationContainer.classList.add(\"pagination\");\n\tpageSizeContainer.classList.add(\"page-size\");\n\n\t// Configure pagination buttons\n\tprevPage.type = \"button\";\n\tnextPage.type = \"button\";\n\tprevPage.textContent = \"Prev\";\n\tnextPage.textContent = \"Next\";\n\n\t// Configure page size selector\n\tpageSizeLabel.textContent = \"Page Size\";\n\tfor (const size of [10, 25, 50, 100]) {\n\t\tconst option = document.createElement(\"option\");\n\t\toption.value = size;\n\t\toption.textContent = size;\n\t\tpageSizeSelect.appendChild(option);\n\t}\n\n\t// Add event listeners\n\tprevPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page - 1 
});\n\t});\n\tnextPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page + 1 });\n\t});\n\tpageSizeSelect.addEventListener(\"change\", (e) => {\n\t\tconst newSize = Number(e.target.value);\n\t\tif (newSize) {\n\t\t\tmodel.send({ type: \"page_size_change\", page_size: newSize });\n\t\t}\n\t});\n\n\tfunction updateUI() {\n\t\tconst totalPages = Math.ceil(rowCount / pageSize);\n\t\trowCountLabel.textContent = `${rowCount.toLocaleString()} total rows`;\n\t\tpaginationLabel.textContent = `Page ${page + 1} of ${totalPages || 1}`;\n\t\tprevPage.disabled = page === 0;\n\t\tnextPage.disabled = page >= totalPages - 1;\n\t\tpageSizeSelect.value = pageSize;\n\t\ttableContainer.innerHTML = tableHtml;\n\t}\n\n\tmodel.onMsg((msg) => {\n\t\tconsole.log(\"message received\", msg);\n\t\tif (msg.type === \"update\") {\n\t\t\tpage = msg.page;\n\t\t\tpageSize = msg.page_size;\n\t\t\trowCount = msg.row_count;\n\t\t\ttableHtml = msg.table_html;\n\t\t\tupdateUI();\n\t\t}\n\t});\n\n\t// Assemble the DOM\n\tpaginationContainer.appendChild(prevPage);\n\tpaginationContainer.appendChild(paginationLabel);\n\tpaginationContainer.appendChild(nextPage);\n\n\tpageSizeContainer.appendChild(pageSizeLabel);\n\tpageSizeContainer.appendChild(pageSizeSelect);\n\n\tfooter.appendChild(rowCountLabel);\n\tfooter.appendChild(paginationContainer);\n\tfooter.appendChild(pageSizeContainer);\n\n\tel.appendChild(tableContainer);\n\tel.appendChild(footer);\n\n\t// Initial UI state\n\tupdateUI();\n}\n\nexport default { render };\n", + "_model_module": "anywidget", + "_model_module_version": "~0.9.*", + "_model_name": "AnyModel", + "_view_count": null, + "_view_module": "anywidget", + "_view_module_version": "~0.9.*", + "_view_name": "AnyView", + "layout": "IPY_MODEL_45d720d8fd954a529cc657457f681ee1", + "tabbable": null, + "tooltip": null + } + }, + "e7c10bb6833b4f649a26d5f33b00897b": { + "model_module": "anywidget", + "model_module_version": "~0.9.*", + "model_name": "AnyModel", + "state": { + "_anywidget_id": "bigframes.display.anywidget.TableWidget", + "_css": "/**\n * Copyright 2025 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n.bigframes-widget {\n\tdisplay: inline-block;\n}\n\n.bigframes-widget .table-container {\n\tmax-height: 620px;\n\toverflow: auto;\n}\n\n.bigframes-widget .footer {\n\talign-items: center;\n\tdisplay: flex;\n\tfont-size: 0.8rem;\n\tpadding-top: 8px;\n}\n\n.bigframes-widget .footer > * {\n\tflex: 1;\n}\n\n.bigframes-widget .pagination {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: center;\n\tpadding: 4px;\n}\n\n.bigframes-widget .page-size {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: end;\n}\n\n.bigframes-widget table {\n\tborder-collapse: collapse;\n\ttext-align: left;\n}\n\n.bigframes-widget th {\n\tbackground-color: var(--colab-primary-surface-color, var(--jp-layout-color0));\n\t/* Uncomment once we support sorting: cursor: pointer; 
*/\n\tposition: sticky;\n\ttop: 0;\n\tz-index: 1;\n}\n\n.bigframes-widget button {\n\tcursor: pointer;\n\tdisplay: inline-block;\n\ttext-align: center;\n\ttext-decoration: none;\n\tuser-select: none;\n\tvertical-align: middle;\n}\n\n.bigframes-widget button:disabled {\n\topacity: 0.65;\n\tpointer-events: none;\n}\n", + "_dom_classes": [], + "_esm": "\nfunction render({ model, el }) {\n\tconsole.log(\"render called\");\n\t// Main container with a unique class for CSS scoping\n\tel.classList.add(\"bigframes-widget\");\n\n\t// State\n\tlet page = 0;\n\tlet pageSize = 10;\n\tlet rowCount = 0;\n\tlet tableHtml = \"\";\n\n\t// Structure\n\tconst tableContainer = document.createElement(\"div\");\n\tconst footer = document.createElement(\"div\");\n\n\t// Footer: Total rows label\n\tconst rowCountLabel = document.createElement(\"div\");\n\n\t// Footer: Pagination controls\n\tconst paginationContainer = document.createElement(\"div\");\n\tconst prevPage = document.createElement(\"button\");\n\tconst paginationLabel = document.createElement(\"span\");\n\tconst nextPage = document.createElement(\"button\");\n\n\t// Footer: Page size controls\n\tconst pageSizeContainer = document.createElement(\"div\");\n\tconst pageSizeLabel = document.createElement(\"label\");\n\tconst pageSizeSelect = document.createElement(\"select\");\n\n\t// Add CSS classes\n\ttableContainer.classList.add(\"table-container\");\n\tfooter.classList.add(\"footer\");\n\tpaginationContainer.classList.add(\"pagination\");\n\tpageSizeContainer.classList.add(\"page-size\");\n\n\t// Configure pagination buttons\n\tprevPage.type = \"button\";\n\tnextPage.type = \"button\";\n\tprevPage.textContent = \"Prev\";\n\tnextPage.textContent = \"Next\";\n\n\t// Configure page size selector\n\tpageSizeLabel.textContent = \"Page Size\";\n\tfor (const size of [10, 25, 50, 100]) {\n\t\tconst option = document.createElement(\"option\");\n\t\toption.value = size;\n\t\toption.textContent = size;\n\t\tpageSizeSelect.appendChild(option);\n\t}\n\n\t// Add event listeners\n\tprevPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page - 1 });\n\t});\n\tnextPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page + 1 });\n\t});\n\tpageSizeSelect.addEventListener(\"change\", (e) => {\n\t\tconst newSize = Number(e.target.value);\n\t\tif (newSize) {\n\t\t\tmodel.send({ type: \"page_size_change\", page_size: newSize });\n\t\t}\n\t});\n\n\tfunction updateUI() {\n\t\tconst totalPages = Math.ceil(rowCount / pageSize);\n\t\trowCountLabel.textContent = `${rowCount.toLocaleString()} total rows`;\n\t\tpaginationLabel.textContent = `Page ${page + 1} of ${totalPages || 1}`;\n\t\tprevPage.disabled = page === 0;\n\t\tnextPage.disabled = page >= totalPages - 1;\n\t\tpageSizeSelect.value = pageSize;\n\t\ttableContainer.innerHTML = tableHtml;\n\t}\n\n\tmodel.onMsg((msg) => {\n\t\tconsole.log(\"message received\", msg);\n\t\tif (msg.type === \"update\") {\n\t\t\tpage = msg.page;\n\t\t\tpageSize = msg.page_size;\n\t\t\trowCount = msg.row_count;\n\t\t\ttableHtml = msg.table_html;\n\t\t\tupdateUI();\n\t\t}\n\t});\n\n\t// Assemble the 
DOM\n\tpaginationContainer.appendChild(prevPage);\n\tpaginationContainer.appendChild(paginationLabel);\n\tpaginationContainer.appendChild(nextPage);\n\n\tpageSizeContainer.appendChild(pageSizeLabel);\n\tpageSizeContainer.appendChild(pageSizeSelect);\n\n\tfooter.appendChild(rowCountLabel);\n\tfooter.appendChild(paginationContainer);\n\tfooter.appendChild(pageSizeContainer);\n\n\tel.appendChild(tableContainer);\n\tel.appendChild(footer);\n\n\t// Initial UI state\n\tupdateUI();\n}\n\nexport default { render };\n",
+ "_model_module": "anywidget",
+ "_model_module_version": "~0.9.*",
+ "_model_name": "AnyModel",
+ "_view_count": null,
+ "_view_module": "anywidget",
+ "_view_module_version": "~0.9.*",
+ "_view_name": "AnyView",
+ "layout": "IPY_MODEL_2ac7d45b9bce40f196823982403f3bf3",
+ "tabbable": null,
+ "tooltip": null
+ }
+ }
+ },
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/e2e_RAG_bk.ipynb b/notebooks/e2e_RAG_bk.ipynb
new file mode 100644
index 0000000000..9b2f23a483
--- /dev/null
+++ b/notebooks/e2e_RAG_bk.ipynb
@@ -0,0 +1,624 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright 2025 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Preparation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, import the BigFrames modules."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import bigframes\n",
+ "import bigframes.pandas as bpd"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Make sure the BigFrames version is at least `1.36.0`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from packaging.version import Version\n",
+ "\n",
+ "assert Version(bigframes.__version__) >= Version(\"1.36.0\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Connect to the test environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: \u001b[93mBigFrames Blob is still under experiments. 
It may not work and subject\n", + "to change in the future.\u001b[0m\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/bigquery_options.py:364: UserWarning: \u001b[93mThis is an advanced configuration option for directly setting\n", + "endpoints. Incorrect use may lead to unexpected behavior or system\n", + "instability. Proceed only if you fully understand its implications.\u001b[0m\n", + " warnings.warn(msg)\n" + ] + } + ], + "source": [ + "bigframes.options.experiments.blob = True\n", + "bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", + " \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", + " \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PDF chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval of PDF URLs" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: \u001b[93mNo explicit location is set, so using location US for the session.\u001b[0m\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job b261812f-a98e-4453-9a23-7f5c8ab7811b is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 370a4f1e-3b2c-405d-86f6-92167f03d464 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\")\n", + "chunks_df.columns = [\"uri\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#chunks_df = chunks_df.head(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# copy twice for testing\n", + "#copies = [chunks_df] * 10000\n", + "#chunks_df = bpd.concat(copies, ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Text extraction, and chunking" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 3f5faa00-8366-4bfc-87f5-1dbe18e355fb is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4117: PreviewWarning: \u001b[93maxis=1 scenario is in preview.\u001b[0m\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "chunks_df[\"chunk_text\"] = chunks_df[\"uri\"].blob.pdf_chunk(connection=bq_connection, chunk_size=2000, overlap_size=200)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explode column for future processing." 
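+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before exploding, it can help to sanity-check the chunker output. The next cell is a minimal sketch: it assumes `chunk_text` comes back as a list-typed column and that this BigFrames version exposes the `Series.list` accessor."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Minimal sketch (assumes chunk_text is a list column and that the\n",
+ "# Series.list accessor is available): number of chunks per PDF.\n",
+ "chunks_df[\"chunk_text\"].list.len()"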
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ee1ac492-65b8-4c1a-aac2-9abfadea3251 is DONE. 7.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0bec4448-73ac-4478-88c2-eef74552fb3b is DONE. 8.6 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c3a4fd8f-d42f-4d54-93ae-3b6e40f0e9ad is DONE. 8.6 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 Hydra: Bidirectional State Space Models\n", + "Throug...\n", + "0 multiple domains, including language and visio...\n", + "0 Mixing\n", + "��ℳ��&\n", + "Figure 1: (Left) A schematic of ...\n", + "0 parameterizations underpin efficient sequence ...\n", + "0 more\n", + "coherent and theoretically grounded advan...\n", + "0 important characteristics of downstream sequen...\n", + "0 preprocessing function and the matrix construc...\n", + "0 Sequence Aligned Matrices (SAM) to\n", + "systematica...\n", + "0 Toeplitz matrix mixer; GSS [26] adds a data-de...\n", + "0 (FNet is a structured matrix mixer without seq...\n", + "0 each generated fromQand K. Specifically, each\n", + "...\n", + "0 ��$\"��$:&×\"��&\"��$\"��$:'×\"��'\"\n", + "��&\"��&:!×\"��!\"...\n", + "0 represented within the matrix mixer framework,...\n", + "0 defined\n", + "as follows: a matrixM is N-quasisepara...\n", + "0 This generosity in the rank-based definition s...\n", + "0 consequence of the favorable mathematical prop...\n", + "0 84.1 88.2 69.1 91.0 85.9 47.6 83.9 78.4\n", + "Attent...\n", + "0 analyzing the matrix mixer framework through e...\n", + "0 rigorous and focused comparison between differ...\n", + "0 Appendix D.1.\n", + "Results. The results presented i...\n", + "0 BERT – trained with the latest HuggingFace\n", + "rec...\n", + "0 from 0.3 to 0.5 as stronger\n", + "regularization. We...\n", + "0 Michael Poli, James Zou, Atri\n", + "Rudra, and Chris...\n", + "0 of\n", + "deep bidirectional transformers for languag...\n", + "0 vision and pattern recognition.\n", + "2016, pp. 770–...\n", + "Name: chunk_text, dtype: string" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chunk_df_exploded.cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generation of embeddings within BigFrames." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job b31a3f5e-943a-4c67-9c62-b8b058cced45 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from bigframes.ml import llm\n", + "\n", + "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job e1aacf23-0af8-4a45-ba15-e3cfae45370b is DONE. 8.6 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mInterpreting JSON column(s) as the `db_dtypes.dbjson` extension type\n", + "is in preview; this behavior may change in future versions.\u001b[0m\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job d4f524a3-4005-401c-978d-76e6e6a9b496 is DONE. 8.5 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5e88b6d3-f68c-4f31-84d9-6b0cbd6c5d47 is DONE. 8.5 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# generate embeddings\n", + "embeddings_df = text_embedding_model.predict(chunk_df_exploded)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Embedding table in Bigquery if not exist." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "test_project_id = \"bigframes-dev\"\n", + "test_dataset_id = \"shuowei_test_us\"\n", + "test_table_id = \"pdf_chunk_embedding_v10\"\n", + "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save embedding into a BigQuery table for downstream processing.." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 29727e93-2f99-4331-95c9-a6ab78fd06b6 is DONE. 34.8 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'bigframes-dev.shuowei_test_us.pdf_chunk_embedding_v10'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists=\"replace\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create vector search index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construction of an index over these embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'embedding_table_id_v11' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[15], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mbigframes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbigquery\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mbbq\u001b[39;00m\n\u001b[1;32m 2\u001b[0m bbq\u001b[38;5;241m.\u001b[39mcreate_vector_index(\n\u001b[0;32m----> 3\u001b[0m table_id\u001b[38;5;241m=\u001b[39m\u001b[43membedding_table_id_v11\u001b[49m,\n\u001b[1;32m 4\u001b[0m column_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mml_generate_embedding_result\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 5\u001b[0m distance_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcosine\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m index_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mivf\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m ivf_options\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_lists\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m100\u001b[39m},\n\u001b[1;32m 8\u001b[0m )\n", + "\u001b[0;31mNameError\u001b[0m: name 'embedding_table_id_v11' is not defined" + ] + } + ], + "source": [ + "import bigframes.bigquery as bbq\n", + "bbq.create_vector_index(\n", + " table_id=embedding_table_id_v11,\n", + " column_name=\"ml_generate_embedding_result\",\n", + " distance_type=\"cosine\",\n", + " index_type=\"ivf\",\n", + " ivf_options={\"num_lists\": 100},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Search with pointers to the original pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of semantic search, with results linked back to the original PDFs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# temp test code, reada from gbq\n", + "embeddings_df = bpd.read_gbq(embedding_table_id)\n", + "embedding_table_id_v11 = \"bigframes-dev.shuowei_test_us.pdf_chunk_embedding_v11\"\n", + "# copy twice for testing\n", + "copies = [embeddings_df] * 5\n", + "embeddings_df= bpd.concat(copies, ignore_index=True)\n", + "type(embeddings_df)\n", + "embeddings_df.to_gbq(destination_table=embedding_table_id_v11, if_exists=\"replace\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"bigframes.options.experiments.semantic_operators = True\n", + "\n", + "embeddings_df.semantics.search(\n", + " \"ml_generate_embedding_result\", \n", + " \"reinforce\", \n", + " top_k=3, \n", + " model=text_embedding_model, \n", + " score_column=\"distance\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generate embedding for the word for searching\n", + "searched_words = [\"reinforce\"]\n", + "searched_words_embeddings = text_embedding_model.predict(searched_words)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "search_query = bpd.DataFrame({\"query_id\": [\"dog\", \"cat\"], embedding=})\n", + "result_df = bbq.vector_search(\n", + " base_table=\n", + " column_to_search=\n", + " query=search_query,\n", + " distance_type=\"cosine\",\n", + " top_k=5,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/e2e_RAG_debug.ipynb b/notebooks/e2e_RAG_debug.ipynb new file mode 100644 index 0000000000..c112e5149f --- /dev/null +++ b/notebooks/e2e_RAG_debug.ipynb @@ -0,0 +1,305 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, import the BigFrames modules." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure the BigFrames version is at least `1.38.0`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from packaging.version import Version\n", + "\n", + "assert Version(bigframes.__version__) >= Version(\"1.38.0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:68: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject\n", + "to change in the future.\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bigframes.options.experiments.blob = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PDF chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval of PDF URLs, text extraction, and chunking." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job a777cf43-fe93-49f6-8a48-2db57a248a85 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 63d6cafb-56e2-435a-b00a-b3b35badcab0 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/failed_pdf/*\", name=\"pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/functions/_function_session.py:804: PreviewWarning: udf is in preview.\n", + " warnings.warn(\"udf is in preview.\", category=bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job e39925b5-632d-4d8d-8282-141dfad0463f is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "NotImplementedError", + "evalue": "Cannot mix Series with other types. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. 
You are currently running BigFrames version 2.0.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m bq_connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigframes-dev.us.bigframes-default-connection\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m chunks_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunk_text\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mchunks_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblob\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpdf_chunk\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# connection=bq_connection, verbose=True)#, chunk_size=2000, overlap_size=200,\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m#max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# notes: use connection is not necessary, we can use default connection.\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# However, in current stage, using a specfic conneciton will grant more quota\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:180\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 172\u001b[0m submit_pandas_labels(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_block\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 174\u001b[0m class_name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 178\u001b[0m task\u001b[38;5;241m=\u001b[39mPANDAS_PARAM_TRACKING_TASK,\n\u001b[1;32m 179\u001b[0m )\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 182\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mpop()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:164\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 161\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m 
(\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 166\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# self._block.expr.session.bqclient. Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/operations/blob.py:735\u001b[0m, in \u001b[0;36mBlobAccessor.pdf_chunk\u001b[0;34m(self, connection, chunk_size, overlap_size, max_batching_rows, container_cpu, container_memory, verbose)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverlap_size must be smaller than chunk_size.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 716\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;124;03mpdf_chunk_udf = blob_func.TransformFunction(\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;124;03m blob_func.pdf_chunk_def,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[38;5;124;03m#df[\"res\"] = df[\"pdf\"].apply(blob_func.pdf_chunk_func)\u001b[39;00m\n\u001b[1;32m 734\u001b[0m \u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[0;32m--> 735\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mbpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 736\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpdf_url_json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_runtime_json_str\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mR\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 737\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mchunk_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 738\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moverlap_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43moverlap_size\u001b[49m\n\u001b[1;32m 739\u001b[0m \u001b[43m\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21m_call_pdf_chunk_udf\u001b[39m(row):\n\u001b[1;32m 742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_block\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mcall_udf(\n\u001b[1;32m 743\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mshuowei_bb1_us.pdf_chunk_def\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# Use the full UDF name\u001b[39;00m\n\u001b[1;32m 744\u001b[0m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpdf_url_json\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 750\u001b[0m container_memory\u001b[38;5;241m=\u001b[39mcontainer_memory,\n\u001b[1;32m 751\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:180\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 172\u001b[0m submit_pandas_labels(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_block\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 174\u001b[0m class_name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 178\u001b[0m task\u001b[38;5;241m=\u001b[39mPANDAS_PARAM_TRACKING_TASK,\n\u001b[1;32m 179\u001b[0m )\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 182\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mpop()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:164\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 161\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 166\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# self._block.expr.session.bqclient. 
Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/dataframe.py:157\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[0;34m(self, data, index, columns, dtype, copy, session)\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m (\n\u001b[1;32m 147\u001b[0m utils\u001b[38;5;241m.\u001b[39mis_dict_like(data)\n\u001b[1;32m 148\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 151\u001b[0m )\n\u001b[1;32m 152\u001b[0m ):\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28misinstance\u001b[39m(data[key], bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries) \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mkeys()\n\u001b[1;32m 155\u001b[0m ):\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# TODO(tbergeron): Support local list/series data by converting to memtable.\u001b[39;00m\n\u001b[0;32m--> 157\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot mix Series with other types. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 159\u001b[0m )\n\u001b[1;32m 160\u001b[0m keys \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(data\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[1;32m 161\u001b[0m first_label, first_series \u001b[38;5;241m=\u001b[39m keys[\u001b[38;5;241m0\u001b[39m], data[keys[\u001b[38;5;241m0\u001b[39m]]\n", + "\u001b[0;31mNotImplementedError\u001b[0m: Cannot mix Series with other types. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.0.0." + ] + } + ], + "source": [ + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk()\n", + "# connection=bq_connection, verbose=True)#, chunk_size=2000, overlap_size=200,\n", + " #max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\n", + "# notes: use connection is not necessary, we can use default connection.\n", + "# However, in current stage, using a specfic conneciton will grant more quota" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explode column for future processing." 
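+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a debugging aid once the `pdf_chunk` call above succeeds, the hedged sketch below counts rows that produced no chunks; it assumes `chunk_text` comes back as a nullable list column."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hedged sketch (assumes the pdf_chunk call above succeeded and\n",
+ "# chunk_text is a nullable list column): rows with no chunks.\n",
+ "chunks_df[\"chunk_text\"].isna().sum()"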
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chunks_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "chunk_df_exploded"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Generation of embeddings within BigFrames."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create the embedding table in BigQuery if it does not exist."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Save the embeddings into a BigQuery table for downstream processing."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Create vector search index"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Construction of an index over these embeddings."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Search with pointers to the original PDF"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Execution of vector search, with results linked back to the original PDFs."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.15"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/e2e_RAG_prod.ipynb b/notebooks/e2e_RAG_prod.ipynb
new file mode 100644
index 0000000000..aa00a9013d
--- /dev/null
+++ b/notebooks/e2e_RAG_prod.ipynb
@@ -0,0 +1,571 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Copyright 2025 Google LLC\n",
+ "#\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Preparation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "First, import the BigFrames modules."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure the BigFrames version is at least `1.38.0`" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from packaging.version import Version\n", + "\n", + "assert Version(bigframes.__version__) >= Version(\"1.38.0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set blob to true for testing" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", + " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" + ] + } + ], + "source": [ + "bigframes.options.experiments.blob = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PDF chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval of PDF URLs, text extraction, and chunking." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-ka1m0ue4fptfmt9siejdd5lom7p39upa.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fpydata-google-auth.readthedocs.io%2Fen%2Flatest%2Foauth.html&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=GE3CiB2iPQ32Mbcgug2H68pdMulb7j&prompt=consent&access_type=offline\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Project must be set to initialize BigQuery client. 
Try setting `bigframes.options.bigquery.project` first.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m chunks_df \u001b[38;5;241m=\u001b[39m \u001b[43mbpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_glob_path\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgs://garrettwu_bucket/pdfs/*\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m chunks_df\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muri\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 3\u001b[0m bq_connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigframes-dev.us.bigframes-default-connection\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/pandas/io/api.py:606\u001b[0m, in \u001b[0;36mfrom_glob_path\u001b[0;34m(path, connection, name)\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfrom_glob_path\u001b[39m(\n\u001b[1;32m 604\u001b[0m path: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;241m*\u001b[39m, connection: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, name: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 605\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mdataframe\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m--> 606\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mglobal_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_default_session\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 607\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mbigframes.session.Session.from_glob_path\u001b[49m\u001b[43m(...)\u001b[49m\n", + "(intermediate traceback frames condensed: with_default_session -> get_global_session -> bigframes.session.connect -> Session.__init__ -> ClientsProvider.__init__)", + "\u001b[0;31mValueError\u001b[0m: Project must be set to initialize BigQuery client. Try setting `bigframes.options.bigquery.project` first."
+ ] + } + ], + "source": [ + "chunks_df = bpd.from_glob_path(\"gs://garrettwu_bucket/pdfs/*\")\n", + "chunks_df.columns = [\"uri\"]\n", + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "chunks_df[\"chunk_text\"] = chunks_df[\"uri\"].blob.pdf_chunk(\n", + " connection=bq_connection, chunk_size=2000, overlap_size=200,\n", + " max_batching_rows=1\n", + ")\n", + "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()\n", + "chunk_df_exploded.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 4b6facd8-54f1-4a58-a2cb-db230bfc1388 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9c194a18-b5ff-425f-9312-4dce3e21f4bf is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\", name=\"pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# copy files to generate more inputs, now we have 1000 PDF files\n", + "#copies = [chunks_df] * 20\n", + "#chunks_df = bpd.concat(copies, ignore_index=True)\n", + "#chunks_df = chunks_df.cache()\n", + "chunks_df = chunks_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# copy files to generate more inputs, now we have 10,000 PDF files\n", + "copies = [chunks_df] * 100\n", + "chunks_df = bpd.concat(copies, ignore_index=True)\n", + "chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# copy files again, now we have 100,000 PDF files\n", + "copies = [chunks_df] * 10\n", + "chunks_df = bpd.concat(copies, ignore_index=True)\n", + "chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 58e48da6-87fb-4d17-97e0-d3d7e02ee58f is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4162: PreviewWarning: axis=1 scenario is in preview.\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk(\n", + " connection=bq_connection)\n", + "# note: specifying a connection is not necessary; the default connection can be used.\n", + "# However, at the current stage, using a specific connection will grant more quota" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explode column for future processing."
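, + "\n", + "`pdf_chunk` returns an array of text chunks per document, so the next cell calls `explode()` to flatten that array into one row per chunk before generating embeddings."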
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save to a temporary table" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job c0c05fe3-c1cb-4d59-a1a1-c2a3d8582c94 is DONE. 49.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunk_df_exploded = chunk_df_exploded.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 2145211c-d60f-45d2-acf8-c19c9176298f is DONE. 457.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 77319c66-4155-48c2-af63-cf5a558e3cf5 is DONE. 455.2 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 Integrating Reinforcement Learning, Action Mod...\n", + "0 Benyamin)\n", + "Preprint submitted to Artificial Int...\n", + "0 classical, discrete, environments.\n", + "Therefore, ...\n", + "0 setting we consider in this work isoffline lea...\n", + "0 more complex\n", + "problems that required longer-ter...\n", + "0 domain models for planning, and RL. We also pr...\n", + "0 means that a planning domain defines parameter...\n", + "0 which actions to perform in\n", + "order to collect n...\n", + "0 these\n", + "assumptions, NSAM is guaranteed to retur...\n", + "0 policy.\n", + "Off-policy algorithms are algorithms t...\n", + "0 the\n", + "environment, mining resources, collecting ...\n", + "0 must:\n", + "1. Harvest at least one wood block from ...\n", + "0 irreversible and the amount of resources in a ...\n", + "0 created by observing an expert solve different...\n", + "0 Moreover, most actions are TP TO actions, whic...\n", + "0 our RL models. Moreover, our gym environment i...\n", + "0 within that time limit,\n", + "we consider the run as...\n", + "0 length.\n", + "4https://imitation.readthedocs.io\n", + "5htt...\n", + "0 planning lies in its capacity to generalize ac...\n", + "0 for the simpler\n", + "Craft Wooden Sword, BC is actu...\n", + "0 the number of episodes\n", + "in which the agent succ...\n", + "0 policy using higher-quality examples. Figure 1...\n", + "0 search\n", + "processes may require higher computatio...\n", + "0 right, (1, 1), move up, (1, 2), move right, (2...\n", + "0 methodological tool to solve problems when pla...\n", + "Name: chunk_text, dtype: string" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chunk_df_exploded" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generation of embeddings within BigFrames." 
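, + "\n", + "`TextEmbeddingGenerator.predict()` is backed by BigQuery ML's `ML.GENERATE_EMBEDDING`; the returned DataFrame carries the vector in the `ml_generate_embedding_result` array column (the column the vector index below is built on) plus an `ml_generate_embedding_status` column that records per-row failures."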
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from bigframes.ml import llm\n", + "\n", + "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")\n", + "embeddings_df = text_embedding_model.predict(chunk_df_exploded)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the embedding table in BigQuery if it does not exist." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "test_project_id = \"bigframes-dev\"\n", + "test_dataset_id = \"shuowei_test_us\"\n", + "test_table_id = \"pdf_chunk_embedding\"\n", + "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save embedding into a BigQuery table for downstream processing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists=\"replace\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create vector search index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construction of an index over these embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.bigquery as bbq\n", + "bbq.create_vector_index(\n", + " table_id=embedding_table_id,\n", + " column_name=\"ml_generate_embedding_result\",\n", + " distance_type=\"cosine\",\n", + " index_type=\"ivf\",\n", + " ivf_options={\"num_lists\": 100},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Search with pointers to the original pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of vector search, with results linked back to the original PDFs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# generate the embedding of the words for search\n", + "searched_words = [\"reinforce\"]\n", + "searched_words_embeddings = text_embedding_model.predict(searched_words)\n", + "embedding_result_column = \"ml_generate_embedding_result\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# perform vector search\n", + "search_result = (\n", + " bbq.vector_search(\n", + " base_table=embedding_table_id,\n", + " column_to_search=embedding_result_column,\n", + " query=searched_words_embeddings,\n", + " query_column_to_search=embedding_result_column,\n", + " top_k=3,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search_result" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/e2e_RAG_prod_1M.ipynb b/notebooks/e2e_RAG_prod_1M.ipynb new file mode 100644 index 0000000000..5154aa804c --- /dev/null +++ b/notebooks/e2e_RAG_prod_1M.ipynb @@ -0,0 +1,661 @@ +{ + "cells": [ + { + "cell_type": "code", + 
"execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, import the BigFrames modules." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Make sure the BigFrames version is at least `1.38.0`" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from packaging.version import Version\n", + "\n", + "assert Version(bigframes.__version__) >= Version(\"1.38.0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set blob to true for testing" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: \u001b[93mBigFrames Blob is still under experiments. It may not work and subject\n", + "to change in the future.\u001b[0m\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bigframes.options.experiments.blob = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PDF chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retrieval of PDF URLs, text extraction, and chunking." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: \u001b[93mNo explicit location is set, so using location US for the session.\u001b[0m\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job b4323a3d-e7f7-41b6-a122-59e5c1a5e6ba is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5455fdcd-3102-451d-bead-20356689285f is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\", name=\"pdf\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# copy files to genearte more inputs, now we have 1000 PDF files\n", + "#copies = [chunks_df] * 20\n", + "#chunks_df = bpd.concat(copies, ignore_index=True)\n", + "#chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0ab3a77c-24ea-4bac-8cd4-a29d6b489151 is DONE. 1.8 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files to genearte more inputs, now we have 10,000 PDF files\n", + "copies = [chunks_df] * 100\n", + "chunks_df = bpd.concat(copies, ignore_index=True)\n", + "chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 23bfff29-3e86-4eaf-a8e3-c6323e5e41de is DONE. 1.6 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files again, now we have 1,000,000 PDF files\n", + "copies = [chunks_df] * 100\n", + "chunks_df = bpd.concat(copies, ignore_index=True)\n", + "chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#chunks_df" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 6c1cc2ff-b0df-4678-84ba-153592077591 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4117: PreviewWarning: \u001b[93maxis=1 scenario is in preview.\u001b[0m\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk(\n", + " connection=bq_connection, chunk_size=2000, overlap_size=200,\n", + " max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\n", + "# notes: use connection is not necessary, we can use default connection.\n", + "# However, in current stage, using a specfic conneciton will grant more quota" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Explode column for future processing." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save to a temporary table" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 03167820-aa2b-4499-9d85-2ffae2770c82 is DONE. 158.0 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chunk_df_exploded = chunk_df_exploded.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job f03d7a4d-2977-43fe-82ab-a40341355a7d is DONE. 86.3 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job bde70e30-312b-4fa5-ba2e-35b441988e4c is DONE. 10.1 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 Integrating Reinforcement Learning, Action Mod...\n", + "0 Benyamin)\n", + "Preprint submitted to Artificial Int...\n", + "0 classical, discrete, environments.\n", + "Therefore, ...\n", + "0 setting we consider in this work isoffline lea...\n", + "0 more complex\n", + "problems that required longer-ter...\n", + "Name: chunk_text, dtype: string" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chunk_df_exploded.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generate Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generation of embeddings within BigFrames." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 741a86ed-0b1b-4c69-ad07-6f9859c6ec9f is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job fd427249-d3e1-42bd-86a0-a1952965effd is DONE. 85.7 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mInterpreting JSON column(s) as the `db_dtypes.dbjson` extension type\n", + "is in preview; this behavior may change in future versions.\u001b[0m\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job a8fba082-2b9b-434c-95fa-c50644d03e03 is DONE. 4.2 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9fc6a16b-03b7-4ea6-a69d-902576e0b251 is DONE. 4.2 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/base.py:289: RuntimeWarning: \u001b[93mSome predictions failed. Check column ml_generate_embedding_status for\n", + "detailed status. You may want to filter the failed rows and retry.\u001b[0m\n", + " warnings.warn(msg, category=RuntimeWarning)\n" + ] + } + ], + "source": [ + "from bigframes.ml import llm\n", + "\n", + "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")\n", + "embeddings_df = text_embedding_model.predict(chunk_df_exploded)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Embedding table in Bigquery if not exist." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "test_project_id = \"bigframes-dev\"\n", + "test_dataset_id = \"shuowei_test_us\"\n", + "test_table_id = \"pdf_chunk_embedding\"\n", + "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save embedding into a BigQuery table for downstream processing." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 7f8ac705-f32c-447a-ac5a-57a0c165dde0 is DONE. 104.8 GB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "'bigframes-dev.shuowei_test_us.pdf_chunk_embedding'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists=\"replace\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create vector search index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Construction of an index over these embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job acfac823-c809-4928-8b1c-132f7f84ea11 is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "BadRequest", + "evalue": "400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/acfac823-c809-4928-8b1c-132f7f84ea11?maxResults=0&location=US&prettyPrint=false: Column 'ml_generate_embedding_result' must have the same array length, while the minimum length is 0 and the maximum length is 768.\n\nLocation: US\nJob ID: acfac823-c809-4928-8b1c-132f7f84ea11\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.39.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[INVALID_INPUT] message=QUERY_ERROR: [Column \\'ml_generate_embedding_result\\' must have the same array length, while the minimum length is 0 and the maximum length is 768.] 
debug=code: \t BAD_QUERY (remaining internal Dremel debug detail and Java server stack condensed)'}]", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mBadRequest\u001b[0m                                Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 2\u001b[0m: bbq.create_vector_index(table_id=embedding_table_id, column_name=\"ml_generate_embedding_result\", distance_type=\"cosine\", index_type=\"ivf\", ivf_options={\"num_lists\": 100})", + "(intermediate frames condensed: bigframes.bigquery.create_vector_index -> Session.read_gbq_query -> GbqDataLoader._start_query -> google-cloud-bigquery QueryJob.result retry loop)", + "\u001b[0;31mBadRequest\u001b[0m: 400 Column 'ml_generate_embedding_result' must have the same array length, while the minimum length is 0 and the maximum length is 768."
\u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494\u001b[0m, in \u001b[0;36mJSONConnection.api_request\u001b[0;34m(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\u001b[0m\n\u001b[1;32m 482\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[1;32m 483\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 484\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 490\u001b[0m extra_api_info\u001b[38;5;241m=\u001b[39mextra_api_info,\n\u001b[1;32m 491\u001b[0m )\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m200\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m300\u001b[39m:\n\u001b[0;32m--> 494\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mfrom_http_response(response)\n\u001b[1;32m 496\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m expect_json \u001b[38;5;129;01mand\u001b[39;00m response\u001b[38;5;241m.\u001b[39mcontent:\n\u001b[1;32m 497\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\u001b[38;5;241m.\u001b[39mjson()\n", + "\u001b[0;31mBadRequest\u001b[0m: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/acfac823-c809-4928-8b1c-132f7f84ea11?maxResults=0&location=US&prettyPrint=false: Column 'ml_generate_embedding_result' must have the same array length, while the minimum length is 0 and the maximum length is 768.\n\nLocation: US\nJob ID: acfac823-c809-4928-8b1c-132f7f84ea11\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.39.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[INVALID_INPUT] message=QUERY_ERROR: [Column 
\\'ml_generate_embedding_result\\' must have the same array length, while the minimum length is 0 and the maximum length is 768.] debug=code: \\t BAD_QUERY\\ndescription: \"Column \\\\\\'ml_generate_embedding_result\\\\\\' must have the same array length, while the minimum length is 0 and the maximum length is 768.\"\\ncause: USER_ERROR\\naddress: \"http://jdyd1.prod.google.com:4901/task?handle=logs.0.prod-ml-us.server.cloud-dataengine-ml.10584282029591\"\\nstatus_proto {\\n code: 3\\n space: \"generic\"\\n message: \"Column \\\\\\'ml_generate_embedding_result\\\\\\' must have the same array length, while the minimum length is 0 and the maximum length is 768.\"\\n}\\nerror_details {\\n argument_error {\\n query_error {\\n }\\n }\\n debug_info {\\n error_message_template: \"Column \\\\\\'$0\\\\\\' must have the same array length, while the minimum length is $1 and the maximum length is $2.\"\\n error_id: 3839077984\\n }\\n}\\n errorProto=code: \"QUERY_ERROR\"\\nargument: \"Column \\\\\\'ml_generate_embedding_result\\\\\\' must have the same array length, while the minimum length is 0 and the maximum length is 768.\"\\nlocation_type: OTHER\\nlocation: \"query\"\\n\\n\\tat com.google.cloud.helix.common.Exceptions.fromProto(Exceptions.java:1993)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl.mapDremelErrorsTohelixException(QueryExecutorImpl.java:1206)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:766)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:693)\\n\\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:123)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1839)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2877)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2801)\\n\\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\\n\\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:901)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:893)\\n\\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1320)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1211)\\n\\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\\n\\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:2000)\\n'}]" + ] + } + ], + "source": [ + "import bigframes.bigquery as bbq\n", + "bbq.create_vector_index(\n", + " table_id=embedding_table_id,\n", + " column_name=\"ml_generate_embedding_result\",\n", + " distance_type=\"cosine\",\n", + " index_type=\"ivf\",\n", + " ivf_options={\"num_lists\": 100},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Search with pointers to the original pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of vector search, with results linked back to the original PDFs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [ + "# generate the embedding of the words for search\n", + "searched_words = [\"reinforce\"]\n", + "searched_words_embeddings = text_embedding_model.predict(searched_words)\n", + "embedding_result_column = \"ml_generate_embedding_result\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# perform vector search\n", + "search_result = (\n", + " bbq.vector_search(\n", + " base_table=embedding_table_id,\n", + " column_to_search=embedding_result_column,\n", + " query=searched_words_embeddings,\n", + " query_column_to_search=embedding_result_column,\n", + " top_k=3,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search_result" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/e2e_RAG_test.ipynb b/notebooks/e2e_RAG_test.ipynb new file mode 100644 index 0000000000..5a707bca3c --- /dev/null +++ b/notebooks/e2e_RAG_test.ipynb @@ -0,0 +1,712 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2025 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, import the BigFrames modules." 
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/notebooks/e2e_RAG_test.ipynb b/notebooks/e2e_RAG_test.ipynb
new file mode 100644
index 0000000000..5a707bca3c
--- /dev/null
+++ b/notebooks/e2e_RAG_test.ipynb
@@ -0,0 +1,712 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copyright 2025 Google LLC\n",
+    "#\n",
+    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "# you may not use this file except in compliance with the License.\n",
+    "# You may obtain a copy of the License at\n",
+    "#\n",
+    "#     https://www.apache.org/licenses/LICENSE-2.0\n",
+    "#\n",
+    "# Unless required by applicable law or agreed to in writing, software\n",
+    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "# See the License for the specific language governing permissions and\n",
+    "# limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of the API syntax, then dive into a real-world application, showcasing performance on a large dataset along with key statistics."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Preparation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, import the BigFrames modules."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bigframes\n",
+    "import bigframes.pandas as bpd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Make sure the BigFrames version is at least `1.38.0`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from packaging.version import Version\n",
+    "\n",
+    "assert Version(bigframes.__version__) >= Version(\"1.38.0\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Connect to the test environment."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:54: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject to change in the future.\n",
+      "  warnings.warn(msg, category=bfe.PreviewWarning)\n",
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/bigquery_options.py:362: UserWarning: This is an advanced configuration option for directly setting endpoints. Incorrect use may lead to unexpected behavior or system instability. Proceed only if you fully understand its implications.\n",
+      "  warnings.warn(msg)\n"
+     ]
+    }
+   ],
+   "source": [
+    "bigframes.options.experiments.blob = True\n",
+    "bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\",\n",
+    "                                                                 \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\",\n",
+    "                                                                 \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# PDF chunking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Retrieve the PDF URLs, extract the text, and split it into chunks."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
+      "  return func(get_global_session(), *args, **kwargs)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 8bb45a5c-8c84-42a2-945e-c82ded85fb31 is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 85c66232-0901-46b3-a2b5-69f5a11ff85e is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\", name=\"pdf\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Copy files to generate more inputs; this would give 1,000 PDF files\n",
+    "#copies = [chunks_df] * 20\n",
+    "#chunks_df = bpd.concat(copies, ignore_index=True)\n",
+    "#chunks_df = chunks_df.cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job f055935b-00cc-40b3-8631-eab065130596 is DONE. 734.8 kB processed. Open Job"
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files to genearte more inputs, now we have 10,000 PDF files\n", + "copies = [chunks_df] * 100\n", + "chunks_df = bpd.concat(copies, ignore_index=True)\n", + "chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 66f5550a-e645-4fc7-87d7-4e0eee7ff08b is DONE. 1.6 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files again, now we have 1,000,000 PDF files\n", + "copies = [chunks_df] * 100\n", + "chunks_df = bpd.concat(copies, ignore_index=True)\n", + "chunks_df = chunks_df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 28e709fc-2cd0-4512-aac2-b22872a3b84f is DONE. 158.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job b9a9479e-bb47-4a2e-afe1-b8fd099dcb88 is DONE. 158.0 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pdf
0uri: gs://shuowei_bucket/pdf/NeurIPS-2024-hydra-bidirectional-state-space-models-through-generalized-matrix-mixers-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
1uri: gs://shuowei_bucket/pdf/NeurIPS-2023-neural-latent-geometry-search-product-manifold-inference-via-gromov-hausdorff-informed-bayesian-optimization-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
2uri: gs://shuowei_bucket/pdf/NeurIPS-2024-a-robust-inlier-identification-algorithm-for-point-cloud-registration-via-mathbfell_0-minimization-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
3uri: gs://shuowei_bucket/pdf/NeurIPS-2024-can-an-ai-agent-safely-run-a-government-existence-of-probably-approximately-aligned-policies-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
4uri: gs://shuowei_bucket/pdf/2502.12961v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
5uri: gs://shuowei_bucket/pdf/NeurIPS-2024-inexact-augmented-lagrangian-methods-for-conic-optimization-quadratic-growth-and-linear-convergence-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
6uri: gs://shuowei_bucket/pdf/NeurIPS-2024-predicting-the-performance-of-foundation-models-via-agreement-on-the-line-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
7uri: gs://shuowei_bucket/pdf/NeurIPS-2024-prediction-with-action-visual-policy-learning-via-joint-denoising-process-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
8uri: gs://shuowei_bucket/pdf/NeurIPS-2023-look-ma-no-hands-agent-environment-factorization-of-egocentric-videos-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
9uri: gs://shuowei_bucket/pdf/NeurIPS-2024-cross-scale-self-supervised-blind-image-deblurring-via-implicit-neural-representation-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
10uri: gs://shuowei_bucket/pdf/NeurIPS-2023-two-stage-learning-to-defer-with-multiple-experts-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
11uri: gs://shuowei_bucket/pdf/NeurIPS-2023-on-separate-normalization-in-self-supervised-transformers-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
12uri: gs://shuowei_bucket/pdf/NeurIPS-2024-decrl-a-deep-evolutionary-clustering-jointed-temporal-knowledge-graph-representation-learning-approach-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
13uri: gs://shuowei_bucket/pdf/NeurIPS-2023-demystifying-the-optimal-performance-of-multi-class-classification-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
14uri: gs://shuowei_bucket/pdf/2502.12926v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
15uri: gs://shuowei_bucket/pdf/2502.13069v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
16uri: gs://shuowei_bucket/pdf/NeurIPS-2024-a-scalable-generative-model-for-dynamical-system-reconstruction-from-neuroimaging-data-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
17uri: gs://shuowei_bucket/pdf/NeurIPS-2024-disentangling-interpretable-factors-with-supervised-independent-subspace-principal-component-analysis-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
18uri: gs://shuowei_bucket/pdf/NeurIPS-2023-deliffas-deformable-light-fields-for-fast-avatar-synthesis-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
19uri: gs://shuowei_bucket/pdf/NeurIPS-2024-diffusion-actor-critic-with-entropy-regulator-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
20uri: gs://shuowei_bucket/pdf/NeurIPS-2023-accurate-interpolation-for-scattered-data-through-hierarchical-residual-refinement-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
21uri: gs://shuowei_bucket/pdf/NeurIPS-2023-expressive-sign-equivariant-networks-for-spectral-geometric-learning-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
22uri: gs://shuowei_bucket/pdf/NeurIPS-2024-flexible-task-abstractions-emerge-in-linear-networks-with-fast-and-bounded-units-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
23uri: gs://shuowei_bucket/pdf/NeurIPS-2023-h3t-efficient-integration-of-memory-optimization-and-parallelism-for-large-scale-transformer-training-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
24uri: gs://shuowei_bucket/pdf/2502.12224v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
\n", + "

25 rows × 1 columns

\n", + "
+       "[1000000 rows x 1 columns in total]"
+      ],
+      "text/plain": [
+       "                                                  pdf\n",
+       "0   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "1   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "2   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "3   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "4   {'uri': 'gs://shuowei_bucket/pdf/2502.12961v1....\n",
+       "5   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "6   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "7   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "8   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "9   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "10  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "11  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "12  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "13  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "14  {'uri': 'gs://shuowei_bucket/pdf/2502.12926v1....\n",
+       "15  {'uri': 'gs://shuowei_bucket/pdf/2502.13069v1....\n",
+       "16  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "17  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "18  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "19  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "20  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "21  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "22  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
+       "23  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
+       "24  {'uri': 'gs://shuowei_bucket/pdf/2502.12224v1....\n",
+       "...\n",
+       "\n",
+       "[1000000 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chunks_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 4081bd0d-0d54-4c70-96d0-1f55b497cb5d is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4098: PreviewWarning: axis=1 scenario is in preview.\n",
+      "  warnings.warn(msg, category=bfe.PreviewWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
+    "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk(\n",
+    "    connection=bq_connection, chunk_size=2000, overlap_size=200,\n",
+    "    max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\n",
+    "# Note: passing a connection is not strictly necessary; the default connection also works.\n",
+    "# However, at the current stage, using a specific connection grants more quota."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Explode the chunk column for further processing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Save to a temporary table."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job e671bba2-377c-45b9-9947-44f1914fae4e is RUNNING. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "BadRequest", + "evalue": "400 GET https://test-bigquery.sandbox.google.com/bigquery/v2/projects/bigframes-dev/queries/e671bba2-377c-45b9-9947-44f1914fae4e?maxResults=0&location=US&prettyPrint=false: The job encountered an error during execution. Retrying the job may solve the problem.\n\nLocation: US\nJob ID: e671bba2-377c-45b9-9947-44f1914fae4e\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[CONNECTION_ERROR] debug=Dremel returned an error: generic::UNAVAILABLE: Reached maximum number of retriable errors. errorProto=code: \"CONNECTION_ERROR\"\\n\\n\\tat com.google.cloud.helix.common.Exceptions$Public.connectionError(Exceptions.java:776)\\n\\tat com.google.cloud.helix.common.Exceptions$Public.connectionError(Exceptions.java:780)\\n\\tat com.google.cloud.helix.server.job.DremelErrorUtil.createHelixErrorFromDremelRpcException(DremelErrorUtil.java:60)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:783)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:697)\\n\\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:123)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1839)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2877)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2801)\\n\\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\\n\\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:901)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:893)\\n\\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1320)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1211)\\n\\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\\n\\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:2000)\\n\\tSuppressed: java.lang.Exception: Including call stack from HelixFutures\\n\\t\\tat com.google.cloud.helix.common.HelixFutures.getHelixException(HelixFutures.java:76)\\n\\t\\tat com.google.cloud.helix.common.HelixFutures.getDone(HelixFutures.java:55)\\n\\t\\tat com.google.cloud.helix.server.job.LocalQueryJobController.handleQueryDone(LocalQueryJobController.java:2626)\\n\\t\\tat com.google.cloud.helix.server.job.LocalQueryJobController.lambda$runJob$1(LocalQueryJobController.java:2539)\\n\\t\\tat com.google.common.util.concurrent.CombinedFuture$CallableInterruptibleTask.runInterruptibly(CombinedFuture.java:196)\\n\\t\\tat com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:74)\\n\\t\\tat com.google.common.context.ContextRunnable.runInContext(ContextRunnable.java:83)\\n\\t\\tat io.grpc.Context.run(Context.java:536)\\n\\t\\tat 
com.google.tracing.GenericContextCallback.runInInheritedContext(GenericContextCallback.java:78)\\n\\t\\tat com.google.common.context.ContextRunnable.run(ContextRunnable.java:74)\\n\\t\\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)\\n\\t\\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)\\n\\t\\tat java.base/java.lang.Thread.run(Unknown Source)\\n'}]", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mBadRequest\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m chunk_df_exploded \u001b[38;5;241m=\u001b[39m \u001b[43mchunk_df_exploded\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcache\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:147\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 144\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# self._block.expr.session.bqclient. 
Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/series.py:2135\u001b[0m, in \u001b[0;36mSeries.cache\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 2126\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2127\u001b[0m \u001b[38;5;124;03mMaterializes the Series to a temporary table.\u001b[39;00m\n\u001b[1;32m 2128\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2132\u001b[0m \u001b[38;5;124;03m Series: Self\u001b[39;00m\n\u001b[1;32m 2133\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 2134\u001b[0m \u001b[38;5;66;03m# Do not use session-aware cashing if user-requested\u001b[39;00m\n\u001b[0;32m-> 2135\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cached\u001b[49m\u001b[43m(\u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession_aware\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:147\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 144\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 147\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# self._block.expr.session.bqclient. 
Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/series.py:2138\u001b[0m, in \u001b[0;36mSeries._cached\u001b[0;34m(self, force, session_aware)\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_cached\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m, force: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m, session_aware: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Series:\n\u001b[0;32m-> 2138\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_block\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession_aware\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msession_aware\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2139\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/blocks.py:2445\u001b[0m, in \u001b[0;36mBlock.cached\u001b[0;34m(self, force, session_aware)\u001b[0m\n\u001b[1;32m 2443\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Write the block to a session table.\"\"\"\u001b[39;00m\n\u001b[1;32m 2444\u001b[0m \u001b[38;5;66;03m# use a heuristic for whether something needs to be cached\u001b[39;00m\n\u001b[0;32m-> 2445\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_executor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2446\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2447\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2448\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_session\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msession_aware\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2449\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2450\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:456\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor.cached\u001b[0;34m(self, array_value, force, use_session, cluster_cols)\u001b[0m\n\u001b[1;32m 454\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cache_with_session_awareness(array_value)\n\u001b[1;32m 455\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 456\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cache_with_cluster_cols\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcluster_cols\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:532\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._cache_with_cluster_cols\u001b[0;34m(self, array_value, cluster_cols)\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Executes the query and uses the resulting table to rewrite future executions.\"\"\"\u001b[39;00m\n\u001b[1;32m 529\u001b[0m sql, schema, ordering_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompiler\u001b[38;5;241m.\u001b[39mcompile_raw(\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplace_cached_subtrees(array_value\u001b[38;5;241m.\u001b[39mnode)\n\u001b[1;32m 531\u001b[0m )\n\u001b[0;32m--> 532\u001b[0m tmp_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_as_cached_temp_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m \u001b[49m\u001b[43mschema\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_cluster_cols\u001b[49m\u001b[43m(\u001b[49m\u001b[43mschema\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 536\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 537\u001b[0m cached_replacement \u001b[38;5;241m=\u001b[39m array_value\u001b[38;5;241m.\u001b[39mas_cached(\n\u001b[1;32m 538\u001b[0m cache_table\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mget_table(tmp_table),\n\u001b[1;32m 539\u001b[0m ordering\u001b[38;5;241m=\u001b[39mordering_info,\n\u001b[1;32m 540\u001b[0m )\u001b[38;5;241m.\u001b[39mnode\n\u001b[1;32m 541\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cached_executions[array_value\u001b[38;5;241m.\u001b[39mnode] \u001b[38;5;241m=\u001b[39m cached_replacement\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:626\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._sql_as_cached_temp_table\u001b[0;34m(self, sql, schema, cluster_cols)\u001b[0m\n\u001b[1;32m 621\u001b[0m job_config \u001b[38;5;241m=\u001b[39m cast(\n\u001b[1;32m 622\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig,\n\u001b[1;32m 623\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig\u001b[38;5;241m.\u001b[39mfrom_api_repr({}),\n\u001b[1;32m 624\u001b[0m )\n\u001b[1;32m 625\u001b[0m job_config\u001b[38;5;241m.\u001b[39mdestination \u001b[38;5;241m=\u001b[39m temp_table\n\u001b[0;32m--> 626\u001b[0m _, query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_execute_query\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 627\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcached\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 630\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 631\u001b[0m query_job\u001b[38;5;241m.\u001b[39mdestination\n\u001b[1;32m 632\u001b[0m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:492\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._run_execute_query\u001b[0;34m(self, sql, job_config, api_name, page_size, max_results)\u001b[0m\n\u001b[1;32m 490\u001b[0m bq_io\u001b[38;5;241m.\u001b[39madd_and_trim_labels(job_config, api_name\u001b[38;5;241m=\u001b[39mapi_name)\n\u001b[1;32m 491\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 492\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mapi_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmetrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m google\u001b[38;5;241m.\u001b[39mapi_core\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mBadRequest \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 503\u001b[0m \u001b[38;5;66;03m# Unfortunately, this error type does not have a separate error code or exception type\u001b[39;00m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mResources exceeded during query execution\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m e\u001b[38;5;241m.\u001b[39mmessage:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:253\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, max_results, page_size, timeout, api_name, metrics)\u001b[0m\n\u001b[1;32m 251\u001b[0m opts \u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 
+      " 252 if opts.progress_bar is not None and not query_job.configuration.dry_run:\n--> 253     results_iterator = formatting_helpers.wait_for_query_job(\n 254         query_job,\n 255         max_results=max_results,\n 256         progress_bar=opts.progress_bar,\n 257         page_size=page_size,\n 258     )\n 259 else:\n 260     results_iterator = query_job.result(\n 261         max_results=max_results, page_size=page_size\n 262     )\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:139, in wait_for_query_job(query_job, max_results, page_size, progress_bar)\n--> 139 query_result = query_job.result(\n 140     max_results=max_results, page_size=page_size\n 141 )\n",
+      "File ~/src/python-bigquery/google/cloud/bigquery/job/query.py:1681, in QueryJob.result(self, page_size, max_results, retry, timeout, start_index, job_retry)\n--> 1681 while not is_job_done():\n",
+      "    (google.api_core.retry wrapper frames elided and ANSI escape codes stripped for readability)\n",
+      "File ~/src/python-bigquery/google/cloud/bigquery/job/query.py:1650, in QueryJob.result..is_job_done()\n--> 1650 self._reload_query_results(retry=retry, **reload_query_results_kwargs)\n",
+      "File ~/src/python-bigquery/google/cloud/bigquery/job/query.py:1448, in QueryJob._reload_query_results(self, retry, timeout, page_size)\n--> 1448 self._query_results = self._client._get_query_results(\n 1449     self.job_id,\n 1450     retry,\n 1451     project=self.project,\n 1452     timeout_ms=timeout_ms,\n 1453     location=self.location,\n 1454     timeout=transport_timeout,\n 1455     page_size=page_size,\n 1456 )\n",
+      "File ~/src/python-bigquery/google/cloud/bigquery/client.py:2028, in Client._get_query_results(self, job_id, retry, project, timeout_ms, location, timeout, page_size)\n--> 2028 resource = self._call_api(\n 2029     retry,\n 2030     span_name=\"BigQuery.getQueryResults\",\n 2031     span_attributes=span_attributes,\n 2032     method=\"GET\",\n 2033     path=path,\n 2034     query_params=extra_params,\n 2035     timeout=timeout,\n 2036 )\n",
+      "File ~/src/python-bigquery/google/cloud/bigquery/client.py:837, in Client._call_api(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\n--> 837 return call()\n",
+      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494, in JSONConnection.api_request(...)\n 493 if not 200 <= response.status_code < 300:\n--> 494     raise exceptions.from_http_response(response)\n",
+      "BadRequest: 400 GET https://test-bigquery.sandbox.google.com/bigquery/v2/projects/bigframes-dev/queries/e671bba2-377c-45b9-9947-44f1914fae4e?maxResults=0&location=US&prettyPrint=false: The job encountered an error during execution. Retrying the job may solve the problem.\n\nLocation: US\nJob ID: e671bba2-377c-45b9-9947-44f1914fae4e\nShare your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 1.38.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[CONNECTION_ERROR] debug=Dremel returned an error: generic::UNAVAILABLE: Reached maximum number of retriable errors. (Java server stack elided; terminal frame: at java.base/java.lang.Thread.run(Unknown Source))'}]"
+     ]
+    }
+   ],
+   "source": [
+    "chunk_df_exploded = chunk_df_exploded.cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chunk_df_exploded"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Generate Embeddings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Generation of embeddings within BigFrames."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from bigframes.ml import llm\n",
+    "\n",
+    "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")\n",
+    "embeddings_df = text_embedding_model.predict(chunk_df_exploded)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the embedding table in BigQuery if it does not already exist."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_project_id = \"bigframes-dev\"\n",
+    "test_dataset_id = \"shuowei_test_us\"\n",
+    "test_table_id = \"pdf_chunk_embedding\"\n",
+    "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Save the embeddings into a BigQuery table for downstream processing."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embeddings_df.to_gbq(destination_table=embedding_table_id, if_exists=\"replace\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Create vector search index"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Construction of an index over these embeddings."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import bigframes.bigquery as bbq\n",
+    "\n",
+    "bbq.create_vector_index(\n",
+    "    table_id=embedding_table_id,\n",
+    "    column_name=\"ml_generate_embedding_result\",\n",
+    "    distance_type=\"cosine\",\n",
+    "    index_type=\"ivf\",\n",
+    "    ivf_options={\"num_lists\": 100},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Search with pointers to the original PDF"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Execution of vector search, with results linked back to the original PDFs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate the embeddings of the words to search for.\n",
+    "searched_words = [\"reinforce\"]\n",
+    "searched_words_embeddings = text_embedding_model.predict(searched_words)\n",
+    "embedding_result_column = \"ml_generate_embedding_result\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Perform the vector search.\n",
+    "search_result = bbq.vector_search(\n",
+    "    base_table=embedding_table_id,\n",
+    "    column_to_search=embedding_result_column,\n",
+    "    query=searched_words_embeddings,\n",
+    "    query_column_to_search=embedding_result_column,\n",
+    "    top_k=3,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "search_result"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
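Reviewer note on the notebook above: search_result holds the matched rows from the embedding table plus a similarity score. A minimal sketch of linking matches back to the source chunks, assuming bbq.vector_search surfaces a "distance" column and that TextEmbeddingGenerator.predict kept the input text in a "content" column (both column names are assumptions, not verified by this patch):

    # Rank matches by distance (smaller means closer for cosine) and keep the
    # readable text next to its score; column names are assumed, see note above.
    closest_chunks = search_result.sort_values("distance")[["content", "distance"]]
    closest_chunks.head(3)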
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/google_sql_notebook.ipynb b/notebooks/google_sql_notebook.ipynb new file mode 100644 index 0000000000..5b6fd2d7b3 --- /dev/null +++ b/notebooks/google_sql_notebook.ipynb @@ -0,0 +1,54 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "bpd.options.bigquery.project = 'bigquery-public-data'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gender_filter = 'M'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bigquery --params {\"gender_filter\": gender_filter}\n", + "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE gender = @gender_filter" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/multimodal/transcribe_partial_mode.ipynb b/notebooks/multimodal/transcribe_partial_mode.ipynb new file mode 100644 index 0000000000..4d3598df8d --- /dev/null +++ b/notebooks/multimodal/transcribe_partial_mode.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "6d77cb8d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 536d38d9-59d8-49ac-9247-f8d66dccabdc is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d6f4bc43-015f-427c-97a4-4005fcd5dc37 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 3dc5742c-35a4-45ec-9b1d-a08fd072b7b8 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0b18e95b-0728-448e-b27a-8094d27d8135 is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "Forbidden", + "evalue": "403 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/0b18e95b-0728-448e-b27a-8094d27d8135?maxResults=0&location=US&prettyPrint=false: Access Denied: BigQuery BigQuery: Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist). 
diff --git a/notebooks/multimodal/transcribe_partial_mode.ipynb b/notebooks/multimodal/transcribe_partial_mode.ipynb
new file mode 100644
index 0000000000..4d3598df8d
--- /dev/null
+++ b/notebooks/multimodal/transcribe_partial_mode.ipynb
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6d77cb8d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
+      "  _global_session = bigframes.session.connect(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 536d38d9-59d8-49ac-9247-f8d66dccabdc is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job d6f4bc43-015f-427c-97a4-4005fcd5dc37 is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 3dc5742c-35a4-45ec-9b1d-a08fd072b7b8 is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 0b18e95b-0728-448e-b27a-8094d27d8135 is RUNNING. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "ename": "Forbidden",
+     "evalue": "403 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/0b18e95b-0728-448e-b27a-8094d27d8135?maxResults=0&location=US&prettyPrint=false: Access Denied: BigQuery BigQuery: Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.\n\nLocation: US\nJob ID: 0b18e95b-0728-448e-b27a-8094d27d8135\nShare your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.10.0. (Nested ACCESS_DENIED DebugInfo detail elided for readability.)",
+     "output_type": "error",
+     "traceback": [
+      "---------------------------------------------------------------------------",
+      "Forbidden                                 Traceback (most recent call last)",
+      "Cell In[1], line 20\n  9 flattened = bpd.from_glob_path(\n 10     \"gs://your-bucket/audio-files/*\",\n 11     name=\"GCS Blob\"\n 12 )\n 19 # 3. This will trigger the NullIndexError\n---> 20 flattened[\"Transcription\"] = flattened[\"GCS Blob\"].blob.audio_transcribe(\n 21     model_name=\"gemini-2.0-flash-001\",\n 22     verbose=True,\n 23 )\n",
+      "    (repeated bigframes log_adapter wrapper frames elided and ANSI escape codes stripped for readability)\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/operations/blob.py:822, in BlobAccessor.audio_transcribe(self, engine, connection, model_name, verbose)\n--> 822 transcribed_results = llm_model.predict(\n 823     X=audio_series,\n 824     prompt=[prompt_text, audio_series],\n 825     temperature=0.0,\n 826 )\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/ml/llm.py:745, in GeminiTextGenerator.predict(...)\n--> 745 return self._predict_and_retry(\n 746     core.BqmlModel.generate_text_tvf,\n 747     X,\n 748     options=options,\n 749     max_retries=max_retries,\n 750 )\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/ml/base.py:266, in RetriableRemotePredictor._predict_and_retry(self, bqml_model_predict_tvf, X, options, max_retries)\n--> 266 df = bqml_model_predict_tvf.tvf(self._bqml_model, df_fail, options)\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:179, in BqmlModel.generate_text(self, input_data, options)\n--> 179 return self._apply_ml_tvf(\n 180     input_data,\n 181     lambda source_sql: self._sql_generator.ml_generate_text(\n 182         source_sql=source_sql,\n 183         struct_options=options,\n 184     ),\n 185 )\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:98, in BqmlModel._apply_ml_tvf(self, input_data, apply_sql_tvf)\n  97 result_sql = apply_sql_tvf(input_sql)\n---> 98 df = self._session.read_gbq(result_sql, index_col=index_col_ids)\n",
+      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger..wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)\n",
+      "File 
\u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/__init__.py:439\u001b[0m, in \u001b[0;36mSession.read_gbq\u001b[0;34m(self, query_or_table, index_col, columns, configuration, max_results, filters, use_cache, col_order, dry_run)\u001b[0m\n\u001b[1;32m 436\u001b[0m columns \u001b[38;5;241m=\u001b[39m col_order\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m bf_io_bigquery\u001b[38;5;241m.\u001b[39mis_query(query_or_table):\n\u001b[0;32m--> 439\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_loader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_gbq_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore # for dry_run overload\u001b[39;49;00m\n\u001b[1;32m 440\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_or_table\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 441\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 442\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 443\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfiguration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfiguration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 444\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 445\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 446\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 447\u001b[0m \u001b[43m \u001b[49m\u001b[43mdry_run\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdry_run\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 448\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 449\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 450\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m configuration \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/loader.py:996\u001b[0m, in \u001b[0;36mGbqDataLoader.read_gbq_query\u001b[0;34m(self, query, index_col, columns, configuration, max_results, use_cache, filters, dry_run, force_total_order, allow_large_results)\u001b[0m\n\u001b[1;32m 993\u001b[0m \u001b[38;5;66;03m# TODO(b/421161077): If an explicit destination table is set in\u001b[39;00m\n\u001b[1;32m 994\u001b[0m \u001b[38;5;66;03m# configuration, should we respect that setting?\u001b[39;00m\n\u001b[1;32m 995\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m allow_large_results:\n\u001b[0;32m--> 996\u001b[0m destination, query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_query_to_destination\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 997\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 998\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# No cluster candidates as user query might not be clusterable\u001b[39;49;00m\n\u001b[1;32m 999\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# (eg because of ORDER BY clause)\u001b[39;49;00m\n\u001b[1;32m 1000\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mcluster_candidates\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1001\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfiguration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfiguration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1002\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1003\u001b[0m query_job_for_metrics \u001b[38;5;241m=\u001b[39m query_job\n\u001b[1;32m 1004\u001b[0m rows \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/loader.py:1120\u001b[0m, in \u001b[0;36mGbqDataLoader._query_to_destination\u001b[0;34m(self, query, cluster_candidates, configuration, do_clustering)\u001b[0m\n\u001b[1;32m 1116\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1117\u001b[0m \u001b[38;5;66;03m# Write to temp table to workaround BigQuery 10 GB query results\u001b[39;00m\n\u001b[1;32m 1118\u001b[0m \u001b[38;5;66;03m# limit. See: internal issue 303057336.\u001b[39;00m\n\u001b[1;32m 1119\u001b[0m job_config\u001b[38;5;241m.\u001b[39mlabels[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124merror_caught\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtrue\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1120\u001b[0m query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_start_query_with_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1121\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1122\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1123\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1124\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1125\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mdestination, query_job\n\u001b[1;32m 1126\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m google\u001b[38;5;241m.\u001b[39mapi_core\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mBadRequest:\n\u001b[1;32m 1127\u001b[0m \u001b[38;5;66;03m# Some SELECT statements still aren't compatible with cluster\u001b[39;00m\n\u001b[1;32m 1128\u001b[0m \u001b[38;5;66;03m# tables as the destination. 
For example, if the query has a\u001b[39;00m\n\u001b[1;32m 1129\u001b[0m \u001b[38;5;66;03m# top-level ORDER BY, this conflicts with our ability to cluster\u001b[39;00m\n\u001b[1;32m 1130\u001b[0m \u001b[38;5;66;03m# the table by the index column(s).\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/loader.py:1186\u001b[0m, in \u001b[0;36mGbqDataLoader._start_query_with_job\u001b[0;34m(self, sql, job_config, timeout)\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1181\u001b[0m \u001b[38;5;124;03mStarts BigQuery query job and waits for results.\u001b[39;00m\n\u001b[1;32m 1182\u001b[0m \n\u001b[1;32m 1183\u001b[0m \u001b[38;5;124;03mDo not execute dataframe through this API, instead use the executor.\u001b[39;00m\n\u001b[1;32m 1184\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1185\u001b[0m job_config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_job_config(job_config)\n\u001b[0;32m-> 1186\u001b[0m _, query_job \u001b[38;5;241m=\u001b[39m \u001b[43mbf_io_bigquery\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1187\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_bqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1188\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1189\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1190\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1191\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1192\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1193\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1194\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_with_job\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1195\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1196\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m query_job\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, timeout, metrics, query_with_job)\u001b[0m\n\u001b[1;32m 312\u001b[0m opts \u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m opts\u001b[38;5;241m.\u001b[39mprogress_bar \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mconfiguration\u001b[38;5;241m.\u001b[39mdry_run:\n\u001b[0;32m--> 314\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m 
\u001b[43mformatting_helpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_query_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 315\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_job\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 316\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mopts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 317\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 319\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149\u001b[0m, in \u001b[0;36mwait_for_query_job\u001b[0;34m(query_job, max_results, page_size, progress_bar)\u001b[0m\n\u001b[1;32m 147\u001b[0m loading_bar \u001b[38;5;241m=\u001b[39m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job))\n\u001b[1;32m 148\u001b[0m display\u001b[38;5;241m.\u001b[39mdisplay(loading_bar, display_id\u001b[38;5;241m=\u001b[39mdisplay_id)\n\u001b[0;32m--> 149\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[43mquery_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\n\u001b[1;32m 151\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m query_job\u001b[38;5;241m.\u001b[39mreload()\n\u001b[1;32m 153\u001b[0m display\u001b[38;5;241m.\u001b[39mupdate_display(\n\u001b[1;32m 154\u001b[0m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job)),\n\u001b[1;32m 155\u001b[0m display_id\u001b[38;5;241m=\u001b[39mdisplay_id,\n\u001b[1;32m 156\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1696\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1691\u001b[0m remaining_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1693\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining_timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1694\u001b[0m \u001b[38;5;66;03m# Since is_job_done() calls jobs.getQueryResults, which is a\u001b[39;00m\n\u001b[1;32m 1695\u001b[0m \u001b[38;5;66;03m# long-running API, don't delay the next request at all.\u001b[39;00m\n\u001b[0;32m-> 1696\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mis_job_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1697\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 1698\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1699\u001b[0m \u001b[38;5;66;03m# Use a monotonic clock since we don't actually care about\u001b[39;00m\n\u001b[1;32m 1700\u001b[0m \u001b[38;5;66;03m# daylight savings or similar, just the elapsed time.\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:294\u001b[0m, in 
\u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 290\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 291\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 293\u001b[0m )\n\u001b[0;32m--> 294\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 300\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:156\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 155\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 156\u001b[0m next_sleep \u001b[38;5;241m=\u001b[39m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_iter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 164\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 167\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(next_sleep)\n", + "File 
\u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:214\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, sleep_iterator, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 209\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 210\u001b[0m error_list,\n\u001b[1;32m 211\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 212\u001b[0m original_timeout,\n\u001b[1;32m 213\u001b[0m )\n\u001b[0;32m--> 214\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 216\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:147\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 147\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 149\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1665\u001b[0m, in \u001b[0;36mQueryJob.result..is_job_done\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 1661\u001b[0m \u001b[38;5;66;03m# Call jobs.getQueryResults with max results set to 0 just to\u001b[39;00m\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;66;03m# wait for the query to finish. 
Unlike most methods,\u001b[39;00m\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;66;03m# jobs.getQueryResults hangs as long as it can to ensure we\u001b[39;00m\n\u001b[1;32m 1664\u001b[0m \u001b[38;5;66;03m# know when the query has finished as soon as possible.\u001b[39;00m\n\u001b[0;32m-> 1665\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reload_query_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mretry\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mreload_query_results_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[38;5;66;03m# Even if the query is finished now according to\u001b[39;00m\n\u001b[1;32m 1668\u001b[0m \u001b[38;5;66;03m# jobs.getQueryResults, we'll want to reload the job status if\u001b[39;00m\n\u001b[1;32m 1669\u001b[0m \u001b[38;5;66;03m# it's not already DONE.\u001b[39;00m\n\u001b[1;32m 1670\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1463\u001b[0m, in \u001b[0;36mQueryJob._reload_query_results\u001b[0;34m(self, retry, timeout, page_size)\u001b[0m\n\u001b[1;32m 1460\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(transport_timeout, (\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mint\u001b[39m)):\n\u001b[1;32m 1461\u001b[0m transport_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1463\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_query_results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_query_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1464\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjob_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1465\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1466\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1467\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout_ms\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_ms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1468\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1469\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtransport_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1470\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1471\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:2060\u001b[0m, in \u001b[0;36mClient._get_query_results\u001b[0;34m(self, job_id, retry, project, timeout_ms, location, timeout, 
page_size)\u001b[0m\n\u001b[1;32m 2056\u001b[0m \u001b[38;5;66;03m# This call is typically made in a polling loop that checks whether the\u001b[39;00m\n\u001b[1;32m 2057\u001b[0m \u001b[38;5;66;03m# job is complete (from QueryJob.done(), called ultimately from\u001b[39;00m\n\u001b[1;32m 2058\u001b[0m \u001b[38;5;66;03m# QueryJob.result()). So we don't need to poll here.\u001b[39;00m\n\u001b[1;32m 2059\u001b[0m span_attributes \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m\"\u001b[39m: path}\n\u001b[0;32m-> 2060\u001b[0m resource \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_api\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2061\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2062\u001b[0m \u001b[43m \u001b[49m\u001b[43mspan_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mBigQuery.getQueryResults\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2063\u001b[0m \u001b[43m \u001b[49m\u001b[43mspan_attributes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspan_attributes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2064\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2065\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2066\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2067\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2068\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2069\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _QueryResults\u001b[38;5;241m.\u001b[39mfrom_api_repr(resource)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:858\u001b[0m, in \u001b[0;36mClient._call_api\u001b[0;34m(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m span_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 855\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m create_span(\n\u001b[1;32m 856\u001b[0m name\u001b[38;5;241m=\u001b[39mspan_name, attributes\u001b[38;5;241m=\u001b[39mspan_attributes, client\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m, job_ref\u001b[38;5;241m=\u001b[39mjob_ref\n\u001b[1;32m 857\u001b[0m ):\n\u001b[0;32m--> 858\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m call()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:294\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 290\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 291\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 292\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 293\u001b[0m )\n\u001b[0;32m--> 294\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 300\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:156\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 155\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 156\u001b[0m next_sleep \u001b[38;5;241m=\u001b[39m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_iter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 164\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 167\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(next_sleep)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:214\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, sleep_iterator, error_list, predicate_fn, 
on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 209\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 210\u001b[0m error_list,\n\u001b[1;32m 211\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 212\u001b[0m original_timeout,\n\u001b[1;32m 213\u001b[0m )\n\u001b[0;32m--> 214\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 216\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:147\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 147\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 149\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494\u001b[0m, in \u001b[0;36mJSONConnection.api_request\u001b[0;34m(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\u001b[0m\n\u001b[1;32m 482\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[1;32m 483\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 484\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 490\u001b[0m extra_api_info\u001b[38;5;241m=\u001b[39mextra_api_info,\n\u001b[1;32m 491\u001b[0m )\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m200\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m300\u001b[39m:\n\u001b[0;32m--> 494\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mfrom_http_response(response)\n\u001b[1;32m 496\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m expect_json \u001b[38;5;129;01mand\u001b[39;00m response\u001b[38;5;241m.\u001b[39mcontent:\n\u001b[1;32m 497\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\u001b[38;5;241m.\u001b[39mjson()\n", + "\u001b[0;31mForbidden\u001b[0m: 403 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/0b18e95b-0728-448e-b27a-8094d27d8135?maxResults=0&location=US&prettyPrint=false: Access Denied: BigQuery BigQuery: Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. 
+    "Forbidden: 403 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/0b18e95b-0728-448e-b27a-8094d27d8135?maxResults=0&location=US&prettyPrint=false: Access Denied: BigQuery BigQuery: Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.\n\nLocation: US\nJob ID: 0b18e95b-0728-448e-b27a-8094d27d8135\nShare your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.10.0.\n[... nested google.rpc.DebugInfo / GcsErrorDetails payload elided; it restates the same storage.objects.list ACCESS_DENIED error for the connection's delegate service account ...]"
Storage Object Viewer or Storage Object Creator.\"\\nlocation_type: OTHER\\nlocation: \"gs://your-bucket/audio-files/*\"\\n\\n\\tat com.google.cloud.helix.common.Exceptions.fromProto(Exceptions.java:2016)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl.mapDremelErrorsTohelixException(QueryExecutorImpl.java:1194)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:769)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:695)\\n\\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:128)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1852)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2904)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2830)\\n\\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\\n\\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:900)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:892)\\n\\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1319)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1210)\\n\\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\\n\\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:1999)\\n'}]" + ] + } + ], + "source": [ + "import bigframes.pandas as bpd \n", + "import bigframes.enums \n", + " \n", + "# 1. Enable partial ordering mode \n", + "bpd.options.bigquery.ordering_mode = \"partial\" \n", + " \n", + "# 2. Create a DataFrame with blob data and null index \n", + "# Using from_glob_path which creates multimodal DataFrames \n", + "flattened = bpd.from_glob_path( \n", + " \"gs://your-bucket/audio-files/*\", \n", + " name=\"GCS Blob\" \n", + ") \n", + " \n", + "# Alternatively, create from URI strings with null index \n", + "# df = bpd.DataFrame({\"uri\": [\"gs://bucket/audio1.wav\", \"gs://bucket/audio2.wav\"]}) \n", + "# df[\"GCS Blob\"] = df[\"uri\"].str.to_blob() \n", + "# flattened = bpd.read_gbq_table(\"your_table\", index_col=bigframes.enums.DefaultIndexKind.NULL) \n", + " \n", + "# 3. 
This will trigger the NullIndexError \n", + "flattened[\"Transcription\"] = flattened[\"GCS Blob\"].blob.audio_transcribe( \n", + " model_name=\"gemini-2.0-flash-001\", \n", + " verbose=True, \n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb new file mode 100644 index 0000000000..85202fc76e --- /dev/null +++ b/notebooks/test.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "data=pd.Series(np.random.randn(8),\n", + " index=[[\"a\",\"a\",\"a\",\"b\",\n", + " \"b\",\"b\",\"c\",\"c\"],\n", + " [1,2,3,1,2,3,1,2]])\n", + "data\n", + "bq_data = bpd.read_pandas(data)\n", + "print(bq_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.loc[\"a\":\"b\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bq_data.loc[\"a\": \"b\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job c7df462c-e617-4ca3-83a0-6d99f7494ad9 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n" + ] + } + ], + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "idx = bpd.Index(['Apple', 'Banana', 'Orange'])\n", + "print(idx.get_loc('Banana')) # Output: 1 " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "monotonic_index = bpd.Index(list('abbc'))\n", + "monotonic_index.get_loc('b') # Output: slice(1, 3, None)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job d6b32147-67cc-478d-80a4-ad1a0d6615bc is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job b143b4d9-70d2-4845-8d1c-aa1cb2f4e9ed is DONE. 68 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 False\n", + "1 True\n", + "2 False\n", + "3 True\n", + "dtype: boolean" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "non_monotonic_index = bpd.Index(list('abcb'))\n", + "non_monotonic_index.get_loc('b') # Expect array([False, True, False, True])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 9d531a74-26ec-4ede-97e6-f8fc25c00068 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:149: PreviewWarning: udf is in preview.\n", + " return global_session.with_default_session(\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/functions/_function_session.py:971: PreviewWarning: input_types=Series is in preview.\n", + " warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/functions/_utils.py:86: FunctionPackageVersionWarning: numpy, pandas, and pyarrow versions in the function execution\n", + "environment may not precisely match your local environment.\n", + " warnings.warn(msg, category=bfe.FunctionPackageVersionWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 0d58b789-d658-4dd6-ba9f-3f03fc77895a is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import bigframes.pandas as bpd\n", + "import pandas as pd\n", + "\n", + "bpd.options.bigquery.project = 'bigframes-dev' # project #: 1084210331973\n", + "bpd.options.bigquery.location = \"us\"\n", + "\n", + "@bpd.udf(\n", + " dataset='jialuo_test_us', name='test',\n", + " packages = ['pypdf[crypto]'],\n", + " )\n", + "def func(s: bpd.Series) -> bool:\n", + " return s['a'] + s['b'] > 0\n", + "\n", + "bdf = {'a': [0, 1, 2], 'b': [3, 4, 5]}\n", + "res = bdf.apply(func, axis=1)\n", + "print(res)\n", + "\n", + "res1 = bdf.where(func)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_trancription.ipynb b/notebooks/test_blob_trancription.ipynb new file mode 100644 index 0000000000..ea7a64fc0b --- /dev/null +++ b/notebooks/test_blob_trancription.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. 
Enable partial ordering mode\n", + "bpd.options.bigquery.ordering_mode = \"partial\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 1034294a-3478-4a47-8f3c-1db4b8aab29c is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7ff09bc9-bcd4-44bd-bed7-9a5ccc90d23a is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = bpd.from_glob_path(\"gs://bigframes_blob_test/audio/*\", name=\"audio\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job f7a6f58f-8bd2-4874-a2cd-76b53da0fb4b is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ca9c9951-4394-4919-b9c0-56d984a56514 is DONE. 364 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", + "`db_dtypes` is a preview feature and subject to change.\n", + " warnings.warn(msg, bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "df[\"transcribe_audio\"] = df[\"audio\"].blob.audio_transcribe(\n", + " connection=bq_connection)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:254: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 52332330-b306-4531-9e3b-fe2b7b9fb1ba is DONE. 487 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:254: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job b11dbe0b-58a7-4971-a93f-5a4a96838640 is DONE. 223 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", + " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
audiotranscribe_audio
0uri: gs://bigframes_blob_test/audio/LJ001-0010.wav, authorizer: bigframes-dev.us.bigframes-default-connectionNow, as all books, not primarily intended as picture books, consist principally of types composed to form letter press,
\n", + "

1 rows × 2 columns

\n", + "
[1 rows x 2 columns in total]" + ], + "text/plain": [ + " audio \\\n", + "{'uri': 'gs://bigframes_blob_test/audio/LJ001-0... \n", + "\n", + " transcribe_audio \n", + "Now, as all books, not primarily intended as pi... \n", + "\n", + "[1 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n# Example test\\nimport pandas as pd\\nimport bigframes.pandas as bpd # Assuming you use this for Series creation in tests\\n\\ntest_source_uris = bpd.Series([\\n \"gs://source-bucket/path/audio_001.wav\",\\n \"gs://source-bucket/another/path/to/track_beta.mp3?version=2\"\\n])\\nprint(test_source_uris)\\n# YOUR FILE_FOLDER_REGEX (ensure this matches what\\'s in your class scope)\\nTEST_REGEX = r\"gs://[^/]+/(?:.+/)?([^/?]+)(?:\\\\?.*)?\"\\nfixed_folder = \"gs://bigframes_blob_test/audio/chunked/\"\\n\\ntry:\\n expected_dest_bases = test_source_uris.str.replace(TEST_REGEX, rf\"{fixed_folder}\\x01\", regex=True)\\n print(\"Test Regex Output (destination bases):\")\\n print(expected_dest_bases.to_pandas())\\n # Expected:\\n # gs://bigframes_blob_test/audio/chunked/audio_001.wav\\n # gs://bigframes_blob_test/audio/chunked/track_beta.mp3\\nexcept Exception as e:\\n print(f\"Error in regex test: {e}\")\\n'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "'''\n", + "# Example test\n", + "import pandas as pd\n", + "import bigframes.pandas as bpd # Assuming you use this for Series creation in tests\n", + "\n", + "test_source_uris = bpd.Series([\n", + " \"gs://source-bucket/path/audio_001.wav\",\n", + " \"gs://source-bucket/another/path/to/track_beta.mp3?version=2\"\n", + "])\n", + "print(test_source_uris)\n", + "# YOUR FILE_FOLDER_REGEX (ensure this matches what's in your class scope)\n", + "TEST_REGEX = r\"gs://[^/]+/(?:.+/)?([^/?]+)(?:\\?.*)?\"\n", + "fixed_folder = \"gs://bigframes_blob_test/audio/chunked/\"\n", + "\n", + "try:\n", + " expected_dest_bases = test_source_uris.str.replace(TEST_REGEX, rf\"{fixed_folder}\\1\", regex=True)\n", + " print(\"Test Regex Output (destination bases):\")\n", + " print(expected_dest_bases.to_pandas())\n", + " # Expected:\n", + " # gs://bigframes_blob_test/audio/chunked/audio_001.wav\n", + " # gs://bigframes_blob_test/audio/chunked/track_beta.mp3\n", + "except Exception as e:\n", + " print(f\"Error in regex test: {e}\")\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_trans_blur_image.ipynb b/notebooks/test_blob_trans_blur_image.ipynb new file mode 100644 index 0000000000..e117db4475 --- /dev/null +++ b/notebooks/test_blob_trans_blur_image.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", + " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n", + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/bigquery_options.py:363: UserWarning: This is an advanced configuration option for directly setting\n", + "endpoints. Incorrect use may lead to unexpected behavior or system\n", + "instability. Proceed only if you fully understand its implications.\n", + " warnings.warn(msg)\n" + ] + } + ], + "source": [ + "import bigframes\n", + "bigframes.options.experiments.blob = True\n", + "bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", + " \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", + " \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 5da10498-1978-42fe-afea-d3b07c933daa is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "#import bigframes.pandas as bpd\n", + "df = bpd.DataFrame({\"s\": [\"gs://shuowei_bucket/images/images.jpeg\", \"gs://shuowei_bucket/images/tree.jpeg\"]})\n", + "df[\"src\"] = df[\"s\"].str.to_blob(connection=\"bigframes-dev.us.bigframes-default-connection\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 3b5d9f0e-0df7-46df-a44c-b07d9d0e96af is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " return method(self, *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 4bf61049-4507-4925-817e-cbb8b19d5721 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df[\"result\"] = df[\"src\"].blob.image_blur(ksize=[8,8],dst=\"gs://shuowei_bucket/image_blur_transformed2/\", connection=\"bigframes-dev.us.bigframes-default-connection\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ssrcresult
0gs://shuowei_bucket/images/images.jpeg
1gs://shuowei_bucket/images/tree.jpeg
\n", + "

2 rows × 3 columns

\n", + "
[2 rows x 3 columns in total]" + ], + "text/plain": [ + " s \\\n", + "0 gs://shuowei_bucket/images/images.jpeg \n", + "1 gs://shuowei_bucket/images/tree.jpeg \n", + "\n", + " src \\\n", + "0 {'uri': 'gs://shuowei_bucket/images/images.jpe... \n", + "1 {'uri': 'gs://shuowei_bucket/images/tree.jpeg'... \n", + "\n", + " result \n", + "0 {'uri': 'gs://shuowei_bucket/image_blur_transf... \n", + "1 {'uri': 'gs://shuowei_bucket/image_blur_transf... \n", + "\n", + "[2 rows x 3 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_trans_pdf_extract.ipynb b/notebooks/test_blob_trans_pdf_extract.ipynb new file mode 100644 index 0000000000..c05acfdd01 --- /dev/null +++ b/notebooks/test_blob_trans_pdf_extract.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:69: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject\n", + "to change in the future.\n", + " warnings.warn(msg, category=bfe.PreviewWarning)\n" + ] + } + ], + "source": [ + "import bigframes\n", + "bigframes.options.experiments.blob = True\n", + "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", + "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", + "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " return func(get_global_session(), *args, **kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "Query job d86dc475-9976-46bf-ac41-dd5a726d7093 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d1dacc17-89b9-4e2b-ac6c-10d550d967bb is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "extract_df = bpd.from_glob_path(\"gs://bigframes_blob_test/pdfs/*\", name=\"pdf\")\n", + "extract_df = extract_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job a42b5a1c-9f68-4257-a7c5-2680e7036993 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "extract_df[\"extract_text\"] = extract_df[\"pdf\"].blob.pdf_extract(\n", + " connection=bq_connection, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job a2306203-146a-4efc-b591-9928a5b6301a is DONE. 228 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 454cca1b-1c8c-493b-8d59-80d08f8a45ce is DONE. 228 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pdfextract_text
0uri: gs://bigframes_blob_test/pdfs/test-protected.pdf, authorizer: bigframes-dev.us.bigframes-default-connection{'status': 'File has not been decrypted', 'content': ''}
1uri: gs://bigframes_blob_test/pdfs/sample-local-pdf.pdf, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': 'Sample PDF This is a testing file. Some dummy messages are used for testing purposes. '}
\n", + "

2 rows × 2 columns

\n", + "
[2 rows x 2 columns in total]" + ], + "text/plain": [ + " pdf \\\n", + "0 {'uri': 'gs://bigframes_blob_test/pdfs/test-pr... \n", + "1 {'uri': 'gs://bigframes_blob_test/pdfs/sample-... \n", + "\n", + " extract_text \n", + "0 {'status': 'File has not been decrypted', 'con... \n", + "1 {'status': '', 'content': 'Sample PDF This... \n", + "\n", + "[2 rows x 2 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 609f1624-8658-4b6e-bf9f-da8cc860b97c is DONE. 228 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5e5306c3-2379-4e87-841a-dfe748afcc7f is DONE. 171 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 {'status': 'File has not been decrypted', 'con...\n", + "1 {'status': '', 'content': 'Sample PDF This...\n", + "Name: extract_text, dtype: struct[pyarrow]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_df[\"extract_text\"].explode()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ba554f28-afa3-47b0-a4fc-6128e86f7547 is DONE. 367 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 86d8b95f-aa96-4bb0-8a12-da8b5cf5ca20 is DONE. 171 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 {'status': 'File has not been decrypted', 'con...\n", + "1 {'status': '', 'content': 'Sample PDF This...\n", + "Name: extract_text, dtype: struct[pyarrow]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_df[\"extract_text\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0848e6d2-11b8-4908-b9c3-b7d5e0449b3a is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/operations/blob.py:736: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", + " \n" + ] + }, + { + "data": { + "text/html": [ + "Query job 9edb0395-9cfe-4e6b-8779-643080630518 is DONE. 228 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 \n", + "1 Sample PDF This is a testing file. 
So...\n", + "dtype: string" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_df[\"pdf\"].blob.pdf_chunk(connection=bq_connection, verbose=False).explode().to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 0fd80b90-55d2-4827-8547-2a5c36e810ec is DONE. 765 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9dc25ae3-fe67-4547-886a-612074660749 is DONE. 140 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 \n", + "1 Sample PDF This is a testing file. So...\n", + "Name: test, dtype: string" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "extract_df[\"test\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 \n", + "1 Sample PDF This is a testing file.\n", + "2 Some dummy messages are used for testing...\n", + "dtype: object\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "chunk_data = [\n", + " {\"status\": \"File has not been decrypted\", \"content\": []},\n", + " {\n", + " \"status\": \"\",\n", + " \"content\": [\"Sample PDF This is a testing file.\", \"Some dummy messages are used for testing purposes. \"],\n", + " },\n", + "]\n", + "\n", + "\n", + "content_values = []\n", + "for item in chunk_data:\n", + " if not item[\"content\"]:\n", + " content_values.append(pd.NA)\n", + " else:\n", + " content_values.extend(item[\"content\"])\n", + "\n", + "expected = pd.Series(content_values)\n", + "print(expected)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_transcribe.ipynb b/notebooks/test_blob_transcribe.ipynb new file mode 100644 index 0000000000..4db81494bd --- /dev/null +++ b/notebooks/test_blob_transcribe.ipynb @@ -0,0 +1,157 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes\n", + "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", + "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", + "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" 
+ ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-ka1m0ue4fptfmt9siejdd5lom7p39upa.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fpydata-google-auth.readthedocs.io%2Fen%2Flatest%2Foauth.html&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=dACVRKbeOGCf4ooMnkd63GGaampwXy&prompt=consent&access_type=offline\n" + ] + }, + { + "ename": "ValueError", + "evalue": "Please supply either code or authorization_response parameters.", + "output_type": "error", + "traceback": [ + "ValueError: Please supply either code or authorization_response parameters. [raised in requests_oauthlib OAuth2Session.fetch_token via pydata_google_auth while bpd.from_glob_path created the global session; full ANSI-escaped stack trace omitted]" + ] + } + ], + "source": [ + "chunks_df = bpd.from_glob_path(\"gs://garrettwu_bucket/pdfs/*\")\n", + "chunks_df.columns = [\"uri\"]\n", + "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "chunks_df[\"chunk_text\"] = chunks_df[\"uri\"].blob.pdf_chunk(\n", + "    connection=bq_connection, chunk_size=2000, overlap_size=200,\n", + "    max_batching_rows=1\n", + ")\n", + "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()\n", + "chunk_df_exploded.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = bpd.from_glob_path(\"gs://bigframes_blob_test/audio/*\", name=\"audio\")\n", + "\n", + "#df[\"audio\"] = \"gs://bigframes_blob_test/audio/LJ001-0010.wav\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# copy the file list a few times to get more audio rows\n", + "copies = [df] * 3\n", + "df = bpd.concat(copies, ignore_index=True)\n", + "df = df.cache()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# copy files again to scale the test up (e.g., toward 1,000,000 audio files)\n", + "#copies = [df] * 2 * 100\n", + "#df = bpd.concat(copies, ignore_index=True)\n", + "#df = df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", + "# gemini-2.5-pro-preview-05-20" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_transcribe_1M_short_audio.ipynb b/notebooks/test_blob_transcribe_1M_short_audio.ipynb new file mode 100644 index 0000000000..f494133f53 --- /dev/null +++ b/notebooks/test_blob_transcribe_1M_short_audio.ipynb @@ -0,0 
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n",
      "  warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "import bigframes\n",
    "bigframes.options.experiments.blob = True\n",
    "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n",
    "#    \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n",
    "#    \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n",
    "import bigframes.pandas as bpd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
      "  _global_session = bigframes.session.connect(\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "Query job 7d45f84a-01f6-413a-a8f5-04fe41cf60c7 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 79486ecb-ac6d-4e05-b348-809d4baa6da4 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = bpd.from_glob_path(\"gs://shuowei_bucket/audio/*\", name=\"audio\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job cc70ecf3-5095-4f3f-8c49-796dfc68af69 is DONE. 354.8 kB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# copy files again: 5 source files x 200 copies = 1,000 audio files\n",
    "copies = [df] * 2 * 100\n",
    "df = bpd.concat(copies, ignore_index=True)\n",
    "df = df.cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job d3b7a0c7-0359-4aa9-846d-f99250cbe37b is DONE. 96.0 kB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# copy files again: 1,000 rows x 200 copies = 200,000 audio files\n",
    "copies = [df] * 2 * 100\n",
    "df = bpd.concat(copies, ignore_index=True)\n",
    "df = df.cache()"
   ]
  },
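  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note on scale: each copy step multiplies the row count by 2 * 100 = 200, so 5 source files become 5 x 200 = 1,000 rows, and 1,000 x 200 = 200,000 rows -- matching the [200000 rows x 2 columns] shape reported below. Reaching a true 1,000,000 rows would need a 5x larger final multiplier."
   ]
  },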
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job 5ab7d996-8290-4526-954f-64e31a3f3d81 is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job fcfd94e4-9e14-46b0-a187-6cc65885d559 is DONE. 19.2 MB processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n",
      "`db_dtypes` is a preview feature and subject to change.\n",
      "  warnings.warn(msg, bfe.PreviewWarning)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "Load job 586c243f-92c4-43cd-871e-09add370f1c9 is DONE. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
    "df[\"text\"] = df[\"audio\"].blob.transcribe(df=df, audio_column=\"audio\", model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
    "# gemini-2.5-pro-preview-05-06"
   ]
  },
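  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check before eyeballing the full table: count how many of the 200,000 rows actually came back with transcribed text. This is a sketch; it assumes the verbose struct exposes `status` and `content` fields as in the preview below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: per-row success accounting for the verbose transcribe output.\n",
    "content = df[\"text\"].struct.field(\"content\")\n",
    "status = df[\"text\"].struct.field(\"status\")\n",
    "# Rows with a NULL content field produced no transcription.\n",
    "print(content.isnull().value_counts())\n",
    "# Non-empty status strings carry the per-row error messages, if any.\n",
    "print(status[status != \"\"].value_counts())"
   ]
  },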
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
audiotext
0uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
1uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
2uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
3uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"was rather interesting just to watch them gathering their materials and bouncing around. Yeah, they are what they call it kangaroo walk or or something. Really? Uh, something like that. They named it that. I don't know. I bet those men are going to get quite a reception when they get back to earth. Oh, yes. I'll be so glad when they land back now, but I think that's uh pretty well uh fact because they've landed so many safely now that I I feel relieved. Just getting off of the moon was the thing that was. Have they met with the um one that was circling? Yes, they've rendezvoused. So I understand. I that wasn't shown either, so I but so they say they have rendezvoused, so that's a matter of making the circles and then coming down. What do you sort of imagine for the future? Do you imagine them sending a I think they will. I think they will do some more exploring up there. Right. Positive, you know, to continue with this. Uh-huh. Because that was such a very small area when you think of it that they just gathered uh rocks and and the oh samples of soil and all and they did uh probe for some samples. And just what is going to come of that, I don't know. I'd be glad to read it.\"}
4uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Say this is entered customs. And uh the next step is to prepare a Mexican customs entry. Mexico call them pedimentos. It's really an appraisal sheet. It's it's a breakdown of the sheet. You use the same information. That's right. But here you give them all the information, for example, let me see, I found one right here. You give them all the information on on what the invoice contains. Here's one. On what the invoice contains, what uh what what the material is itself. If you hear, for example, this is some bottling equipment going over there and on the appraisal sheet, so you tell them who it's for, who made the invoice out of the states. This came from St. Louis, Missouri. Yeah. How much the freight was. And on the reverse of this of this thing, but the everything has to be marked. So complete identification. This identification number is that traffic number I spoke to you a little while ago. Number of boxes, what they contain, the tariff duty, the tariff number, the ad valorem rate, and what the actual duties would be paid. For example, here would be 996 pesos and 73 cents. And as you can see, this is a form that has already been approved by the appraiser. These appraisers are called Vistas. It's very different from the Vistas we have nowadays here.\"}
5uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"I'll tell you something, when the first revelation Joseph Smith received about marriage of any kind was that you couldn't get anywhere, you into the celestial kingdom. No man could get into the celestial kingdom without a wife, because he's only half there if he hasn't got a wife. And no woman, of course, could get there without a husband either, because she's only half there. Well, now at that time, the Mormons were very strict, uh, uh, religious people, you see. Sure. Sure. And, uh, thought that a man had to live clean all his life as well as a woman did in order to be worthy to be a man. Well, this is still true. Well, it's still true certainly. But I said, in that those circumstances, at that time, there were, uh, about three times as many women that joined the church and were faithful Latter-day Saints than there were men. When Joseph Smith comes out with the revelation that, uh, marriage was for eternity. And if if you're going to be separated from death do you part, then you can't get into the celestial kingdom at all to become as God is. Now, they got that doctrine. And that was, the prophet said that, they believed that. What about these here two-thirds of the women? Yeah, that's a good question. What is going to become of us then? It won't do us any good to go out here and marry one of these scallywags that spends his time drinking and boozing around and mining and maybe works in the mine a while and does all kind of skullduggery. That won't do us any good to marry that kind of a man. Besides, we don't have a chance to raise a respectable family with that kind of a man. So, what are we going to do about it now? What's going to become of us? And Joseph Smith always prayed about every problem that came to him. And there's where he got the revelation that if they live thoroughly clean and faithful, if they can understand the purity and sacredness of sex, so they don't desecrate it, then one man can have a number of wives and and permit them to raise a respectable, honorable family as well as have those wives throughout eternity, have their husband to take them into the celestial kingdom. Now, there was the origin of polygamy.\"}
6uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
7uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
8uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
9uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
10uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
11uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
12uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
13uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
14uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
15uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
16uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
17uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
18uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
19uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
20uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
21uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Uh, I hear that there's a Hawaiian custom of something called a Hukilau. Hukilau, Hukilau is a they have a rope and on the rope they have a net there. Mhm. They surround at a certain place and they drag the net. They surround it, you mean they carry the net out? Carry the net, uh-huh. Uh, how do they is there something to keep the net floating? Yeah, some they have those floaters uh floating. Mhm. Some they have a lead down at the bottom, then they drag that uh net in. What do the floats what are the floats made of? The float made of those uh some coconut uh dry coconut I guess. Oh, dry coconut. That's a good use for coconut. Uh, and that floats the top of the net. Top of the net. Now, the word lau means leaf, doesn't it? Lau, lau is a yeah, yeah, lau is a leaf. All right, well now the the where does the leaf come in in this Hukilau? Hukilau, the leaves, you know, they stay on at the top top of the rope. So that the when they pull, the fish going to not to jump over the rope. The ropes, the the uh the leaves keep the fish from jumping over to the Jump over. They scare they scare them away so they want to go underneath the water, see. When moment she go underneath the water there, they will go to the net. They have a pocket on the net. They have a pocket on the net. Now the bottom of the net then Bottom of the net they have a pocket in. Mhm. The the bottom must have some kind of things to make it sink. Sink is a lead, they have a lead. Oh, they have lead on it. I see. Then uh uh somebody goes out and puts the net out around. Yeah, they throw it with a boat. With a boat? Two two boats they goes out. Throw it all around. And then uh who pulls it in? Well, they have a lots of uh from the shore about 40, 50 guys they pull that rope up. Ah, I see. Yeah. And uh they must have a pretty big school of fish there. Yeah, once in a while they have a nice school school, right? And then they pull them all in and the fish don't are scared to jump out so they get caught. No, they all in up on the net already. Mhm. They get caught in the net. I suppose the top of the net comes in sooner than the bottom so they if they do try to go out there, that's pocket. Yeah, they have to they going to, you know, about six or seven guys is swimming on it around the net to they're watching how the fish going. Oh, I see. They they're chasing at the hole. Uh-huh. Well, about how much fish do they get out of a good haul of those? Once I saw up Waimea they caught about about a ton I guess, you know, those Akule. Mhm. Oh, they have a nice crew up there. What's Akule? Mm, the Akule some kind of, you know, that's Hawaiian used to call Akule, Akule. They have been calling it Aji in Japan. \"}
22uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
23uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
24uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
\n", + "

25 rows × 2 columns

\n", + "
[200000 rows x 2 columns in total]" + ], + "text/plain": [ + " audio \\\n", + "0 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "1 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "2 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "3 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "4 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "5 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "6 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "7 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "8 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "9 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "10 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "11 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "12 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "13 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "14 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "15 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "16 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "17 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "18 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "19 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "20 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "21 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "22 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "23 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "24 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "\n", + " text \n", + "0 {'status': '', 'content': None} \n", + "1 {'status': '', 'content': None} \n", + "2 {'status': '', 'content': None} \n", + "3 {'status': '', 'content': \"was rather interest... \n", + "4 {'status': '', 'content': \"Say this is entered... \n", + "5 {'status': '', 'content': \"I'll tell you somet... \n", + "6 {'status': '', 'content': None} \n", + "7 {'status': '', 'content': None} \n", + "8 {'status': '', 'content': None} \n", + "9 {'status': '', 'content': None} \n", + "10 {'status': '', 'content': None} \n", + "11 {'status': '', 'content': None} \n", + "12 {'status': '', 'content': None} \n", + "13 {'status': '', 'content': None} \n", + "14 {'status': '', 'content': None} \n", + "15 {'status': '', 'content': None} \n", + "16 {'status': '', 'content': None} \n", + "17 {'status': '', 'content': None} \n", + "18 {'status': '', 'content': None} \n", + "19 {'status': '', 'content': None} \n", + "20 {'status': '', 'content': None} \n", + "21 {'status': '', 'content': \"Uh, I hear that the... 
\n", + "22 {'status': '', 'content': None} \n", + "23 {'status': '', 'content': None} \n", + "24 {'status': '', 'content': None} \n", + "...\n", + "\n", + "[200000 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb b/notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb new file mode 100644 index 0000000000..94ecbccd24 --- /dev/null +++ b/notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb @@ -0,0 +1,345 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", + " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" + ] + } + ], + "source": [ + "import bigframes\n", + "bigframes.options.experiments.blob = True\n", + "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", + "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", + "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 111af7c8-313c-4884-b630-632a4b1ea5fe is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 3a680a1a-208b-4c9f-8637-eb048b0a651d is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = bpd.from_glob_path(\"gs://shuowei_bucket/audio/*\", name=\"audio\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job ce7097d7-b758-49d4-af1b-3963f44c7df3 is DONE. 354.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files again, now we have 1000 audio files\n", + "copies = [df] * 2 * 100\n", + "df = bpd.concat(copies, ignore_index=True)\n", + "df = df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job f94ca3c8-fc8b-4102-ada0-c0db8c4716e1 is DONE. 96.0 kB processed. 
Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# copy files again: 1,000 rows x 200 copies = 200,000 audio files\n",
    "copies = [df] * 2 * 100\n",
    "df = bpd.concat(copies, ignore_index=True)\n",
    "df = df.cache()"
   ]
  },
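  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The transcribe call below was interrupted by hand (KeyboardInterrupt), and BigFrames then requested cancellation of the running query. A hedged sketch of guarding a run like this up front: `bigframes.options.compute.maximum_bytes_billed` is the option referenced in the traceback below; the byte budget chosen here is purely illustrative."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: cap how much a single query may scan before launching 200k rows of transcription.\n",
    "import bigframes\n",
    "bigframes.options.compute.maximum_bytes_billed = 10 * 1024**3  # 10 GiB budget (illustrative)\n",
    "# Jobs that would exceed the budget now fail fast instead of running up cost."
   ]
  },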
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Query job 0a889fd5-0cc6-4f1d-8154-30fb00b394be is DONE. 0 Bytes processed. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Query job 853cc347-ae70-4d1a-aa20-c5801153217f is RUNNING. Open Job"
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requested cancellation for Query job 853cc347-ae70-4d1a-aa20-c5801153217f in location US...\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "---------------------------------------------------------------------------",
      "KeyboardInterrupt                         Traceback (most recent call last)",
      "Cell In[5], line 2\n      1 #bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n----> 2 df[\"text\"] = df[\"audio\"].blob.transcribe(df=df, audio_column=\"audio\", model_name=\"gemini-2.0-flash-001\", verbose=True)\n      3 # gemini-2.5-pro-preview-05-06\n",
      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/operations/blob.py:820, in BlobAccessor.transcribe(self, df, audio_column, model_name, prompt_text, temperature, output_schema, verbose)\n    817 model = llm.GeminiTextGenerator(model_name=model_name)\n    819 # transcribe audio using ML.GENERATE_TEXT\n--> 820 results = model.predict(\n    821     X=df_prompt,\n    822     prompt=[df_prompt[\"prompt\"], df_prompt[audio_column]],\n    823     temperature=temperature,\n    824     output_schema=output_schema,\n    825 )\n",
      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/ml/llm.py:738, in GeminiTextGenerator.predict(self, X, temperature, max_output_tokens, top_k, top_p, ground_with_google_search, max_retries, prompt, output_schema)\n--> 738 return self._predict_and_retry(\n    739     core.BqmlModel.generate_table_tvf,\n    740     X,\n    741     options=options,\n    742     max_retries=max_retries,\n    743 )\n",
      "File ~/src/python-bigquery-dataframes/bigframes/ml/base.py:266, in RetriableRemotePredictor._predict_and_retry(self, bqml_model_predict_tvf, X, options, max_retries)\n--> 266 df = bqml_model_predict_tvf.tvf(self._bqml_model, df_fail, options)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:202, in BqmlModel.generate_table(self, input_data, options)\n--> 202 return self._apply_ml_tvf(\n    203     input_data,\n    204     lambda source_sql: self._model_manipulation_sql_generator.ai_generate_table(\n    205         source_sql=source_sql,\n    206         struct_options=options,\n    207     ),\n    208 )\n",
      "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:88, in BqmlModel._apply_ml_tvf(self, input_data, apply_sql_tvf)\n     87 result_sql = apply_sql_tvf(input_sql)\n---> 88 df = self._session.read_gbq(result_sql, index_col=index_col_ids)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/session/__init__.py:445, in Session.read_gbq(self, query_or_table, index_col, columns, configuration, max_results, filters, use_cache, col_order, dry_run)\n--> 445 return self._loader.read_gbq_query(query_or_table, index_col=index_col, columns=columns, configuration=configuration, max_results=max_results, use_cache=use_cache, filters=filters, dry_run=dry_run)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:824, in GbqDataLoader.read_gbq_query(self, query, index_col, columns, configuration, max_results, use_cache, filters, dry_run, force_total_order)\n--> 824 destination, query_job = self._query_to_destination(query, cluster_candidates=[], configuration=configuration)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:906, in GbqDataLoader._query_to_destination(self, query, cluster_candidates, configuration, do_clustering)\n    905 job_config.labels[\"error_caught\"] = \"true\"\n--> 906 _, query_job = self._start_query(query, job_config=job_config, timeout=timeout)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:937, in GbqDataLoader._start_query(self, sql, job_config, timeout)\n--> 937 iterator, query_job = bf_io_bigquery.start_query_with_client(self._bqclient, sql, job_config=job_config, timeout=timeout)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:280, in start_query_with_client(bq_client, sql, job_config, location, project, timeout, metrics, query_with_job)\n--> 280 results_iterator = formatting_helpers.wait_for_query_job(query_job, progress_bar=opts.progress_bar)\n",
      "File ~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149, in wait_for_query_job(query_job, max_results, page_size, progress_bar)\n--> 149 query_result = query_job.result(max_results=max_results, page_size=page_size)\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1681, in QueryJob.result(self, page_size, max_results, retry, timeout, start_index, job_retry)\n-> 1681 while not is_job_done():\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293, in Retry.__call__.<locals>.retry_wrapped_func(*args, **kwargs)\n--> 293 return retry_target(target, self._predicate, sleep_generator, timeout=self._timeout, on_error=on_error)\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\n--> 144 result = target()\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1650, in QueryJob.result.<locals>.is_job_done()\n-> 1650 self._reload_query_results(retry=retry, **reload_query_results_kwargs)\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1448, in QueryJob._reload_query_results(self, retry, timeout, page_size)\n-> 1448 self._query_results = self._client._get_query_results(self.job_id, retry, project=self.project, timeout_ms=timeout_ms, location=self.location, timeout=transport_timeout, page_size=page_size)\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:2034, in Client._get_query_results(self, job_id, retry, project, timeout_ms, location, timeout, page_size)\n-> 2034 resource = self._call_api(retry, span_name=\"BigQuery.getQueryResults\", span_attributes=span_attributes, method=\"GET\", path=path, query_params=extra_params, timeout=timeout)\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:843, in Client._call_api(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\n--> 843 return call()\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293, in Retry.__call__.<locals>.retry_wrapped_func(*args, **kwargs)\n--> 293 return retry_target(\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\n--> 144 result = target()\n",
      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:482, in JSONConnection.api_request(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\n--> 482 response = self._make_request(
\u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 484\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 485\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontent_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontent_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 488\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_object\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_target_object\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 489\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 490\u001b[0m \u001b[43m \u001b[49m\u001b[43mextra_api_info\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_api_info\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 491\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m200\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m300\u001b[39m:\n\u001b[1;32m 494\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mfrom_http_response(response)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:341\u001b[0m, in \u001b[0;36mJSONConnection._make_request\u001b[0;34m(self, method, url, data, content_type, headers, target_object, timeout, extra_api_info)\u001b[0m\n\u001b[1;32m 338\u001b[0m headers[CLIENT_INFO_HEADER] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muser_agent\n\u001b[1;32m 339\u001b[0m headers[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUser-Agent\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muser_agent\n\u001b[0;32m--> 341\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_do_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 342\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget_object\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\n\u001b[1;32m 343\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:379\u001b[0m, in \u001b[0;36mJSONConnection._do_request\u001b[0;34m(self, method, url, headers, data, target_object, timeout)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_do_request\u001b[39m(\n\u001b[1;32m 346\u001b[0m 
\u001b[38;5;28mself\u001b[39m, method, url, headers, data, target_object, timeout\u001b[38;5;241m=\u001b[39m_DEFAULT_TIMEOUT\n\u001b[1;32m 347\u001b[0m ): \u001b[38;5;66;03m# pylint: disable=unused-argument\u001b[39;00m\n\u001b[1;32m 348\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Low-level helper: perform the actual API request over HTTP.\u001b[39;00m\n\u001b[1;32m 349\u001b[0m \n\u001b[1;32m 350\u001b[0m \u001b[38;5;124;03m Allows batch context managers to override and defer a request.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 377\u001b[0m \u001b[38;5;124;03m :returns: The HTTP response.\u001b[39;00m\n\u001b[1;32m 378\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 379\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mhttp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 380\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\n\u001b[1;32m 381\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/auth/transport/requests.py:537\u001b[0m, in \u001b[0;36mAuthorizedSession.request\u001b[0;34m(self, method, url, data, headers, max_allowed_time, timeout, **kwargs)\u001b[0m\n\u001b[1;32m 534\u001b[0m remaining_time \u001b[38;5;241m=\u001b[39m guard\u001b[38;5;241m.\u001b[39mremaining_timeout\n\u001b[1;32m 536\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m TimeoutGuard(remaining_time) \u001b[38;5;28;01mas\u001b[39;00m guard:\n\u001b[0;32m--> 537\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mAuthorizedSession\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 538\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 539\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 540\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 541\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest_headers\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 542\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 543\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 545\u001b[0m remaining_time \u001b[38;5;241m=\u001b[39m guard\u001b[38;5;241m.\u001b[39mremaining_timeout\n\u001b[1;32m 547\u001b[0m 
\u001b[38;5;66;03m# If the response indicated that the credentials needed to be\u001b[39;00m\n\u001b[1;32m 548\u001b[0m \u001b[38;5;66;03m# refreshed, then refresh the credentials and re-attempt the\u001b[39;00m\n\u001b[1;32m 549\u001b[0m \u001b[38;5;66;03m# request.\u001b[39;00m\n\u001b[1;32m 550\u001b[0m \u001b[38;5;66;03m# A stored token may expire between the time it is retrieved and\u001b[39;00m\n\u001b[1;32m 551\u001b[0m \u001b[38;5;66;03m# the time the request is made, so we may need to try twice.\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/requests/sessions.py:589\u001b[0m, in \u001b[0;36mSession.request\u001b[0;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[1;32m 584\u001b[0m send_kwargs \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 585\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m\"\u001b[39m: timeout,\n\u001b[1;32m 586\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mallow_redirects\u001b[39m\u001b[38;5;124m\"\u001b[39m: allow_redirects,\n\u001b[1;32m 587\u001b[0m }\n\u001b[1;32m 588\u001b[0m send_kwargs\u001b[38;5;241m.\u001b[39mupdate(settings)\n\u001b[0;32m--> 589\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mprep\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43msend_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 591\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m resp\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/requests/sessions.py:703\u001b[0m, in \u001b[0;36mSession.send\u001b[0;34m(self, request, **kwargs)\u001b[0m\n\u001b[1;32m 700\u001b[0m start \u001b[38;5;241m=\u001b[39m preferred_clock()\n\u001b[1;32m 702\u001b[0m \u001b[38;5;66;03m# Send the request\u001b[39;00m\n\u001b[0;32m--> 703\u001b[0m r \u001b[38;5;241m=\u001b[39m \u001b[43madapter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msend\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrequest\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[38;5;66;03m# Total elapsed time of the request (approximately)\u001b[39;00m\n\u001b[1;32m 706\u001b[0m elapsed \u001b[38;5;241m=\u001b[39m preferred_clock() \u001b[38;5;241m-\u001b[39m start\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/requests/adapters.py:667\u001b[0m, in \u001b[0;36mHTTPAdapter.send\u001b[0;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[1;32m 664\u001b[0m timeout \u001b[38;5;241m=\u001b[39m TimeoutSauce(connect\u001b[38;5;241m=\u001b[39mtimeout, read\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 666\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 667\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43murlopen\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 669\u001b[0m \u001b[43m 
\u001b[49m\u001b[43murl\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 671\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrequest\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 672\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 673\u001b[0m \u001b[43m \u001b[49m\u001b[43massert_same_host\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_retries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 677\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 678\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 679\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 681\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (ProtocolError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m err:\n\u001b[1;32m 682\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m(err, request\u001b[38;5;241m=\u001b[39mrequest)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/urllib3/connectionpool.py:787\u001b[0m, in \u001b[0;36mHTTPConnectionPool.urlopen\u001b[0;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, preload_content, decode_content, **response_kw)\u001b[0m\n\u001b[1;32m 784\u001b[0m response_conn \u001b[38;5;241m=\u001b[39m conn \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m release_conn \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 786\u001b[0m \u001b[38;5;66;03m# Make the request on the HTTPConnection object\u001b[39;00m\n\u001b[0;32m--> 787\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_request\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 788\u001b[0m \u001b[43m \u001b[49m\u001b[43mconn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 789\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 790\u001b[0m \u001b[43m \u001b[49m\u001b[43murl\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 791\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 792\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mbody\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbody\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 793\u001b[0m \u001b[43m \u001b[49m\u001b[43mheaders\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mheaders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 794\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunked\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunked\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 795\u001b[0m \u001b[43m \u001b[49m\u001b[43mretries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretries\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 796\u001b[0m \u001b[43m \u001b[49m\u001b[43mresponse_conn\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresponse_conn\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 797\u001b[0m \u001b[43m \u001b[49m\u001b[43mpreload_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpreload_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 798\u001b[0m \u001b[43m \u001b[49m\u001b[43mdecode_content\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdecode_content\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 799\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mresponse_kw\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 800\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 802\u001b[0m \u001b[38;5;66;03m# Everything went great!\u001b[39;00m\n\u001b[1;32m 803\u001b[0m clean_exit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/urllib3/connectionpool.py:534\u001b[0m, in \u001b[0;36mHTTPConnectionPool._make_request\u001b[0;34m(self, conn, method, url, body, headers, retries, timeout, chunked, response_conn, preload_content, decode_content, enforce_content_length)\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[38;5;66;03m# Receive the response from the server\u001b[39;00m\n\u001b[1;32m 533\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 534\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43mconn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (BaseSSLError, \u001b[38;5;167;01mOSError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 536\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_raise_timeout(err\u001b[38;5;241m=\u001b[39me, url\u001b[38;5;241m=\u001b[39murl, timeout_value\u001b[38;5;241m=\u001b[39mread_timeout)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/urllib3/connection.py:516\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 513\u001b[0m _shutdown \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msock, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mshutdown\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 515\u001b[0m \u001b[38;5;66;03m# Get the response from http.client.HTTPConnection\u001b[39;00m\n\u001b[0;32m--> 516\u001b[0m httplib_response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgetresponse\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 519\u001b[0m 
assert_header_parsing(httplib_response\u001b[38;5;241m.\u001b[39mmsg)\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.15/lib/python3.10/http/client.py:1375\u001b[0m, in \u001b[0;36mHTTPConnection.getresponse\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1373\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1374\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m-> 1375\u001b[0m \u001b[43mresponse\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbegin\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1376\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mConnectionError\u001b[39;00m:\n\u001b[1;32m 1377\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.15/lib/python3.10/http/client.py:318\u001b[0m, in \u001b[0;36mHTTPResponse.begin\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[38;5;66;03m# read until we get a non-100 response\u001b[39;00m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 318\u001b[0m version, status, reason \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_read_status\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 319\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m status \u001b[38;5;241m!=\u001b[39m CONTINUE:\n\u001b[1;32m 320\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.15/lib/python3.10/http/client.py:279\u001b[0m, in \u001b[0;36mHTTPResponse._read_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 278\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_read_status\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[0;32m--> 279\u001b[0m line \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreadline\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_MAXLINE\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124miso-8859-1\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(line) \u001b[38;5;241m>\u001b[39m _MAXLINE:\n\u001b[1;32m 281\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m LineTooLong(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mstatus line\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.15/lib/python3.10/socket.py:717\u001b[0m, in \u001b[0;36mSocketIO.readinto\u001b[0;34m(self, b)\u001b[0m\n\u001b[1;32m 715\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 716\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 717\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sock\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrecv_into\u001b[49m\u001b[43m(\u001b[49m\u001b[43mb\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m timeout:\n\u001b[1;32m 719\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_timeout_occurred \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n", + "File \u001b[0;32m~/.pyenv/versions/3.10.15/lib/python3.10/ssl.py:1307\u001b[0m, in 
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
+ ]
+ }
+ ],
+ "source": [
+ "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
+ "# audio_transcribe() returns one struct per row; with verbose=True each result carries 'status' and 'content'.\n",
+ "df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
+ "# gemini-2.5-pro-preview-05-06"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Query job ed3d84ac-2e1b-42ab-b867-58560e1a3167 is DONE. 8.8 kB processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Query job a65b6550-f250-4052-92e2-492ab58da692 is DONE. 8.8 kB processed. Open Job"
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "[HTML table markup condensed; DataFrame preview with columns: audio | text]\n",
+ "
0uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"I'll tell you something, when the first revelation Joseph Smith received about marriage of any kind was that you couldn't get anywhere, you into the celestial kingdom. No man could get into the celestial kingdom without a wife, because he's only half there if he hasn't got a wife. And no woman, of course, could get there without a husband either, because she's only half there. Well, now at that time, the Mormons were very strict, uh, uh, religious people, you see. Sure. Sure. And, uh, thought that a man had to live clean all his life as well as a woman did in order to be worthy to be a man. Well, this is still true. Well, it's still true certainly. But I said, in that those circumstances, at that time, there were, uh, about three times as many women that joined the church and were faithful Latter-day Saints than there were men. When Joseph Smith comes out with the revelation that, uh, marriage was for eternity. And if if you're going to be separated from death do you part, then you can't get into the celestial kingdom at all to become as God is. Now, they got that doctrine. And that was, the prophet said that, they believed that. What about these here two-thirds of the women? Yeah, that's a good question. What is going to become of us then? It won't do us any good to go out here and marry one of these scallywags that spends his time drinking and boozing around and mining and maybe works in the mine a while and does all kind of skullduggery. That won't do us any good to marry that kind of a man. Besides, we don't have a chance to raise a respectable family with that kind of a man. So, what are we going to do about it now? What's going to become of us? And Joseph Smith always prayed about every problem that came to him. And there's where he got the revelation that if they live thoroughly clean and faithful, if they can understand the purity and sacredness of sex, so they don't desecrate it, then one man can have a number of wives and and permit them to raise a respectable, honorable family as well as have those wives throughout eternity, have their husband to take them into the celestial kingdom. Now, there was the origin of polygamy.\"}
1uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Uh, I hear that there's a Hawaiian custom of something called a hukilau. Hukilau. Hukilau is they have a rope and on the rope they have a net there. Mhm. They surround it a certain place and they drag the net. They surround it, you mean they carry the net out? Carry the net, uh-huh. Uh, how do they, is there something to keep the net floating? Yeah, some they have those floaters uh floating. Mhm. Some they have a lead down on the bottom, then they drag that uh net in. What do the floats, what are the floats made of? The float made of those uh some coconut uh dry coconut, I guess. Oh, dry coconut. Yeah. That's a good use for coconut. Uh, and that floats the top of the net. Top of the net. Now, the word lau means leaf, doesn't it? Lau, lau is a yeah, yeah, lau is a leaf. All right. Well now, the the where does the leaf come in in this hukilau? Hukilau, the leaves, you know, they stay on the top, top of the rope. So that the when they pull, the fish going to not to jump over the rope. The ropes, the the uh the leaves keep the fish from jumping over to the scared, they scared them away. They don't want to go underneath the water, see. When some moment she go underneath the water there, they will go to the net. They have a pocket on the net. They have a pocket on the net. Now the bottom of the net then. Bottom of the net they have a pocket in. Mhm. The water must have some kind of things to make it sink. Sink is a lead, they have a lead. Oh, they have lead on it. I see. Then uh uh somebody goes out and puts the net out around. Yeah, they tow it with a boat. With a boat? Two, two boats they goes out. Tow it all around. And then uh who pulls it in? Well, they have a lots of uh from the shore about 40, 50 guys they pull that rope up. Ah, I see. Yeah. And uh they must have a pretty big school of fish there. Yeah, once in a while they have a nice school, right? And then they pull them all in and the fish don't are scared to jump out so they get caught. No, they all in up on the net already. Mhm. They get caught in the net. I suppose the top of the net comes in sooner than the bottom so they if they do try to go out there, that's pocket. Yeah, they have to take on uh, you know, about six or seven guys is swimming on it around the net to they're watching how the fish going. Oh, I see. They they're chasing at the hole. Uh-huh. Well, about how much fish do they get out of a good haul of those? Once I saw up Waimea, they caught about about a ton, I guess, you know, those akule. Mhm. Oh, they have a nice crew up there. What's akule? Mm, the akule some kind of, you know, that's Hawaiian used to call akule, akule. They have been calling it Aji in Japan.\"}
2uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"the soapstone carvings are those done, have you done any of those? Yes. Yeah. It's pretty much like ivory carving. Only it takes more care. What? To work on soapstone. Why? Why is it? Because it is brittle. Oh, it is. And very soft. Mhm. Uh, you you can hack on ivory, but you can't do that on soapstone. Chip too easily. Uh-huh. But then you have to use your files on those. Yes. And then once you've done the filing and so on, how do you smooth it down? Uh, we I use uh fine file. Mhm. To finish it up and then use uh sandpaper or emery cloth. Mhm. I think you said that um quite often the thing that is carved is determined by the shape of the piece of stone that one starts carving. Yes, yeah. Sometimes uh uh an ivory carver or a soapstone carver will take advantage of the shape of the stone. Mhm. And uh try to visualize what it'd look like. Uh, maybe a polar bear or or Mhm. He makes it just the way it is.\"}
3uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"was rather interesting just to watch them gathering their materials and bouncing around. Yeah, they they are what they call it kangaroo walk or or something. Really? Uh, something like that. They named it that. I don't know. I bet those men are going to get quite a reception when they get back to earth. Oh, yes. I'll be so glad when they land back now, but I think that's uh pretty well uh fact because they've landed so many safely now that I I feel really relieved. Just getting off of the moon was the thing that was. Have they met with the um one that was circling? Yes, they've rendezvoused. So I understand. I that wasn't shown either, so I but uh they say they have rendezvoused, so that's a matter of making the circles and then coming down. What do you sort of imagine for the future? Do you imagine them sending a I think they will. I think they will will do some more exploring up there. Right. Positive, you know, to continue with this. Uh-huh. Because that was such a very small area when you think of it that they just gathered uh rocks and and uh oh samples of soil and all and they did uh probe for some samples. And just what is going to come of that, I don't know. I'd be glad to read it.\"}
4uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Say this is entered customs. And uh the next step is to prepare a Mexican customs entry. Mexico call them pedimentos. It's really an appraisal sheet. It's it's a breakdown of the sheet. You use the same information. That's right. But here you give them all the information, for example, let me see, I found one right here. You give them all the information on on what the invoice contains. Here's one. On what the invoice contains, what uh what what the material is itself. If you hear, for example, this is some bottling equipment going over there and on the appraisal sheet, so you tell them who it's for, who made the invoice out of the states. This came from St. Louis, Missouri. Yeah. How much the freight was. And on the reverse of this of this thing, but the everything has to be marked. So complete identification. This identification number is that traffic number I spoke to you a little while ago. Number of boxes, what they contain, the tariff duty, the tariff number, the ad valorem rate, and what the actual duties would be paid. For example, here would be 996 pesos and 73 cents. And as you can see, this is a form that has already been approved by the appraiser. These appraisers are called Vistas. It's very different from the Vistas we have nowadays here.\"}
\n", + "

5 rows × 2 columns

\n", + "
[5 rows x 2 columns in total]" + ], + "text/plain": [ + " audio \\\n", + "0 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "1 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "2 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "3 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "4 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "\n", + " text \n", + "0 {'status': '', 'content': \"I'll tell you somet... \n", + "1 {'status': '', 'content': \"Uh, I hear that the... \n", + "2 {'status': '', 'content': \"the soapstone carvi... \n", + "3 {'status': '', 'content': \"was rather interest... \n", + "4 {'status': '', 'content': \"Say this is entered... \n", + "\n", + "[5 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_transcribe_long_audio.ipynb b/notebooks/test_blob_transcribe_long_audio.ipynb new file mode 100644 index 0000000000..7292981d7d --- /dev/null +++ b/notebooks/test_blob_transcribe_long_audio.ipynb @@ -0,0 +1,315 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", + " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" + ] + } + ], + "source": [ + "import bigframes\n", + "bigframes.options.experiments.blob = True\n", + "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", + "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", + "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", + "import bigframes.pandas as bpd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", + " _global_session = bigframes.session.connect(\n" + ] + }, + { + "data": { + "text/html": [ + "Query job 5bbcf8fc-1e44-46c1-84b8-0629383cc1e7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7f3923d3-999f-45d3-9b86-dc8595de53ac is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df = bpd.from_glob_path(\"gs://shuowei_bucket/long_audio/*\", name=\"audio\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 05eae664-16d7-4d12-a4f0-94ffcd81dde6 is DONE. 533.0 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files again, now we have 1000 audio files\n", + "copies = [df] * 2 * 100\n", + "df = bpd.concat(copies, ignore_index=True)\n", + "df = df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job eb3e5b47-54c5-4ec4-9edf-8ef0b2d0629d is DONE. 183.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# copy files again, now we have 1,000,000 audio files\n", + "copies = [df] * 2 * 100\n", + "df = bpd.concat(copies, ignore_index=True)\n", + "df = df.cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 42a64933-a2dc-4b59-9198-2e746f7fa38f is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 475b4580-c2ae-4c22-8cb6-0aa06f419c69 is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job b1947d40-947f-4693-bf90-24c48564df5d is RUNNING. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "BadRequest", + "evalue": "400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/b1947d40-947f-4693-bf90-24c48564df5d?maxResults=0&location=US&prettyPrint=false: Operation timed out after 6.0 hours. Consider reducing the amount of work performed by your operation so that it can complete within this limit.\n\nLocation: US\nJob ID: b1947d40-947f-4693-bf90-24c48564df5d\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0. [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[TIMEOUT] errorProto=code: \"TIMEOUT\"\\nargument: \"Operation timed out after 6.0 hours. 
Consider reducing the amount of work performed by your operation so that it can complete within this limit.\"\\n[... server-side stack trace (com.google.cloud.helix / dremel internals) condensed ...]'}]",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31mBadRequest\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 2\u001b[0m: df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)",
+      "[... traceback condensed: core/log_adapter.py method_logger -> operations/blob.py audio_transcribe -> ml/llm.py GeminiTextGenerator.predict -> ml/base.py RetriableRemotePredictor._predict_and_retry -> ml/core.py BqmlModel.generate_text ...]",
+      "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/ml/core.py:171\u001b[0m, in \u001b[0;36mBqmlModel.generate_text\u001b[0;34m(self, input_data, options)\u001b[0m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_apply_ml_tvf\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_data\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43msource_sql\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_model_manipulation_sql_generator\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mml_generate_text\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_sql\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msource_sql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 175\u001b[0m \u001b[43m \u001b[49m\u001b[43mstruct_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 177\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/ml/core.py:88\u001b[0m, in \u001b[0;36mBqmlModel._apply_ml_tvf\u001b[0;34m(self, input_data, apply_sql_tvf)\u001b[0m\n\u001b[1;32m 83\u001b[0m input_sql, index_col_ids, index_labels \u001b[38;5;241m=\u001b[39m input_data\u001b[38;5;241m.\u001b[39m_to_sql_query(\n\u001b[1;32m 84\u001b[0m include_index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 85\u001b[0m )\n\u001b[1;32m 87\u001b[0m result_sql \u001b[38;5;241m=\u001b[39m apply_sql_tvf(input_sql)\n\u001b[0;32m---> 88\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_gbq\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresult_sql\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex_col_ids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 89\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m df\u001b[38;5;241m.\u001b[39m_has_index:\n\u001b[1;32m 90\u001b[0m df\u001b[38;5;241m.\u001b[39mindex\u001b[38;5;241m.\u001b[39mnames \u001b[38;5;241m=\u001b[39m index_labels\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient 
through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/__init__.py:446\u001b[0m, in \u001b[0;36mSession.read_gbq\u001b[0;34m(self, query_or_table, index_col, columns, configuration, max_results, filters, use_cache, col_order, dry_run)\u001b[0m\n\u001b[1;32m 443\u001b[0m columns \u001b[38;5;241m=\u001b[39m col_order\n\u001b[1;32m 445\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m bf_io_bigquery\u001b[38;5;241m.\u001b[39mis_query(query_or_table):\n\u001b[0;32m--> 446\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_loader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_gbq_query\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore # for dry_run overload\u001b[39;49;00m\n\u001b[1;32m 447\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_or_table\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 448\u001b[0m \u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mindex_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 449\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 450\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfiguration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfiguration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 451\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 452\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 453\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 454\u001b[0m \u001b[43m \u001b[49m\u001b[43mdry_run\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdry_run\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 455\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 456\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 457\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m configuration \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/loader.py:837\u001b[0m, in \u001b[0;36mGbqDataLoader.read_gbq_query\u001b[0;34m(self, query, index_col, columns, configuration, max_results, use_cache, filters, dry_run, force_total_order, allow_large_results)\u001b[0m\n\u001b[1;32m 834\u001b[0m \u001b[38;5;66;03m# TODO(b/421161077): If an explicit destination table is set in\u001b[39;00m\n\u001b[1;32m 835\u001b[0m \u001b[38;5;66;03m# configuration, should we respect that setting?\u001b[39;00m\n\u001b[1;32m 836\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m allow_large_results:\n\u001b[0;32m--> 837\u001b[0m destination, query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_query_to_destination\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 838\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
839\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# No cluster candidates as user query might not be clusterable\u001b[39;49;00m\n\u001b[1;32m 840\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# (eg because of ORDER BY clause)\u001b[39;49;00m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_candidates\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 842\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfiguration\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconfiguration\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 844\u001b[0m query_job_for_metrics \u001b[38;5;241m=\u001b[39m query_job\n\u001b[1;32m 845\u001b[0m rows \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/loader.py:972\u001b[0m, in \u001b[0;36mGbqDataLoader._query_to_destination\u001b[0;34m(self, query, cluster_candidates, configuration, do_clustering)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mdestination, query_job\n\u001b[1;32m 967\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m google\u001b[38;5;241m.\u001b[39mapi_core\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mBadRequest:\n\u001b[1;32m 968\u001b[0m \u001b[38;5;66;03m# Some SELECT statements still aren't compatible with cluster\u001b[39;00m\n\u001b[1;32m 969\u001b[0m \u001b[38;5;66;03m# tables as the destination. For example, if the query has a\u001b[39;00m\n\u001b[1;32m 970\u001b[0m \u001b[38;5;66;03m# top-level ORDER BY, this conflicts with our ability to cluster\u001b[39;00m\n\u001b[1;32m 971\u001b[0m \u001b[38;5;66;03m# the table by the index column(s).\u001b[39;00m\n\u001b[0;32m--> 972\u001b[0m query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_start_query_with_job\u001b[49m\u001b[43m(\u001b[49m\u001b[43mquery\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 973\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mdestination, query_job\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/loader.py:1027\u001b[0m, in \u001b[0;36mGbqDataLoader._start_query_with_job\u001b[0;34m(self, sql, job_config, timeout)\u001b[0m\n\u001b[1;32m 1021\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1022\u001b[0m \u001b[38;5;124;03mStarts BigQuery query job and waits for results.\u001b[39;00m\n\u001b[1;32m 1023\u001b[0m \n\u001b[1;32m 1024\u001b[0m \u001b[38;5;124;03mDo not execute dataframe through this API, instead use the executor.\u001b[39;00m\n\u001b[1;32m 1025\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1026\u001b[0m job_config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_job_config(job_config)\n\u001b[0;32m-> 1027\u001b[0m _, query_job \u001b[38;5;241m=\u001b[39m \u001b[43mbf_io_bigquery\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1028\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_bqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1029\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1030\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1031\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1032\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1033\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1034\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1035\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_with_job\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1036\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1037\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m query_job\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, timeout, metrics, query_with_job)\u001b[0m\n\u001b[1;32m 312\u001b[0m opts \u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m opts\u001b[38;5;241m.\u001b[39mprogress_bar \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mconfiguration\u001b[38;5;241m.\u001b[39mdry_run:\n\u001b[0;32m--> 314\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m \u001b[43mformatting_helpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_query_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 315\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_job\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 316\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mopts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 317\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 319\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149\u001b[0m, in \u001b[0;36mwait_for_query_job\u001b[0;34m(query_job, max_results, page_size, progress_bar)\u001b[0m\n\u001b[1;32m 147\u001b[0m loading_bar \u001b[38;5;241m=\u001b[39m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job))\n\u001b[1;32m 148\u001b[0m display\u001b[38;5;241m.\u001b[39mdisplay(loading_bar, display_id\u001b[38;5;241m=\u001b[39mdisplay_id)\n\u001b[0;32m--> 149\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[43mquery_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\n\u001b[1;32m 151\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m query_job\u001b[38;5;241m.\u001b[39mreload()\n\u001b[1;32m 153\u001b[0m display\u001b[38;5;241m.\u001b[39mupdate_display(\n\u001b[1;32m 154\u001b[0m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job)),\n\u001b[1;32m 155\u001b[0m display_id\u001b[38;5;241m=\u001b[39mdisplay_id,\n\u001b[1;32m 156\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1681\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1676\u001b[0m remaining_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1678\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining_timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1679\u001b[0m \u001b[38;5;66;03m# Since is_job_done() calls jobs.getQueryResults, which is a\u001b[39;00m\n\u001b[1;32m 1680\u001b[0m \u001b[38;5;66;03m# long-running API, don't delay the next request at all.\u001b[39;00m\n\u001b[0;32m-> 1681\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mis_job_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1682\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 1683\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Use a monotonic clock since we don't actually care about\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;66;03m# daylight savings or similar, just the elapsed time.\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m 
sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1650\u001b[0m, in \u001b[0;36mQueryJob.result..is_job_done\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1644\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 1646\u001b[0m \u001b[38;5;66;03m# Call jobs.getQueryResults with max results set to 0 just to\u001b[39;00m\n\u001b[1;32m 1647\u001b[0m \u001b[38;5;66;03m# wait for the query to finish. Unlike most methods,\u001b[39;00m\n\u001b[1;32m 1648\u001b[0m \u001b[38;5;66;03m# jobs.getQueryResults hangs as long as it can to ensure we\u001b[39;00m\n\u001b[1;32m 1649\u001b[0m \u001b[38;5;66;03m# know when the query has finished as soon as possible.\u001b[39;00m\n\u001b[0;32m-> 1650\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reload_query_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mretry\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mreload_query_results_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1652\u001b[0m \u001b[38;5;66;03m# Even if the query is finished now according to\u001b[39;00m\n\u001b[1;32m 1653\u001b[0m \u001b[38;5;66;03m# jobs.getQueryResults, we'll want to reload the job status if\u001b[39;00m\n\u001b[1;32m 1654\u001b[0m \u001b[38;5;66;03m# it's not already DONE.\u001b[39;00m\n\u001b[1;32m 1655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1448\u001b[0m, in \u001b[0;36mQueryJob._reload_query_results\u001b[0;34m(self, retry, timeout, page_size)\u001b[0m\n\u001b[1;32m 1445\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(transport_timeout, (\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mint\u001b[39m)):\n\u001b[1;32m 1446\u001b[0m transport_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1448\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_query_results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_query_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1449\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjob_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1450\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1451\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1452\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtimeout_ms\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_ms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1453\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1454\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtransport_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1455\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1456\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:2034\u001b[0m, in \u001b[0;36mClient._get_query_results\u001b[0;34m(self, job_id, retry, project, timeout_ms, location, timeout, page_size)\u001b[0m\n\u001b[1;32m 2030\u001b[0m \u001b[38;5;66;03m# This call is typically made in a polling loop that checks whether the\u001b[39;00m\n\u001b[1;32m 2031\u001b[0m \u001b[38;5;66;03m# job is complete (from QueryJob.done(), called ultimately from\u001b[39;00m\n\u001b[1;32m 2032\u001b[0m \u001b[38;5;66;03m# QueryJob.result()). So we don't need to poll here.\u001b[39;00m\n\u001b[1;32m 2033\u001b[0m span_attributes \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m\"\u001b[39m: path}\n\u001b[0;32m-> 2034\u001b[0m resource \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_api\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2035\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2036\u001b[0m \u001b[43m \u001b[49m\u001b[43mspan_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mBigQuery.getQueryResults\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2037\u001b[0m \u001b[43m \u001b[49m\u001b[43mspan_attributes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspan_attributes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2038\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2039\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2040\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2041\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2042\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2043\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _QueryResults\u001b[38;5;241m.\u001b[39mfrom_api_repr(resource)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:843\u001b[0m, in \u001b[0;36mClient._call_api\u001b[0;34m(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\u001b[0m\n\u001b[1;32m 839\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m span_name \u001b[38;5;129;01mis\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 840\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m create_span(\n\u001b[1;32m 841\u001b[0m name\u001b[38;5;241m=\u001b[39mspan_name, attributes\u001b[38;5;241m=\u001b[39mspan_attributes, client\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m, job_ref\u001b[38;5;241m=\u001b[39mjob_ref\n\u001b[1;32m 842\u001b[0m ):\n\u001b[0;32m--> 843\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 845\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m call()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m 
\u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494\u001b[0m, in \u001b[0;36mJSONConnection.api_request\u001b[0;34m(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\u001b[0m\n\u001b[1;32m 482\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[1;32m 483\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 484\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 490\u001b[0m extra_api_info\u001b[38;5;241m=\u001b[39mextra_api_info,\n\u001b[1;32m 491\u001b[0m )\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m200\u001b[39m 
\u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m300\u001b[39m:\n\u001b[0;32m--> 494\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mfrom_http_response(response)\n\u001b[1;32m 496\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m expect_json \u001b[38;5;129;01mand\u001b[39;00m response\u001b[38;5;241m.\u001b[39mcontent:\n\u001b[1;32m 497\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\u001b[38;5;241m.\u001b[39mjson()\n", + "\u001b[0;31mBadRequest\u001b[0m: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/b1947d40-947f-4693-bf90-24c48564df5d?maxResults=0&location=US&prettyPrint=false: Operation timed out after 6.0 hours. Consider reducing the amount of work performed by your operation so that it can complete within this limit.\n\nLocation: US\nJob ID: b1947d40-947f-4693-bf90-24c48564df5d\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0. [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[TIMEOUT] errorProto=code: \"TIMEOUT\"\\nargument: \"Operation timed out after 6.0 hours. Consider reducing the amount of work performed by your operation so that it can complete within this limit.\"\\n\\n\\tat com.google.cloud.helix.common.Exceptions$Public.timeout(Exceptions.java:958)\\n\\tat com.google.cloud.helix.server.job.DremelErrorUtil.createHelixErrorForDeadlineExceeded(DremelErrorUtil.java:75)\\n\\tat com.google.cloud.helix.server.job.DremelErrorUtil.createHelixErrorFromDremelRpcException(DremelErrorUtil.java:61)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:784)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:696)\\n\\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:128)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1876)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2930)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2854)\\n\\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\\n\\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:900)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:892)\\n\\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1319)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1210)\\n\\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\\n\\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:1999)\\n'}]" + ] + } + ], + "source": [ + "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", + "df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", + "# gemini-2.5-pro-preview-05-06" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "Query job 310b7d0c-1369-4f83-b3ab-0627540e8c66 is DONE. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "NotFound", + "evalue": "404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 310b7d0c-1369-4f83-b3ab-0627540e8c66\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotFound\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/core/formatters.py:770\u001b[0m, in \u001b[0;36mPlainTextFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 763\u001b[0m stream \u001b[38;5;241m=\u001b[39m StringIO()\n\u001b[1;32m 764\u001b[0m printer \u001b[38;5;241m=\u001b[39m pretty\u001b[38;5;241m.\u001b[39mRepresentationPrinter(stream, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose,\n\u001b[1;32m 765\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_width, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnewline,\n\u001b[1;32m 766\u001b[0m max_seq_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_seq_length,\n\u001b[1;32m 767\u001b[0m singleton_pprinters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msingleton_printers,\n\u001b[1;32m 768\u001b[0m type_pprinters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtype_printers,\n\u001b[1;32m 769\u001b[0m deferred_pprinters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdeferred_printers)\n\u001b[0;32m--> 770\u001b[0m \u001b[43mprinter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpretty\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 771\u001b[0m printer\u001b[38;5;241m.\u001b[39mflush()\n\u001b[1;32m 772\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m stream\u001b[38;5;241m.\u001b[39mgetvalue()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/lib/pretty.py:419\u001b[0m, in \u001b[0;36mRepresentationPrinter.pretty\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m meth(obj, \u001b[38;5;28mself\u001b[39m, cycle)\n\u001b[1;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mobject\u001b[39m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# check if cls defines __repr__\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(_safe_getattr(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__repr__\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 418\u001b[0m ):\n\u001b[0;32m--> 419\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_repr_pprint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcycle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _default_pprint(obj, \u001b[38;5;28mself\u001b[39m, cycle)\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/lib/pretty.py:794\u001b[0m, in \u001b[0;36m_repr_pprint\u001b[0;34m(obj, p, cycle)\u001b[0m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"A pprint that just redirects to the normal repr function.\"\"\"\u001b[39;00m\n\u001b[1;32m 793\u001b[0m \u001b[38;5;66;03m# Find newlines and replace them with p.break_()\u001b[39;00m\n\u001b[0;32m--> 794\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mrepr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 795\u001b[0m lines \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39msplitlines()\n\u001b[1;32m 796\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m p\u001b[38;5;241m.\u001b[39mgroup():\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/dataframe.py:742\u001b[0m, in \u001b[0;36mDataFrame.__repr__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatter\u001b[38;5;241m.\u001b[39mrepr_query_job(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compute_dry_run())\n\u001b[1;32m 739\u001b[0m \u001b[38;5;66;03m# TODO(swast): pass max_columns and get the true column count back. Maybe\u001b[39;00m\n\u001b[1;32m 740\u001b[0m \u001b[38;5;66;03m# get 1 more column than we have requested so that pandas can add the\u001b[39;00m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;66;03m# ... 
for us?\u001b[39;00m\n\u001b[0;32m--> 742\u001b[0m pandas_df, row_count, query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_block\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mretrieve_repr_request_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 743\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\n\u001b[1;32m 744\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_internal_query_job(query_job)\n\u001b[1;32m 748\u001b[0m column_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(pandas_df\u001b[38;5;241m.\u001b[39mcolumns)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/blocks.py:1515\u001b[0m, in \u001b[0;36mBlock.retrieve_repr_request_results\u001b[0;34m(self, max_results)\u001b[0m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;66;03m# head caches full underlying expression, so row_count will be free after\u001b[39;00m\n\u001b[1;32m 1514\u001b[0m executor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\n\u001b[0;32m-> 1515\u001b[0m \u001b[43mexecutor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1516\u001b[0m \u001b[43m \u001b[49m\u001b[43marray_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1517\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexecutors\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCacheConfig\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptimize_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhead\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mif_cached\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreuse-strict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1518\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1519\u001b[0m head_result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\u001b[38;5;241m.\u001b[39mexecute(\n\u001b[1;32m 1520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39mslice(start\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, stop\u001b[38;5;241m=\u001b[39mmax_results, step\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 1521\u001b[0m )\n\u001b[1;32m 1522\u001b[0m row_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\u001b[38;5;241m.\u001b[39mexecute(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39mrow_count())\u001b[38;5;241m.\u001b[39mto_py_scalar()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:363\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor.cached\u001b[0;34m(self, array_value, config)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cache_with_session_awareness(array_value)\n\u001b[1;32m 362\u001b[0m 
\u001b[38;5;28;01melif\u001b[39;00m config\u001b[38;5;241m.\u001b[39moptimize_for \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhead\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cache_with_offsets\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 365\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config\u001b[38;5;241m.\u001b[39moptimize_for, executor\u001b[38;5;241m.\u001b[39mHierarchicalKey)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:475\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._cache_with_offsets\u001b[0;34m(self, array_value)\u001b[0m\n\u001b[1;32m 468\u001b[0m w_offsets, offset_column \u001b[38;5;241m=\u001b[39m array_value\u001b[38;5;241m.\u001b[39mpromote_offsets()\n\u001b[1;32m 469\u001b[0m compiled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcompile\u001b[39m\u001b[38;5;241m.\u001b[39mcompile_sql(\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28mcompile\u001b[39m\u001b[38;5;241m.\u001b[39mCompileRequest(\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogical_plan(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_substitute_large_local_sources(w_offsets\u001b[38;5;241m.\u001b[39mnode)),\n\u001b[1;32m 472\u001b[0m sort_rows\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 473\u001b[0m )\n\u001b[1;32m 474\u001b[0m )\n\u001b[0;32m--> 475\u001b[0m tmp_table_ref \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_as_cached_temp_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 477\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql_schema\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 478\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43moffset_column\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 480\u001b[0m tmp_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mget_table(tmp_table_ref)\n\u001b[1;32m 481\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled\u001b[38;5;241m.\u001b[39mrow_order \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:550\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._sql_as_cached_temp_table\u001b[0;34m(self, sql, schema, cluster_cols)\u001b[0m\n\u001b[1;32m 545\u001b[0m job_config \u001b[38;5;241m=\u001b[39m cast(\n\u001b[1;32m 546\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig,\n\u001b[1;32m 547\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig\u001b[38;5;241m.\u001b[39mfrom_api_repr({}),\n\u001b[1;32m 548\u001b[0m )\n\u001b[1;32m 549\u001b[0m job_config\u001b[38;5;241m.\u001b[39mdestination \u001b[38;5;241m=\u001b[39m temp_table\n\u001b[0;32m--> 550\u001b[0m _, query_job 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_execute_query\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 553\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m query_job \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 555\u001b[0m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:392\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._run_execute_query\u001b[0;34m(self, sql, job_config, query_with_job)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 390\u001b[0m \u001b[38;5;66;03m# Trick the type checker into thinking we got a literal.\u001b[39;00m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_with_job:\n\u001b[0;32m--> 392\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmetrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 399\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_with_job\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m bq_io\u001b[38;5;241m.\u001b[39mstart_query_with_client(\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 405\u001b[0m sql,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 411\u001b[0m query_with_job\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 412\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, timeout, metrics, 
query_with_job)\u001b[0m\n\u001b[1;32m 312\u001b[0m opts \u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m opts\u001b[38;5;241m.\u001b[39mprogress_bar \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mconfiguration\u001b[38;5;241m.\u001b[39mdry_run:\n\u001b[0;32m--> 314\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m \u001b[43mformatting_helpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_query_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 315\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_job\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 316\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mopts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 317\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 319\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149\u001b[0m, in \u001b[0;36mwait_for_query_job\u001b[0;34m(query_job, max_results, page_size, progress_bar)\u001b[0m\n\u001b[1;32m 147\u001b[0m loading_bar \u001b[38;5;241m=\u001b[39m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job))\n\u001b[1;32m 148\u001b[0m display\u001b[38;5;241m.\u001b[39mdisplay(loading_bar, display_id\u001b[38;5;241m=\u001b[39mdisplay_id)\n\u001b[0;32m--> 149\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[43mquery_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\n\u001b[1;32m 151\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m query_job\u001b[38;5;241m.\u001b[39mreload()\n\u001b[1;32m 153\u001b[0m display\u001b[38;5;241m.\u001b[39mupdate_display(\n\u001b[1;32m 154\u001b[0m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job)),\n\u001b[1;32m 155\u001b[0m display_id\u001b[38;5;241m=\u001b[39mdisplay_id,\n\u001b[1;32m 156\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1681\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1676\u001b[0m remaining_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1678\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining_timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1679\u001b[0m \u001b[38;5;66;03m# Since is_job_done() calls jobs.getQueryResults, which is a\u001b[39;00m\n\u001b[1;32m 1680\u001b[0m \u001b[38;5;66;03m# long-running API, don't delay the next request at all.\u001b[39;00m\n\u001b[0;32m-> 1681\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[43mis_job_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1682\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 1683\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Use a monotonic clock since we don't actually care about\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;66;03m# daylight savings or similar, just the elapsed time.\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1630\u001b[0m, in \u001b[0;36mQueryJob.result..is_job_done\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m job_failed_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1608\u001b[0m \u001b[38;5;66;03m# Only try to restart the query job if the job failed for\u001b[39;00m\n\u001b[1;32m 1609\u001b[0m \u001b[38;5;66;03m# a retriable reason. 
For example, don't restart the query\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1627\u001b[0m \u001b[38;5;66;03m# into an exception that can be processed by the\u001b[39;00m\n\u001b[1;32m 1628\u001b[0m \u001b[38;5;66;03m# `job_retry` predicate.\u001b[39;00m\n\u001b[1;32m 1629\u001b[0m restart_query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m-> 1630\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m job_failed_exception\n\u001b[1;32m 1631\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1632\u001b[0m \u001b[38;5;66;03m# Make sure that the _query_results are cached so we\u001b[39;00m\n\u001b[1;32m 1633\u001b[0m \u001b[38;5;66;03m# can return a complete RowIterator.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1639\u001b[0m \u001b[38;5;66;03m# making any extra API calls if the previous loop\u001b[39;00m\n\u001b[1;32m 1640\u001b[0m \u001b[38;5;66;03m# iteration fetched the finished job.\u001b[39;00m\n\u001b[1;32m 1641\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reload_query_results(\n\u001b[1;32m 1642\u001b[0m retry\u001b[38;5;241m=\u001b[39mretry, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mreload_query_results_kwargs\n\u001b[1;32m 1643\u001b[0m )\n", + "\u001b[0;31mNotFound\u001b[0m: 404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 310b7d0c-1369-4f83-b3ab-0627540e8c66\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0." + ] + }, + { + "data": { + "text/html": [ + "Query job 31d9c7a6-23a1-4a87-abf1-724f2f8995d5 is DONE. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "NotFound", + "evalue": "404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 31d9c7a6-23a1-4a87-abf1-724f2f8995d5\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. 
You are currently running BigFrames version 2.5.0.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotFound\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/core/formatters.py:406\u001b[0m, in \u001b[0;36mBaseFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 404\u001b[0m method \u001b[38;5;241m=\u001b[39m get_real_method(obj, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mprint_method)\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m method \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 406\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 408\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/dataframe.py:799\u001b[0m, in \u001b[0;36mDataFrame._repr_html_\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 794\u001b[0m df[col] \u001b[38;5;241m=\u001b[39m df[col]\u001b[38;5;241m.\u001b[39mblob\u001b[38;5;241m.\u001b[39m_get_runtime(mode\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mR\u001b[39m\u001b[38;5;124m\"\u001b[39m, with_metadata\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 796\u001b[0m \u001b[38;5;66;03m# TODO(swast): pass max_columns and get the true column count back. Maybe\u001b[39;00m\n\u001b[1;32m 797\u001b[0m \u001b[38;5;66;03m# get 1 more column than we have requested so that pandas can add the\u001b[39;00m\n\u001b[1;32m 798\u001b[0m \u001b[38;5;66;03m# ... 
for us?\u001b[39;00m\n\u001b[0;32m--> 799\u001b[0m pandas_df, row_count, query_job \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_block\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mretrieve_repr_request_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 800\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\n\u001b[1;32m 801\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 803\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_internal_query_job(query_job)\n\u001b[1;32m 805\u001b[0m column_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(pandas_df\u001b[38;5;241m.\u001b[39mcolumns)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/blocks.py:1515\u001b[0m, in \u001b[0;36mBlock.retrieve_repr_request_results\u001b[0;34m(self, max_results)\u001b[0m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;66;03m# head caches full underlying expression, so row_count will be free after\u001b[39;00m\n\u001b[1;32m 1514\u001b[0m executor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\n\u001b[0;32m-> 1515\u001b[0m \u001b[43mexecutor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1516\u001b[0m \u001b[43m \u001b[49m\u001b[43marray_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1517\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexecutors\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCacheConfig\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptimize_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhead\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mif_cached\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreuse-strict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1518\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1519\u001b[0m head_result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\u001b[38;5;241m.\u001b[39mexecute(\n\u001b[1;32m 1520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39mslice(start\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, stop\u001b[38;5;241m=\u001b[39mmax_results, step\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 1521\u001b[0m )\n\u001b[1;32m 1522\u001b[0m row_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\u001b[38;5;241m.\u001b[39mexecute(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39mrow_count())\u001b[38;5;241m.\u001b[39mto_py_scalar()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:363\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor.cached\u001b[0;34m(self, array_value, config)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cache_with_session_awareness(array_value)\n\u001b[1;32m 362\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m 
config\u001b[38;5;241m.\u001b[39moptimize_for \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhead\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cache_with_offsets\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 365\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config\u001b[38;5;241m.\u001b[39moptimize_for, executor\u001b[38;5;241m.\u001b[39mHierarchicalKey)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:475\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._cache_with_offsets\u001b[0;34m(self, array_value)\u001b[0m\n\u001b[1;32m 468\u001b[0m w_offsets, offset_column \u001b[38;5;241m=\u001b[39m array_value\u001b[38;5;241m.\u001b[39mpromote_offsets()\n\u001b[1;32m 469\u001b[0m compiled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcompile\u001b[39m\u001b[38;5;241m.\u001b[39mcompile_sql(\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28mcompile\u001b[39m\u001b[38;5;241m.\u001b[39mCompileRequest(\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogical_plan(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_substitute_large_local_sources(w_offsets\u001b[38;5;241m.\u001b[39mnode)),\n\u001b[1;32m 472\u001b[0m sort_rows\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 473\u001b[0m )\n\u001b[1;32m 474\u001b[0m )\n\u001b[0;32m--> 475\u001b[0m tmp_table_ref \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_as_cached_temp_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 477\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql_schema\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 478\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43moffset_column\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 480\u001b[0m tmp_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mget_table(tmp_table_ref)\n\u001b[1;32m 481\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled\u001b[38;5;241m.\u001b[39mrow_order \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:550\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._sql_as_cached_temp_table\u001b[0;34m(self, sql, schema, cluster_cols)\u001b[0m\n\u001b[1;32m 545\u001b[0m job_config \u001b[38;5;241m=\u001b[39m cast(\n\u001b[1;32m 546\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig,\n\u001b[1;32m 547\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig\u001b[38;5;241m.\u001b[39mfrom_api_repr({}),\n\u001b[1;32m 548\u001b[0m )\n\u001b[1;32m 549\u001b[0m job_config\u001b[38;5;241m.\u001b[39mdestination \u001b[38;5;241m=\u001b[39m temp_table\n\u001b[0;32m--> 550\u001b[0m _, query_job \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_execute_query\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 553\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m query_job \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 555\u001b[0m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:392\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._run_execute_query\u001b[0;34m(self, sql, job_config, query_with_job)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 390\u001b[0m \u001b[38;5;66;03m# Trick the type checker into thinking we got a literal.\u001b[39;00m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_with_job:\n\u001b[0;32m--> 392\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmetrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 399\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_with_job\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m bq_io\u001b[38;5;241m.\u001b[39mstart_query_with_client(\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 405\u001b[0m sql,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 411\u001b[0m query_with_job\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 412\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, timeout, metrics, query_with_job)\u001b[0m\n\u001b[1;32m 312\u001b[0m opts 
\u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m opts\u001b[38;5;241m.\u001b[39mprogress_bar \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mconfiguration\u001b[38;5;241m.\u001b[39mdry_run:\n\u001b[0;32m--> 314\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m \u001b[43mformatting_helpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_query_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 315\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_job\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 316\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mopts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 317\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 319\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m query_job\u001b[38;5;241m.\u001b[39mresult()\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149\u001b[0m, in \u001b[0;36mwait_for_query_job\u001b[0;34m(query_job, max_results, page_size, progress_bar)\u001b[0m\n\u001b[1;32m 147\u001b[0m loading_bar \u001b[38;5;241m=\u001b[39m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job))\n\u001b[1;32m 148\u001b[0m display\u001b[38;5;241m.\u001b[39mdisplay(loading_bar, display_id\u001b[38;5;241m=\u001b[39mdisplay_id)\n\u001b[0;32m--> 149\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[43mquery_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\n\u001b[1;32m 151\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m query_job\u001b[38;5;241m.\u001b[39mreload()\n\u001b[1;32m 153\u001b[0m display\u001b[38;5;241m.\u001b[39mupdate_display(\n\u001b[1;32m 154\u001b[0m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job)),\n\u001b[1;32m 155\u001b[0m display_id\u001b[38;5;241m=\u001b[39mdisplay_id,\n\u001b[1;32m 156\u001b[0m )\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1681\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1676\u001b[0m remaining_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1678\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining_timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1679\u001b[0m \u001b[38;5;66;03m# Since is_job_done() calls jobs.getQueryResults, which is a\u001b[39;00m\n\u001b[1;32m 1680\u001b[0m \u001b[38;5;66;03m# long-running API, don't delay the next request at all.\u001b[39;00m\n\u001b[0;32m-> 1681\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[43mis_job_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1682\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 1683\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Use a monotonic clock since we don't actually care about\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;66;03m# daylight savings or similar, just the elapsed time.\u001b[39;00m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", + "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1630\u001b[0m, in \u001b[0;36mQueryJob.result..is_job_done\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m job_failed_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1608\u001b[0m \u001b[38;5;66;03m# Only try to restart the query job if the job failed for\u001b[39;00m\n\u001b[1;32m 1609\u001b[0m \u001b[38;5;66;03m# a retriable reason. 
For example, don't restart the query\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1627\u001b[0m \u001b[38;5;66;03m# into an exception that can be processed by the\u001b[39;00m\n\u001b[1;32m 1628\u001b[0m \u001b[38;5;66;03m# `job_retry` predicate.\u001b[39;00m\n\u001b[1;32m 1629\u001b[0m restart_query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m-> 1630\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m job_failed_exception\n\u001b[1;32m 1631\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1632\u001b[0m \u001b[38;5;66;03m# Make sure that the _query_results are cached so we\u001b[39;00m\n\u001b[1;32m 1633\u001b[0m \u001b[38;5;66;03m# can return a complete RowIterator.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1639\u001b[0m \u001b[38;5;66;03m# making any extra API calls if the previous loop\u001b[39;00m\n\u001b[1;32m 1640\u001b[0m \u001b[38;5;66;03m# iteration fetched the finished job.\u001b[39;00m\n\u001b[1;32m 1641\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reload_query_results(\n\u001b[1;32m 1642\u001b[0m retry\u001b[38;5;241m=\u001b[39mretry, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mreload_query_results_kwargs\n\u001b[1;32m 1643\u001b[0m )\n", + "\u001b[0;31mNotFound\u001b[0m: 404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 31d9c7a6-23a1-4a87-abf1-724f2f8995d5\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0." + ] + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_blob_transcribe_long_audio_2p5.ipynb b/notebooks/test_blob_transcribe_long_audio_2p5.ipynb new file mode 100644 index 0000000000..e156232b51 --- /dev/null +++ b/notebooks/test_blob_transcribe_long_audio_2p5.ipynb @@ -0,0 +1,271 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. 
This flag is no longer needed.\n",
+      "  warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "import bigframes\n",
+    "bigframes.options.experiments.blob = True\n",
+    "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n",
+    "#                                  \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n",
+    "#                                  \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n",
+    "import bigframes.pandas as bpd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
+      "  _global_session = bigframes.session.connect(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 9553647a-5a71-43af-902d-043df6a62387 is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 7b3ca9c5-5664-4973-b062-3bb304730dd6 is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df = bpd.from_glob_path(\"gs://shuowei_bucket/long_audio/*\", name=\"audio\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# copy files again, now we have 1000 audio files\n",
+    "#copies = [df] * 2 * 100\n",
+    "#df = bpd.concat(copies, ignore_index=True)\n",
+    "#df = df.cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# copy files again, now we have 1,000,000 audio files\n",
+    "#copies = [df] * 2 * 100\n",
+    "#df = bpd.concat(copies, ignore_index=True)\n",
+    "#df = df.cache()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job 848201eb-0342-40a4-8de8-621ad180184a is DONE. 0 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "Query job 094c794c-9283-4a12-9c6a-c7aaa559e791 is DONE. 480 Bytes processed. Open Job"
+      ],
+      "text/plain": [
+       ""
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n",
+      "`db_dtypes` is a preview feature and subject to change.\n",
+      "  warnings.warn(msg, bfe.PreviewWarning)\n"
+     ]
+    }
+   ],
+   "source": [
+    "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
+    "df[\"text\"] = df[\"audio\"].blob.transcribe(df=df, audio_column=\"audio\", model_name=\"gemini-2.5-pro-preview-05-06\", verbose=True)\n",
+    "# gemini-2.0-flash-001"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "Query job ed3d84ac-2e1b-42ab-b867-58560e1a3167 is DONE. 8.8 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job a65b6550-f250-4052-92e2-492ab58da692 is DONE. 8.8 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
audiotext
0uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"I'll tell you something, when the first revelation Joseph Smith received about marriage of any kind was that you couldn't get anywhere, you into the celestial kingdom. No man could get into the celestial kingdom without a wife, because he's only half there if he hasn't got a wife. And no woman, of course, could get there without a husband either, because she's only half there. Well, now at that time, the Mormons were very strict, uh, uh, religious people, you see. Sure. Sure. And, uh, thought that a man had to live clean all his life as well as a woman did in order to be worthy to be a man. Well, this is still true. Well, it's still true certainly. But I said, in that those circumstances, at that time, there were, uh, about three times as many women that joined the church and were faithful Latter-day Saints than there were men. When Joseph Smith comes out with the revelation that, uh, marriage was for eternity. And if if you're going to be separated from death do you part, then you can't get into the celestial kingdom at all to become as God is. Now, they got that doctrine. And that was, the prophet said that, they believed that. What about these here two-thirds of the women? Yeah, that's a good question. What is going to become of us then? It won't do us any good to go out here and marry one of these scallywags that spends his time drinking and boozing around and mining and maybe works in the mine a while and does all kind of skullduggery. That won't do us any good to marry that kind of a man. Besides, we don't have a chance to raise a respectable family with that kind of a man. So, what are we going to do about it now? What's going to become of us? And Joseph Smith always prayed about every problem that came to him. And there's where he got the revelation that if they live thoroughly clean and faithful, if they can understand the purity and sacredness of sex, so they don't desecrate it, then one man can have a number of wives and and permit them to raise a respectable, honorable family as well as have those wives throughout eternity, have their husband to take them into the celestial kingdom. Now, there was the origin of polygamy.\"}
1uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Uh, I hear that there's a Hawaiian custom of something called a hukilau. Hukilau. Hukilau is they have a rope and on the rope they have a net there. Mhm. They surround it a certain place and they drag the net. They surround it, you mean they carry the net out? Carry the net, uh-huh. Uh, how do they, is there something to keep the net floating? Yeah, some they have those floaters uh floating. Mhm. Some they have a lead down on the bottom, then they drag that uh net in. What do the floats, what are the floats made of? The float made of those uh some coconut uh dry coconut, I guess. Oh, dry coconut. Yeah. That's a good use for coconut. Uh, and that floats the top of the net. Top of the net. Now, the word lau means leaf, doesn't it? Lau, lau is a yeah, yeah, lau is a leaf. All right. Well now, the the where does the leaf come in in this hukilau? Hukilau, the leaves, you know, they stay on the top, top of the rope. So that the when they pull, the fish going to not to jump over the rope. The ropes, the the uh the leaves keep the fish from jumping over to the scared, they scared them away. They don't want to go underneath the water, see. When some moment she go underneath the water there, they will go to the net. They have a pocket on the net. They have a pocket on the net. Now the bottom of the net then. Bottom of the net they have a pocket in. Mhm. The water must have some kind of things to make it sink. Sink is a lead, they have a lead. Oh, they have lead on it. I see. Then uh uh somebody goes out and puts the net out around. Yeah, they tow it with a boat. With a boat? Two, two boats they goes out. Tow it all around. And then uh who pulls it in? Well, they have a lots of uh from the shore about 40, 50 guys they pull that rope up. Ah, I see. Yeah. And uh they must have a pretty big school of fish there. Yeah, once in a while they have a nice school, right? And then they pull them all in and the fish don't are scared to jump out so they get caught. No, they all in up on the net already. Mhm. They get caught in the net. I suppose the top of the net comes in sooner than the bottom so they if they do try to go out there, that's pocket. Yeah, they have to take on uh, you know, about six or seven guys is swimming on it around the net to they're watching how the fish going. Oh, I see. They they're chasing at the hole. Uh-huh. Well, about how much fish do they get out of a good haul of those? Once I saw up Waimea, they caught about about a ton, I guess, you know, those akule. Mhm. Oh, they have a nice crew up there. What's akule? Mm, the akule some kind of, you know, that's Hawaiian used to call akule, akule. They have been calling it Aji in Japan.\"}
2uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"the soapstone carvings are those done, have you done any of those? Yes. Yeah. It's pretty much like ivory carving. Only it takes more care. What? To work on soapstone. Why? Why is it? Because it is brittle. Oh, it is. And very soft. Mhm. Uh, you you can hack on ivory, but you can't do that on soapstone. Chip too easily. Uh-huh. But then you have to use your files on those. Yes. And then once you've done the filing and so on, how do you smooth it down? Uh, we I use uh fine file. Mhm. To finish it up and then use uh sandpaper or emery cloth. Mhm. I think you said that um quite often the thing that is carved is determined by the shape of the piece of stone that one starts carving. Yes, yeah. Sometimes uh uh an ivory carver or a soapstone carver will take advantage of the shape of the stone. Mhm. And uh try to visualize what it'd look like. Uh, maybe a polar bear or or Mhm. He makes it just the way it is.\"}
3uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"was rather interesting just to watch them gathering their materials and bouncing around. Yeah, they they are what they call it kangaroo walk or or something. Really? Uh, something like that. They named it that. I don't know. I bet those men are going to get quite a reception when they get back to earth. Oh, yes. I'll be so glad when they land back now, but I think that's uh pretty well uh fact because they've landed so many safely now that I I feel really relieved. Just getting off of the moon was the thing that was. Have they met with the um one that was circling? Yes, they've rendezvoused. So I understand. I that wasn't shown either, so I but uh they say they have rendezvoused, so that's a matter of making the circles and then coming down. What do you sort of imagine for the future? Do you imagine them sending a I think they will. I think they will will do some more exploring up there. Right. Positive, you know, to continue with this. Uh-huh. Because that was such a very small area when you think of it that they just gathered uh rocks and and uh oh samples of soil and all and they did uh probe for some samples. And just what is going to come of that, I don't know. I'd be glad to read it.\"}
4uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Say this is entered customs. And uh the next step is to prepare a Mexican customs entry. Mexico call them pedimentos. It's really an appraisal sheet. It's it's a breakdown of the sheet. You use the same information. That's right. But here you give them all the information, for example, let me see, I found one right here. You give them all the information on on what the invoice contains. Here's one. On what the invoice contains, what uh what what the material is itself. If you hear, for example, this is some bottling equipment going over there and on the appraisal sheet, so you tell them who it's for, who made the invoice out of the states. This came from St. Louis, Missouri. Yeah. How much the freight was. And on the reverse of this of this thing, but the everything has to be marked. So complete identification. This identification number is that traffic number I spoke to you a little while ago. Number of boxes, what they contain, the tariff duty, the tariff number, the ad valorem rate, and what the actual duties would be paid. For example, here would be 996 pesos and 73 cents. And as you can see, this is a form that has already been approved by the appraiser. These appraisers are called Vistas. It's very different from the Vistas we have nowadays here.\"}
\n", + "

5 rows × 2 columns

\n", + "
[5 rows x 2 columns in total]" + ], + "text/plain": [ + " audio \\\n", + "0 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", + "1 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", + "2 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", + "3 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", + "4 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", + "\n", + " text \n", + "0 {'status': '', 'content': \"I'll tell you somet... \n", + "1 {'status': '', 'content': \"Uh, I hear that the... \n", + "2 {'status': '', 'content': \"the soapstone carvi... \n", + "3 {'status': '', 'content': \"was rather interest... \n", + "4 {'status': '', 'content': \"Say this is entered... \n", + "\n", + "[5 rows x 2 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/test_notebook.ipynb b/notebooks/test_notebook.ipynb new file mode 100644 index 0000000000..ce85a846de --- /dev/null +++ b/notebooks/test_notebook.ipynb @@ -0,0 +1,58 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "bpd.options.bigquery.project = 'bigquery-public-data'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gender_filter = 'M'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "sql" + ] + }, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE gender = '{gender_filter}'" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/tests/system/small/test_loc.py b/tests/system/small/test_loc.py new file mode 100644 index 0000000000..2f0b9df31b --- /dev/null +++ b/tests/system/small/test_loc.py @@ -0,0 +1,222 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
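+# Each test in this module exercises the same .loc operation twice: once
+# against a BigFrames DataFrame and once against its pandas counterpart.
+# The BigFrames result is materialized with .to_pandas() and compared to
+# the pandas result via assert_pandas_df_equal.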
+ +import pandas as pd +import pytest + +from bigframes.testing.utils import assert_pandas_df_equal + + +def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, ["string_col", "string_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[:, ["string_col", "string_col"]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_select_rows_and_columns_w_repeats( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.loc[ + [2, 3, 2], ["string_col", "string_col"] + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[[2, 3, 2], ["string_col", "string_col"]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_slice_rows_and_select_columns_w_repeats( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.loc[2:5, ["string_col", "string_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[2:5, ["string_col", "string_col"]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[scalars_df_index["bool_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index["bool_col"]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[[2, 3], ["string_col", "int64_col"]].to_pandas() + pd_result = scalars_pandas_df_index.loc[[2, 3], ["string_col", "int64_col"]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[:, "string_col"].to_pandas() + pd_result = scalars_pandas_df_index.loc[:, "string_col"] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[ + scalars_df_index["bool_col"], "string_col" + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + scalars_pandas_df_index["bool_col"], "string_col" + ] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_select_with_column_condition_bf_series( + scalars_df_index, scalars_pandas_df_index +): + bf_result = scalars_df_index.loc[ + scalars_df_index["bool_col"], scalars_df_index.columns.to_series() + ].to_pandas() + pd_result = scalars_pandas_df_index.loc[ + scalars_pandas_df_index["bool_col"], + scalars_pandas_df_index.columns.to_series(), + ] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").loc[2].to_pandas() + pd_result = scalars_pandas_df_index.set_index("int64_col").loc[2] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("int64_col").loc[6].to_pandas() + pd_result = scalars_pandas_df_index.set_index("int64_col").loc[6] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_setitem_slice_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[2:5, "int64_col"] = 99 + pd_df.loc[2:5, "int64_col"] = 99 + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_loc_setitem_slice_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = 
scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_series = bf_df["int64_col"] * 2 + pd_series = pd_df["int64_col"] * 2 + bf_df.loc[2:5, "int64_col"] = bf_series + pd_df.loc[2:5, "int64_col"] = pd_series + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_loc_setitem_list_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[[2, 5], "int64_col"] = 99 + pd_df.loc[[2, 5], "int64_col"] = 99 + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_loc_setitem_list_series(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_series = bf_df["int64_col"] * 2 + pd_series = pd_df["int64_col"] * 2 + bf_df.loc[[2, 5], "int64_col"] = bf_series + pd_df.loc[[2, 5], "int64_col"] = pd_series + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +@pytest.mark.parametrize( + ("col", "value"), + [ + ("new_col", 99), + ("int64_col", -1), + ("string_col", "new_string"), + ("date_col", pd.Timestamp("2024-01-01")), + ], +) +def test_loc_setitem_bool_series_scalar(scalars_dfs, col, value): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df.loc[bf_df["bool_col"], col] = value + pd_df.loc[pd_df["bool_col"], col] = value + assert_pandas_df_equal(bf_df.to_pandas(), pd_df) + + +def test_loc_setitem_bool_series_scalar_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + with pytest.raises(TypeError): + bf_df.loc[bf_df["bool_col"], "int64_col"] = "incompatible_string" + with pytest.raises(TypeError): + pd_df.loc[pd_df["bool_col"], "int64_col"] = "incompatible_string" + + +def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.set_index("string_col").loc[["cat", "dog"]].to_pandas() + pd_result = scalars_pandas_df_index.set_index("string_col").loc[["cat", "dog"]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[[2, 3]].to_pandas() + pd_result = scalars_pandas_df_index.loc[[2, 3]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_list_multiindex(scalars_dfs): + # TODO: supply a reason why this isn't compatible with pandas 1.x + pytest.importorskip("pandas", minversion="2.0.0") + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.set_index(["string_col", "int64_col"]) + pd_df = scalars_pandas_df.set_index(["string_col", "int64_col"]) + bf_result = bf_df.loc[[("cat", 2), ("dog", 2)]].to_pandas() + pd_result = pd_df.loc[[("cat", 2), ("dog", 2)]] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): + bf_result = ( + scalars_df_index.set_index("string_col") + .loc[scalars_df_index["string_col"]] + .to_pandas() + ) + pd_result = scalars_pandas_df_index.set_index("string_col").loc[ + scalars_pandas_df_index["string_col"] + ] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): + bf_df = scalars_df_index.set_index(["string_col", "int64_col"]) + pd_df = scalars_pandas_df_index.set_index(["string_col", "int64_col"]) + bf_result = bf_df.loc[bf_df.index.to_series()].to_pandas() + pd_result = pd_df.loc[pd_df.index.to_series()] + assert_pandas_df_equal(bf_result, 
pd_result) + + +def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): + bf_result = scalars_df_index.loc[scalars_df_index.index].to_pandas() + pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.index] + assert_pandas_df_equal(bf_result, pd_result) + + +def test_loc_bf_index_integer_index_renamed_col( + scalars_df_index, scalars_pandas_df_index +): + bf_df = scalars_df_index.rename(columns={"int64_col": "new_name"}) + pd_df = scalars_pandas_df_index.rename(columns={"int64_col": "new_name"}) + bf_result = bf_df.loc[bf_df.index].to_pandas() + pd_result = pd_df.loc[pd_df.index] + assert_pandas_df_equal(bf_result, pd_result) diff --git a/tests/unit/ml/test_utils.py b/tests/unit/ml/test_utils.py new file mode 100644 index 0000000000..9b273cb716 --- /dev/null +++ b/tests/unit/ml/test_utils.py @@ -0,0 +1,34 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from bigframes.ml import utils + + +@pytest.mark.parametrize( + ("input", "expected"), + [ + ("STRING", "string"), + ("str", "string"), + ("Integer", "int64"), + ("int64", "int64"), + ("boolean", "bool"), + ("bool", "bool"), + ("float", "float64"), + ("float64", "float64"), + ], +) +def test_standardize_type(input, expected): + assert utils.standardize_type(input) == expected From 3cc643d9cf87468a07d1e381dc9f931faa4f5d0b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 20:20:36 +0000 Subject: [PATCH 44/53] Revert "Revert: Unwanted code changes" This reverts commit db5d8ea04ee3e8a6382ac546764aff0f6880f66b. 
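Reviewer note on the `test_standardize_type` cases added above: they pin down a case-insensitive alias table from user-facing type names ("STRING", "Integer", "boolean", ...) to canonical BigQuery DataFrames dtype names. As a reading aid, here is a minimal sketch of an implementation consistent with those assertions; it is a hypothetical illustration only, not the shipped `bigframes/ml/utils.py` code (which this series does not show):

    # Hypothetical sketch: mirrors the alias table asserted by
    # tests/unit/ml/test_utils.py; the shipped implementation may differ.
    _TYPE_ALIASES = {
        "string": "string",
        "str": "string",
        "integer": "int64",
        "int64": "int64",
        "boolean": "bool",
        "bool": "bool",
        "float": "float64",
        "float64": "float64",
    }

    def standardize_type(type_name: str) -> str:
        """Normalize a user-supplied type alias to a canonical dtype name."""
        # Lowercasing first makes the lookup case-insensitive, so "STRING"
        # and "Integer" resolve the same way as "string" and "integer".
        return _TYPE_ALIASES[type_name.lower()]

Each parametrized case in the test then reduces to `standardize_type(input) == expected`.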
--- =3.4.0 | 13 - =3.4.0, | 0 emp | 0 .../dataframes/anywidget_mode.nbconvert.ipynb | 622 --------------- notebooks/e2e_RAG_bk.ipynb | 624 --------------- notebooks/e2e_RAG_debug.ipynb | 305 -------- notebooks/e2e_RAG_prod.ipynb | 571 -------------- notebooks/e2e_RAG_prod_1M.ipynb | 661 ---------------- notebooks/e2e_RAG_test.ipynb | 712 ------------------ notebooks/google_sql_notebook.ipynb | 54 -- .../multimodal/transcribe_partial_mode.ipynb | 153 ---- notebooks/test.ipynb | 225 ------ notebooks/test_blob_trancription.ipynb | 310 -------- notebooks/test_blob_trans_blur_image.ipynb | 200 ----- notebooks/test_blob_trans_pdf_extract.ipynb | 445 ----------- notebooks/test_blob_transcribe.ipynb | 157 ---- .../test_blob_transcribe_1M_short_audio.ipynb | 450 ----------- ...st_blob_transcribe_1M_short_audio_v1.ipynb | 345 --------- .../test_blob_transcribe_long_audio.ipynb | 315 -------- .../test_blob_transcribe_long_audio_2p5.ipynb | 271 ------- notebooks/test_notebook.ipynb | 58 -- tests/system/small/test_loc.py | 222 ------ tests/unit/ml/test_utils.py | 34 - 23 files changed, 6747 deletions(-) delete mode 100644 =3.4.0 delete mode 100644 =3.4.0, delete mode 100644 emp delete mode 100644 notebooks/dataframes/anywidget_mode.nbconvert.ipynb delete mode 100644 notebooks/e2e_RAG_bk.ipynb delete mode 100644 notebooks/e2e_RAG_debug.ipynb delete mode 100644 notebooks/e2e_RAG_prod.ipynb delete mode 100644 notebooks/e2e_RAG_prod_1M.ipynb delete mode 100644 notebooks/e2e_RAG_test.ipynb delete mode 100644 notebooks/google_sql_notebook.ipynb delete mode 100644 notebooks/multimodal/transcribe_partial_mode.ipynb delete mode 100644 notebooks/test.ipynb delete mode 100644 notebooks/test_blob_trancription.ipynb delete mode 100644 notebooks/test_blob_trans_blur_image.ipynb delete mode 100644 notebooks/test_blob_trans_pdf_extract.ipynb delete mode 100644 notebooks/test_blob_transcribe.ipynb delete mode 100644 notebooks/test_blob_transcribe_1M_short_audio.ipynb delete mode 100644 notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb delete mode 100644 notebooks/test_blob_transcribe_long_audio.ipynb delete mode 100644 notebooks/test_blob_transcribe_long_audio_2p5.ipynb delete mode 100644 notebooks/test_notebook.ipynb delete mode 100644 tests/system/small/test_loc.py delete mode 100644 tests/unit/ml/test_utils.py diff --git a/=3.4.0 b/=3.4.0 deleted file mode 100644 index 51e648aef4..0000000000 --- a/=3.4.0 +++ /dev/null @@ -1,13 +0,0 @@ -Collecting pypdf[crypto] - Downloading pypdf-6.0.0-py3-none-any.whl (310 kB) - ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.5/310.5 kB 11.5 MB/s eta 0:00:00 -Requirement already satisfied: typing_extensions>=4.0 in ./venv/lib/python3.10/site-packages (from pypdf[crypto]) (4.14.1) -Collecting cryptography - Downloading cryptography-45.0.6-cp37-abi3-manylinux_2_34_x86_64.whl (4.4 MB) - ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.4/4.4 MB 78.9 MB/s eta 0:00:00 -Collecting cffi>=1.14 - Using cached cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (446 kB) -Collecting pycparser - Using cached pycparser-2.22-py3-none-any.whl (117 kB) -Installing collected packages: pypdf, pycparser, cffi, cryptography -Successfully installed cffi-1.17.1 cryptography-45.0.6 pycparser-2.22 pypdf-6.0.0 diff --git a/=3.4.0, b/=3.4.0, deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/emp b/emp deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/notebooks/dataframes/anywidget_mode.nbconvert.ipynb b/notebooks/dataframes/anywidget_mode.nbconvert.ipynb 
deleted file mode 100644 index 32a4b432a2..0000000000 --- a/notebooks/dataframes/anywidget_mode.nbconvert.ipynb +++ /dev/null @@ -1,622 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "d10bfca4", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:47.689257Z", - "iopub.status.busy": "2025-08-19T18:07:47.688863Z", - "iopub.status.idle": "2025-08-19T18:07:47.694257Z", - "shell.execute_reply": "2025-08-19T18:07:47.693398Z" - } - }, - "outputs": [], - "source": [ - "# Copyright 2025 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "id": "acca43ae", - "metadata": {}, - "source": [ - "# Demo to Show Anywidget mode" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "ca22f059", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:47.697344Z", - "iopub.status.busy": "2025-08-19T18:07:47.697049Z", - "iopub.status.idle": "2025-08-19T18:07:49.528371Z", - "shell.execute_reply": "2025-08-19T18:07:49.527605Z" - } - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "id": "04406a4d", - "metadata": {}, - "source": [ - "Set the display option to use anywidget" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "1bc5aaf3", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:49.531182Z", - "iopub.status.busy": "2025-08-19T18:07:49.530928Z", - "iopub.status.idle": "2025-08-19T18:07:49.535337Z", - "shell.execute_reply": "2025-08-19T18:07:49.534613Z" - } - }, - "outputs": [], - "source": [ - "bpd.options.bigquery.ordering_mode = \"partial\"\n", - "bpd.options.display.repr_mode = \"anywidget\"" - ] - }, - { - "cell_type": "markdown", - "id": "0a354c69", - "metadata": {}, - "source": [ - "Load Sample Data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "f289d250", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:49.538687Z", - "iopub.status.busy": "2025-08-19T18:07:49.538398Z", - "iopub.status.idle": "2025-08-19T18:07:53.574536Z", - "shell.execute_reply": "2025-08-19T18:07:53.573718Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computation deferred. 
Computation will process 171.4 MB\n" - ] - } - ], - "source": [ - "df = bpd.read_gbq(\"bigquery-public-data.usa_names.usa_1910_2013\")\n", - "print(df)" - ] - }, - { - "cell_type": "markdown", - "id": "3a73e472", - "metadata": {}, - "source": [ - "Display Series in anywidget mode" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "42bb02ab", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:53.577575Z", - "iopub.status.busy": "2025-08-19T18:07:53.577219Z", - "iopub.status.idle": "2025-08-19T18:07:53.997894Z", - "shell.execute_reply": "2025-08-19T18:07:53.996854Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Computation deferred. Computation will process 44.4 MB\n" - ] - } - ], - "source": [ - "test_series = df[\"year\"]\n", - "print(test_series)" - ] - }, - { - "cell_type": "markdown", - "id": "7bcf1bb7", - "metadata": {}, - "source": [ - "Display with Pagination" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "ce250157", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:54.001504Z", - "iopub.status.busy": "2025-08-19T18:07:54.000991Z", - "iopub.status.idle": "2025-08-19T18:07:56.279608Z", - "shell.execute_reply": "2025-08-19T18:07:56.278922Z" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d2ab83b1e9f24674a73a12094be1e831", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [], - "text/plain": [ - "Computation deferred. Computation will process 171.4 MB" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "markdown", - "id": "bb15bab6", - "metadata": {}, - "source": [ - "Programmatic Navigation Demo" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "6920d49b", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:56.282008Z", - "iopub.status.busy": "2025-08-19T18:07:56.281778Z", - "iopub.status.idle": "2025-08-19T18:07:56.959938Z", - "shell.execute_reply": "2025-08-19T18:07:56.959205Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Total pages: 555246\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e7c10bb6833b4f649a26d5f33b00897b", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget()" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.display.anywidget import TableWidget\n", - "import math\n", - " \n", - "# Create widget programmatically \n", - "widget = TableWidget(df)\n", - "print(f\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\")\n", - " \n", - "# Display the widget\n", - "widget" - ] - }, - { - "cell_type": "markdown", - "id": "02cbd1be", - "metadata": {}, - "source": [ - "Test Navigation Programmatically" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "12b68f15", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:56.962194Z", - "iopub.status.busy": "2025-08-19T18:07:56.961974Z", - "iopub.status.idle": "2025-08-19T18:07:56.965782Z", - "shell.execute_reply": "2025-08-19T18:07:56.965121Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Current page: 
0\n", - "After next: 1\n", - "After prev: 0\n" - ] - } - ], - "source": [ - "# Simulate button clicks programmatically\n", - "print(\"Current page:\", widget.page)\n", - "\n", - "# Go to next page\n", - "widget.page = 1\n", - "print(\"After next:\", widget.page)\n", - "\n", - "# Go to previous page\n", - "widget.page = 0\n", - "print(\"After prev:\", widget.page)" - ] - }, - { - "cell_type": "markdown", - "id": "9d310138", - "metadata": {}, - "source": [ - "Edge Case Demonstration" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "a9d5d13a", - "metadata": { - "execution": { - "iopub.execute_input": "2025-08-19T18:07:56.968276Z", - "iopub.status.busy": "2025-08-19T18:07:56.968023Z", - "iopub.status.idle": "2025-08-19T18:08:12.463471Z", - "shell.execute_reply": "2025-08-19T18:08:12.462652Z" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", - " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Small dataset pages: 1\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a670ee71d58c47babab171f4f229db62", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget()" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Test with very small dataset\n", - "small_df = df.sort_values([\"name\", \"year\", \"state\"]).head(5)\n", - "small_widget = TableWidget(small_df)\n", - "print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n", - "small_widget" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4e5836b-c872-4a9c-b9ec-14f6f338176d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": { - "2ac7d45b9bce40f196823982403f3bf3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "45d720d8fd954a529cc657457f681ee1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "548b9bae022d4dc5a38a6a8740276387": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "2.0.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "2.0.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "2.0.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border_bottom": null, - "border_left": null, - "border_right": null, - "border_top": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a670ee71d58c47babab171f4f229db62": { - "model_module": "anywidget", - "model_module_version": "~0.9.*", - "model_name": "AnyModel", - "state": { - "_anywidget_id": "bigframes.display.anywidget.TableWidget", - "_css": "/**\n * Copyright 2025 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * 
limitations under the License.\n */\n\n.bigframes-widget {\n\tdisplay: inline-block;\n}\n\n.bigframes-widget .table-container {\n\tmax-height: 620px;\n\toverflow: auto;\n}\n\n.bigframes-widget .footer {\n\talign-items: center;\n\tdisplay: flex;\n\tfont-size: 0.8rem;\n\tpadding-top: 8px;\n}\n\n.bigframes-widget .footer > * {\n\tflex: 1;\n}\n\n.bigframes-widget .pagination {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: center;\n\tpadding: 4px;\n}\n\n.bigframes-widget .page-size {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: end;\n}\n\n.bigframes-widget table {\n\tborder-collapse: collapse;\n\ttext-align: left;\n}\n\n.bigframes-widget th {\n\tbackground-color: var(--colab-primary-surface-color, var(--jp-layout-color0));\n\t/* Uncomment once we support sorting: cursor: pointer; */\n\tposition: sticky;\n\ttop: 0;\n\tz-index: 1;\n}\n\n.bigframes-widget button {\n\tcursor: pointer;\n\tdisplay: inline-block;\n\ttext-align: center;\n\ttext-decoration: none;\n\tuser-select: none;\n\tvertical-align: middle;\n}\n\n.bigframes-widget button:disabled {\n\topacity: 0.65;\n\tpointer-events: none;\n}\n", - "_dom_classes": [], - "_esm": "\nfunction render({ model, el }) {\n\tconsole.log(\"render called\");\n\t// Main container with a unique class for CSS scoping\n\tel.classList.add(\"bigframes-widget\");\n\n\t// State\n\tlet page = 0;\n\tlet pageSize = 10;\n\tlet rowCount = 0;\n\tlet tableHtml = \"\";\n\n\t// Structure\n\tconst tableContainer = document.createElement(\"div\");\n\tconst footer = document.createElement(\"div\");\n\n\t// Footer: Total rows label\n\tconst rowCountLabel = document.createElement(\"div\");\n\n\t// Footer: Pagination controls\n\tconst paginationContainer = document.createElement(\"div\");\n\tconst prevPage = document.createElement(\"button\");\n\tconst paginationLabel = document.createElement(\"span\");\n\tconst nextPage = document.createElement(\"button\");\n\n\t// Footer: Page size controls\n\tconst pageSizeContainer = document.createElement(\"div\");\n\tconst pageSizeLabel = document.createElement(\"label\");\n\tconst pageSizeSelect = document.createElement(\"select\");\n\n\t// Add CSS classes\n\ttableContainer.classList.add(\"table-container\");\n\tfooter.classList.add(\"footer\");\n\tpaginationContainer.classList.add(\"pagination\");\n\tpageSizeContainer.classList.add(\"page-size\");\n\n\t// Configure pagination buttons\n\tprevPage.type = \"button\";\n\tnextPage.type = \"button\";\n\tprevPage.textContent = \"Prev\";\n\tnextPage.textContent = \"Next\";\n\n\t// Configure page size selector\n\tpageSizeLabel.textContent = \"Page Size\";\n\tfor (const size of [10, 25, 50, 100]) {\n\t\tconst option = document.createElement(\"option\");\n\t\toption.value = size;\n\t\toption.textContent = size;\n\t\tpageSizeSelect.appendChild(option);\n\t}\n\n\t// Add event listeners\n\tprevPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page - 1 });\n\t});\n\tnextPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page + 1 });\n\t});\n\tpageSizeSelect.addEventListener(\"change\", (e) => {\n\t\tconst newSize = Number(e.target.value);\n\t\tif (newSize) {\n\t\t\tmodel.send({ type: \"page_size_change\", page_size: newSize });\n\t\t}\n\t});\n\n\tfunction updateUI() {\n\t\tconst totalPages = Math.ceil(rowCount / pageSize);\n\t\trowCountLabel.textContent = `${rowCount.toLocaleString()} total 
rows`;\n\t\tpaginationLabel.textContent = `Page ${page + 1} of ${totalPages || 1}`;\n\t\tprevPage.disabled = page === 0;\n\t\tnextPage.disabled = page >= totalPages - 1;\n\t\tpageSizeSelect.value = pageSize;\n\t\ttableContainer.innerHTML = tableHtml;\n\t}\n\n\tmodel.onMsg((msg) => {\n\t\tconsole.log(\"message received\", msg);\n\t\tif (msg.type === \"update\") {\n\t\t\tpage = msg.page;\n\t\t\tpageSize = msg.page_size;\n\t\t\trowCount = msg.row_count;\n\t\t\ttableHtml = msg.table_html;\n\t\t\tupdateUI();\n\t\t}\n\t});\n\n\t// Assemble the DOM\n\tpaginationContainer.appendChild(prevPage);\n\tpaginationContainer.appendChild(paginationLabel);\n\tpaginationContainer.appendChild(nextPage);\n\n\tpageSizeContainer.appendChild(pageSizeLabel);\n\tpageSizeContainer.appendChild(pageSizeSelect);\n\n\tfooter.appendChild(rowCountLabel);\n\tfooter.appendChild(paginationContainer);\n\tfooter.appendChild(pageSizeContainer);\n\n\tel.appendChild(tableContainer);\n\tel.appendChild(footer);\n\n\t// Initial UI state\n\tupdateUI();\n}\n\nexport default { render };\n", - "_model_module": "anywidget", - "_model_module_version": "~0.9.*", - "_model_name": "AnyModel", - "_view_count": null, - "_view_module": "anywidget", - "_view_module_version": "~0.9.*", - "_view_name": "AnyView", - "layout": "IPY_MODEL_548b9bae022d4dc5a38a6a8740276387", - "tabbable": null, - "tooltip": null - } - }, - "d2ab83b1e9f24674a73a12094be1e831": { - "model_module": "anywidget", - "model_module_version": "~0.9.*", - "model_name": "AnyModel", - "state": { - "_anywidget_id": "bigframes.display.anywidget.TableWidget", - "_css": "/**\n * Copyright 2025 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n.bigframes-widget {\n\tdisplay: inline-block;\n}\n\n.bigframes-widget .table-container {\n\tmax-height: 620px;\n\toverflow: auto;\n}\n\n.bigframes-widget .footer {\n\talign-items: center;\n\tdisplay: flex;\n\tfont-size: 0.8rem;\n\tpadding-top: 8px;\n}\n\n.bigframes-widget .footer > * {\n\tflex: 1;\n}\n\n.bigframes-widget .pagination {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: center;\n\tpadding: 4px;\n}\n\n.bigframes-widget .page-size {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: end;\n}\n\n.bigframes-widget table {\n\tborder-collapse: collapse;\n\ttext-align: left;\n}\n\n.bigframes-widget th {\n\tbackground-color: var(--colab-primary-surface-color, var(--jp-layout-color0));\n\t/* Uncomment once we support sorting: cursor: pointer; */\n\tposition: sticky;\n\ttop: 0;\n\tz-index: 1;\n}\n\n.bigframes-widget button {\n\tcursor: pointer;\n\tdisplay: inline-block;\n\ttext-align: center;\n\ttext-decoration: none;\n\tuser-select: none;\n\tvertical-align: middle;\n}\n\n.bigframes-widget button:disabled {\n\topacity: 0.65;\n\tpointer-events: none;\n}\n", - "_dom_classes": [], - "_esm": "\nfunction render({ model, el }) {\n\tconsole.log(\"render called\");\n\t// Main container with a unique class for CSS 
scoping\n\tel.classList.add(\"bigframes-widget\");\n\n\t// State\n\tlet page = 0;\n\tlet pageSize = 10;\n\tlet rowCount = 0;\n\tlet tableHtml = \"\";\n\n\t// Structure\n\tconst tableContainer = document.createElement(\"div\");\n\tconst footer = document.createElement(\"div\");\n\n\t// Footer: Total rows label\n\tconst rowCountLabel = document.createElement(\"div\");\n\n\t// Footer: Pagination controls\n\tconst paginationContainer = document.createElement(\"div\");\n\tconst prevPage = document.createElement(\"button\");\n\tconst paginationLabel = document.createElement(\"span\");\n\tconst nextPage = document.createElement(\"button\");\n\n\t// Footer: Page size controls\n\tconst pageSizeContainer = document.createElement(\"div\");\n\tconst pageSizeLabel = document.createElement(\"label\");\n\tconst pageSizeSelect = document.createElement(\"select\");\n\n\t// Add CSS classes\n\ttableContainer.classList.add(\"table-container\");\n\tfooter.classList.add(\"footer\");\n\tpaginationContainer.classList.add(\"pagination\");\n\tpageSizeContainer.classList.add(\"page-size\");\n\n\t// Configure pagination buttons\n\tprevPage.type = \"button\";\n\tnextPage.type = \"button\";\n\tprevPage.textContent = \"Prev\";\n\tnextPage.textContent = \"Next\";\n\n\t// Configure page size selector\n\tpageSizeLabel.textContent = \"Page Size\";\n\tfor (const size of [10, 25, 50, 100]) {\n\t\tconst option = document.createElement(\"option\");\n\t\toption.value = size;\n\t\toption.textContent = size;\n\t\tpageSizeSelect.appendChild(option);\n\t}\n\n\t// Add event listeners\n\tprevPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page - 1 });\n\t});\n\tnextPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page + 1 });\n\t});\n\tpageSizeSelect.addEventListener(\"change\", (e) => {\n\t\tconst newSize = Number(e.target.value);\n\t\tif (newSize) {\n\t\t\tmodel.send({ type: \"page_size_change\", page_size: newSize });\n\t\t}\n\t});\n\n\tfunction updateUI() {\n\t\tconst totalPages = Math.ceil(rowCount / pageSize);\n\t\trowCountLabel.textContent = `${rowCount.toLocaleString()} total rows`;\n\t\tpaginationLabel.textContent = `Page ${page + 1} of ${totalPages || 1}`;\n\t\tprevPage.disabled = page === 0;\n\t\tnextPage.disabled = page >= totalPages - 1;\n\t\tpageSizeSelect.value = pageSize;\n\t\ttableContainer.innerHTML = tableHtml;\n\t}\n\n\tmodel.onMsg((msg) => {\n\t\tconsole.log(\"message received\", msg);\n\t\tif (msg.type === \"update\") {\n\t\t\tpage = msg.page;\n\t\t\tpageSize = msg.page_size;\n\t\t\trowCount = msg.row_count;\n\t\t\ttableHtml = msg.table_html;\n\t\t\tupdateUI();\n\t\t}\n\t});\n\n\t// Assemble the DOM\n\tpaginationContainer.appendChild(prevPage);\n\tpaginationContainer.appendChild(paginationLabel);\n\tpaginationContainer.appendChild(nextPage);\n\n\tpageSizeContainer.appendChild(pageSizeLabel);\n\tpageSizeContainer.appendChild(pageSizeSelect);\n\n\tfooter.appendChild(rowCountLabel);\n\tfooter.appendChild(paginationContainer);\n\tfooter.appendChild(pageSizeContainer);\n\n\tel.appendChild(tableContainer);\n\tel.appendChild(footer);\n\n\t// Initial UI state\n\tupdateUI();\n}\n\nexport default { render };\n", - "_model_module": "anywidget", - "_model_module_version": "~0.9.*", - "_model_name": "AnyModel", - "_view_count": null, - "_view_module": "anywidget", - "_view_module_version": "~0.9.*", - "_view_name": "AnyView", - "layout": "IPY_MODEL_45d720d8fd954a529cc657457f681ee1", - "tabbable": null, - "tooltip": null - } - }, - 
"e7c10bb6833b4f649a26d5f33b00897b": { - "model_module": "anywidget", - "model_module_version": "~0.9.*", - "model_name": "AnyModel", - "state": { - "_anywidget_id": "bigframes.display.anywidget.TableWidget", - "_css": "/**\n * Copyright 2025 Google LLC\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *\n * http://www.apache.org/licenses/LICENSE-2.0\n *\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\n.bigframes-widget {\n\tdisplay: inline-block;\n}\n\n.bigframes-widget .table-container {\n\tmax-height: 620px;\n\toverflow: auto;\n}\n\n.bigframes-widget .footer {\n\talign-items: center;\n\tdisplay: flex;\n\tfont-size: 0.8rem;\n\tpadding-top: 8px;\n}\n\n.bigframes-widget .footer > * {\n\tflex: 1;\n}\n\n.bigframes-widget .pagination {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: center;\n\tpadding: 4px;\n}\n\n.bigframes-widget .page-size {\n\talign-items: center;\n\tdisplay: flex;\n\tflex-direction: row;\n\tgap: 4px;\n\tjustify-content: end;\n}\n\n.bigframes-widget table {\n\tborder-collapse: collapse;\n\ttext-align: left;\n}\n\n.bigframes-widget th {\n\tbackground-color: var(--colab-primary-surface-color, var(--jp-layout-color0));\n\t/* Uncomment once we support sorting: cursor: pointer; */\n\tposition: sticky;\n\ttop: 0;\n\tz-index: 1;\n}\n\n.bigframes-widget button {\n\tcursor: pointer;\n\tdisplay: inline-block;\n\ttext-align: center;\n\ttext-decoration: none;\n\tuser-select: none;\n\tvertical-align: middle;\n}\n\n.bigframes-widget button:disabled {\n\topacity: 0.65;\n\tpointer-events: none;\n}\n", - "_dom_classes": [], - "_esm": "\nfunction render({ model, el }) {\n\tconsole.log(\"render called\");\n\t// Main container with a unique class for CSS scoping\n\tel.classList.add(\"bigframes-widget\");\n\n\t// State\n\tlet page = 0;\n\tlet pageSize = 10;\n\tlet rowCount = 0;\n\tlet tableHtml = \"\";\n\n\t// Structure\n\tconst tableContainer = document.createElement(\"div\");\n\tconst footer = document.createElement(\"div\");\n\n\t// Footer: Total rows label\n\tconst rowCountLabel = document.createElement(\"div\");\n\n\t// Footer: Pagination controls\n\tconst paginationContainer = document.createElement(\"div\");\n\tconst prevPage = document.createElement(\"button\");\n\tconst paginationLabel = document.createElement(\"span\");\n\tconst nextPage = document.createElement(\"button\");\n\n\t// Footer: Page size controls\n\tconst pageSizeContainer = document.createElement(\"div\");\n\tconst pageSizeLabel = document.createElement(\"label\");\n\tconst pageSizeSelect = document.createElement(\"select\");\n\n\t// Add CSS classes\n\ttableContainer.classList.add(\"table-container\");\n\tfooter.classList.add(\"footer\");\n\tpaginationContainer.classList.add(\"pagination\");\n\tpageSizeContainer.classList.add(\"page-size\");\n\n\t// Configure pagination buttons\n\tprevPage.type = \"button\";\n\tnextPage.type = \"button\";\n\tprevPage.textContent = \"Prev\";\n\tnextPage.textContent = \"Next\";\n\n\t// Configure page size selector\n\tpageSizeLabel.textContent = \"Page Size\";\n\tfor (const size of [10, 25, 50, 100]) {\n\t\tconst option = 
document.createElement(\"option\");\n\t\toption.value = size;\n\t\toption.textContent = size;\n\t\tpageSizeSelect.appendChild(option);\n\t}\n\n\t// Add event listeners\n\tprevPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page - 1 });\n\t});\n\tnextPage.addEventListener(\"click\", () => {\n\t\tmodel.send({ type: \"page_change\", page: page + 1 });\n\t});\n\tpageSizeSelect.addEventListener(\"change\", (e) => {\n\t\tconst newSize = Number(e.target.value);\n\t\tif (newSize) {\n\t\t\tmodel.send({ type: \"page_size_change\", page_size: newSize });\n\t\t}\n\t});\n\n\tfunction updateUI() {\n\t\tconst totalPages = Math.ceil(rowCount / pageSize);\n\t\trowCountLabel.textContent = `${rowCount.toLocaleString()} total rows`;\n\t\tpaginationLabel.textContent = `Page ${page + 1} of ${totalPages || 1}`;\n\t\tprevPage.disabled = page === 0;\n\t\tnextPage.disabled = page >= totalPages - 1;\n\t\tpageSizeSelect.value = pageSize;\n\t\ttableContainer.innerHTML = tableHtml;\n\t}\n\n\tmodel.onMsg((msg) => {\n\t\tconsole.log(\"message received\", msg);\n\t\tif (msg.type === \"update\") {\n\t\t\tpage = msg.page;\n\t\t\tpageSize = msg.page_size;\n\t\t\trowCount = msg.row_count;\n\t\t\ttableHtml = msg.table_html;\n\t\t\tupdateUI();\n\t\t}\n\t});\n\n\t// Assemble the DOM\n\tpaginationContainer.appendChild(prevPage);\n\tpaginationContainer.appendChild(paginationLabel);\n\tpaginationContainer.appendChild(nextPage);\n\n\tpageSizeContainer.appendChild(pageSizeLabel);\n\tpageSizeContainer.appendChild(pageSizeSelect);\n\n\tfooter.appendChild(rowCountLabel);\n\tfooter.appendChild(paginationContainer);\n\tfooter.appendChild(pageSizeContainer);\n\n\tel.appendChild(tableContainer);\n\tel.appendChild(footer);\n\n\t// Initial UI state\n\tupdateUI();\n}\n\nexport default { render };\n", - "_model_module": "anywidget", - "_model_module_version": "~0.9.*", - "_model_name": "AnyModel", - "_view_count": null, - "_view_module": "anywidget", - "_view_module_version": "~0.9.*", - "_view_name": "AnyView", - "layout": "IPY_MODEL_2ac7d45b9bce40f196823982403f3bf3", - "tabbable": null, - "tooltip": null - } - } - }, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/e2e_RAG_bk.ipynb b/notebooks/e2e_RAG_bk.ipynb deleted file mode 100644 index 9b2f23a483..0000000000 --- a/notebooks/e2e_RAG_bk.ipynb +++ /dev/null @@ -1,624 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2025 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. 
Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Preparation"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "First, import the BigFrames modules."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import bigframes\n",
-    "import bigframes.pandas as bpd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Make sure the BigFrames version is at least `1.36.0`"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from packaging.version import Version\n",
-    "\n",
-    "assert Version(bigframes.__version__) >= Version(\"1.36.0\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Connect to the test environment"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: \u001b[93mBigFrames Blob is still under experiments. It may not work and subject\n",
-      "to change in the future.\u001b[0m\n",
-      "  warnings.warn(msg, category=bfe.PreviewWarning)\n",
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/bigquery_options.py:364: UserWarning: \u001b[93mThis is an advanced configuration option for directly setting\n",
-      "endpoints. Incorrect use may lead to unexpected behavior or system\n",
-      "instability. Proceed only if you fully understand its implications.\u001b[0m\n",
-      "  warnings.warn(msg)\n"
-     ]
-    }
-   ],
-   "source": [
-    "bigframes.options.experiments.blob = True\n",
-    "bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n",
-    "                                                                 \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n",
-    "                                                                 \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# PDF chunk"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Retrieval of PDF URLs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: \u001b[93mNo explicit location is set, so using location US for the session.\u001b[0m\n",
-      "  return func(get_global_session(), *args, **kwargs)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job b261812f-a98e-4453-9a23-7f5c8ab7811b is DONE. 0 Bytes processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 370a4f1e-3b2c-405d-86f6-92167f03d464 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\")\n", - "chunks_df.columns = [\"uri\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "#chunks_df = chunks_df.head(50)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# copy twice for testing\n", - "#copies = [chunks_df] * 10000\n", - "#chunks_df = bpd.concat(copies, ignore_index=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Text extraction, and chunking" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 3f5faa00-8366-4bfc-87f5-1dbe18e355fb is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4117: PreviewWarning: \u001b[93maxis=1 scenario is in preview.\u001b[0m\n", - " warnings.warn(msg, category=bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "chunks_df[\"chunk_text\"] = chunks_df[\"uri\"].blob.pdf_chunk(connection=bq_connection, chunk_size=2000, overlap_size=200)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Explode column for future processing." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job ee1ac492-65b8-4c1a-aac2-9abfadea3251 is DONE. 7.3 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0bec4448-73ac-4478-88c2-eef74552fb3b is DONE. 8.6 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job c3a4fd8f-d42f-4d54-93ae-3b6e40f0e9ad is DONE. 8.6 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 Hydra: Bidirectional State Space Models\n", - "Throug...\n", - "0 multiple domains, including language and visio...\n", - "0 Mixing\n", - "��ℳ��&\n", - "Figure 1: (Left) A schematic of ...\n", - "0 parameterizations underpin efficient sequence ...\n", - "0 more\n", - "coherent and theoretically grounded advan...\n", - "0 important characteristics of downstream sequen...\n", - "0 preprocessing function and the matrix construc...\n", - "0 Sequence Aligned Matrices (SAM) to\n", - "systematica...\n", - "0 Toeplitz matrix mixer; GSS [26] adds a data-de...\n", - "0 (FNet is a structured matrix mixer without seq...\n", - "0 each generated fromQand K. 
Specifically, each\n", - "...\n", - "0 ��$\"��$:&×\"��&\"��$\"��$:'×\"��'\"\n", - "��&\"��&:!×\"��!\"...\n", - "0 represented within the matrix mixer framework,...\n", - "0 defined\n", - "as follows: a matrixM is N-quasisepara...\n", - "0 This generosity in the rank-based definition s...\n", - "0 consequence of the favorable mathematical prop...\n", - "0 84.1 88.2 69.1 91.0 85.9 47.6 83.9 78.4\n", - "Attent...\n", - "0 analyzing the matrix mixer framework through e...\n", - "0 rigorous and focused comparison between differ...\n", - "0 Appendix D.1.\n", - "Results. The results presented i...\n", - "0 BERT – trained with the latest HuggingFace\n", - "rec...\n", - "0 from 0.3 to 0.5 as stronger\n", - "regularization. We...\n", - "0 Michael Poli, James Zou, Atri\n", - "Rudra, and Chris...\n", - "0 of\n", - "deep bidirectional transformers for languag...\n", - "0 vision and pattern recognition.\n", - "2016, pp. 770–...\n", - "Name: chunk_text, dtype: string" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chunk_df_exploded.cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generate Embeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generation of embeddings within BigFrames." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job b31a3f5e-943a-4c67-9c62-b8b058cced45 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from bigframes.ml import llm\n", - "\n", - "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job e1aacf23-0af8-4a45-ba15-e3cfae45370b is DONE. 8.6 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mInterpreting JSON column(s) as the `db_dtypes.dbjson` extension type\n", - "is in preview; this behavior may change in future versions.\u001b[0m\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job d4f524a3-4005-401c-978d-76e6e6a9b496 is DONE. 8.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5e88b6d3-f68c-4f31-84d9-6b0cbd6c5d47 is DONE. 8.5 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# generate embeddings\n", - "embeddings_df = text_embedding_model.predict(chunk_df_exploded)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create Embedding table in Bigquery if not exist." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "test_project_id = \"bigframes-dev\"\n", - "test_dataset_id = \"shuowei_test_us\"\n", - "test_table_id = \"pdf_chunk_embedding_v10\"\n", - "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save embedding into a BigQuery table for downstream processing.." - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 29727e93-2f99-4331-95c9-a6ab78fd06b6 is DONE. 34.8 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'bigframes-dev.shuowei_test_us.pdf_chunk_embedding_v10'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\n", - "embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists=\"replace\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create vector search index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Construction of an index over these embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'embedding_table_id_v11' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[15], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mbigframes\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbigquery\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mbbq\u001b[39;00m\n\u001b[1;32m 2\u001b[0m bbq\u001b[38;5;241m.\u001b[39mcreate_vector_index(\n\u001b[0;32m----> 3\u001b[0m table_id\u001b[38;5;241m=\u001b[39m\u001b[43membedding_table_id_v11\u001b[49m,\n\u001b[1;32m 4\u001b[0m column_name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mml_generate_embedding_result\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 5\u001b[0m distance_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcosine\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m index_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mivf\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m ivf_options\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnum_lists\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;241m100\u001b[39m},\n\u001b[1;32m 8\u001b[0m )\n", - "\u001b[0;31mNameError\u001b[0m: name 'embedding_table_id_v11' is not defined" - ] - } - ], - "source": [ - "import bigframes.bigquery as bbq\n", - "bbq.create_vector_index(\n", - " table_id=embedding_table_id_v11,\n", - " column_name=\"ml_generate_embedding_result\",\n", - " distance_type=\"cosine\",\n", - " index_type=\"ivf\",\n", - " ivf_options={\"num_lists\": 100},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Search with pointers to the original pdf" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execution of semantic search, with results linked back to the original PDFs" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# temp test code, reada from gbq\n", - "embeddings_df = bpd.read_gbq(embedding_table_id)\n", - "embedding_table_id_v11 = \"bigframes-dev.shuowei_test_us.pdf_chunk_embedding_v11\"\n", - "# copy twice for testing\n", - "copies = [embeddings_df] * 5\n", - "embeddings_df= bpd.concat(copies, ignore_index=True)\n", - "type(embeddings_df)\n", - "embeddings_df.to_gbq(destination_table=embedding_table_id_v11, if_exists=\"replace\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bigframes.options.experiments.semantic_operators = True\n", - "\n", - "embeddings_df.semantics.search(\n", - " \"ml_generate_embedding_result\", \n", - " \"reinforce\", \n", - " top_k=3, \n", - " model=text_embedding_model, \n", - " score_column=\"distance\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# generate embedding for the word for searching\n", - "searched_words = [\"reinforce\"]\n", - "searched_words_embeddings = text_embedding_model.predict(searched_words)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "search_query = bpd.DataFrame({\"query_id\": [\"dog\", \"cat\"], embedding=})\n", - "result_df = bbq.vector_search(\n", - " base_table=\n", - " column_to_search=\n", - " query=search_query,\n", - " distance_type=\"cosine\",\n", - " top_k=5,\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/e2e_RAG_debug.ipynb b/notebooks/e2e_RAG_debug.ipynb deleted file mode 100644 index c112e5149f..0000000000 --- a/notebooks/e2e_RAG_debug.ipynb +++ /dev/null @@ -1,305 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2025 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, import the BigFrames modules." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure the BigFrames version is at least `1.38.0`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from packaging.version import Version\n", - "\n", - "assert Version(bigframes.__version__) >= Version(\"1.38.0\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:68: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject\n", - "to change in the future.\n", - " warnings.warn(msg, category=bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "bigframes.options.experiments.blob = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PDF chunk" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Retrieval of PDF URLs, text extraction, and chunking." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return func(get_global_session(), *args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job a777cf43-fe93-49f6-8a48-2db57a248a85 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 63d6cafb-56e2-435a-b00a-b3b35badcab0 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/failed_pdf/*\", name=\"pdf\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/functions/_function_session.py:804: PreviewWarning: udf is in preview.\n", - " warnings.warn(\"udf is in preview.\", category=bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job e39925b5-632d-4d8d-8282-141dfad0463f is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "NotImplementedError", - "evalue": "Cannot mix Series with other types. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. 
You are currently running BigFrames version 2.0.0.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m bq_connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigframes-dev.us.bigframes-default-connection\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m chunks_df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchunk_text\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mchunks_df\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpdf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblob\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpdf_chunk\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# connection=bq_connection, verbose=True)#, chunk_size=2000, overlap_size=200,\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;66;03m#max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\u001b[39;00m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# notes: use connection is not necessary, we can use default connection.\u001b[39;00m\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# However, in current stage, using a specfic conneciton will grant more quota\u001b[39;00m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:180\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 172\u001b[0m submit_pandas_labels(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_block\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 174\u001b[0m class_name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 178\u001b[0m task\u001b[38;5;241m=\u001b[39mPANDAS_PARAM_TRACKING_TASK,\n\u001b[1;32m 179\u001b[0m )\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 182\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mpop()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:164\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 161\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m 
(\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 166\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# self._block.expr.session.bqclient. Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/operations/blob.py:735\u001b[0m, in \u001b[0;36mBlobAccessor.pdf_chunk\u001b[0;34m(self, connection, chunk_size, overlap_size, max_batching_rows, container_cpu, container_memory, verbose)\u001b[0m\n\u001b[1;32m 714\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverlap_size must be smaller than chunk_size.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 716\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[1;32m 717\u001b[0m \u001b[38;5;124;03mpdf_chunk_udf = blob_func.TransformFunction(\u001b[39;00m\n\u001b[1;32m 718\u001b[0m \u001b[38;5;124;03m blob_func.pdf_chunk_def,\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[38;5;124;03m#df[\"res\"] = df[\"pdf\"].apply(blob_func.pdf_chunk_func)\u001b[39;00m\n\u001b[1;32m 734\u001b[0m \u001b[38;5;124;03m'''\u001b[39;00m\n\u001b[0;32m--> 735\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43mbpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\n\u001b[1;32m 736\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpdf_url_json\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_runtime_json_str\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mR\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 737\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mchunk_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunk_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 738\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43moverlap_size\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43moverlap_size\u001b[49m\n\u001b[1;32m 739\u001b[0m \u001b[43m\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21m_call_pdf_chunk_udf\u001b[39m(row):\n\u001b[1;32m 742\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_block\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mcall_udf(\n\u001b[1;32m 743\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mshuowei_bb1_us.pdf_chunk_def\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;66;03m# Use the full UDF name\u001b[39;00m\n\u001b[1;32m 744\u001b[0m row[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpdf_url_json\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 750\u001b[0m container_memory\u001b[38;5;241m=\u001b[39mcontainer_memory,\n\u001b[1;32m 751\u001b[0m )\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:180\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 172\u001b[0m submit_pandas_labels(\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_block\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 174\u001b[0m class_name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 178\u001b[0m task\u001b[38;5;241m=\u001b[39mPANDAS_PARAM_TRACKING_TASK,\n\u001b[1;32m 179\u001b[0m )\n\u001b[0;32m--> 180\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 182\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mpop()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:164\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 161\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 163\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 164\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 165\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 166\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 167\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# self._block.expr.session.bqclient. 
Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n",
- "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/dataframe.py:157\u001b[0m, in \u001b[0;36mDataFrame.__init__\u001b[0;34m(self, data, index, columns, dtype, copy, session)\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m (\n\u001b[1;32m 147\u001b[0m utils\u001b[38;5;241m.\u001b[39mis_dict_like(data)\n\u001b[1;32m 148\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(data) \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 151\u001b[0m )\n\u001b[1;32m 152\u001b[0m ):\n\u001b[1;32m 153\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28misinstance\u001b[39m(data[key], bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries) \u001b[38;5;28;01mfor\u001b[39;00m key \u001b[38;5;129;01min\u001b[39;00m data\u001b[38;5;241m.\u001b[39mkeys()\n\u001b[1;32m 155\u001b[0m ):\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# TODO(tbergeron): Support local list/series data by converting to memtable.\u001b[39;00m\n\u001b[0;32m--> 157\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 158\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot mix Series with other types. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 159\u001b[0m )\n\u001b[1;32m 160\u001b[0m keys \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(data\u001b[38;5;241m.\u001b[39mkeys())\n\u001b[1;32m 161\u001b[0m first_label, first_series \u001b[38;5;241m=\u001b[39m keys[\u001b[38;5;241m0\u001b[39m], data[keys[\u001b[38;5;241m0\u001b[39m]]\n",
- "\u001b[0;31mNotImplementedError\u001b[0m: Cannot mix Series with other types. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.0.0."
- ]
- }
- ],
- "source": [
- "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
- "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk()\n",
- "# connection=bq_connection, verbose=True)#, chunk_size=2000, overlap_size=200,\n",
- " #max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\n",
- "# notes: using a connection is not necessary; we can use the default connection.\n",
- "# However, at the current stage, using a specific connection will grant more quota"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Explode column for future processing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "chunks_df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "chunk_df_exploded"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Generation of embeddings within BigFrames."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create the embedding table in BigQuery if it does not exist."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Save embedding into a BigQuery table for downstream processing."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Create vector search index"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Construction of an index over these embeddings."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Search with pointers to the original PDF"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Execution of vector search, with results linked back to the original PDFs."
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.15"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/e2e_RAG_prod.ipynb b/notebooks/e2e_RAG_prod.ipynb
deleted file mode 100644
index aa00a9013d..0000000000
--- a/notebooks/e2e_RAG_prod.ipynb
+++ /dev/null
@@ -1,571 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Copyright 2025 Google LLC\n",
- "#\n",
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# https://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. "
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Preparation"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "First, import the BigFrames modules."
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure the BigFrames version is at least `1.38.0`" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from packaging.version import Version\n", - "\n", - "assert Version(bigframes.__version__) >= Version(\"1.38.0\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set blob to true for testing" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", - " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" - ] - } - ], - "source": [ - "bigframes.options.experiments.blob = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PDF chunk" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Retrieval of PDF URLs, text extraction, and chunking." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-ka1m0ue4fptfmt9siejdd5lom7p39upa.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fpydata-google-auth.readthedocs.io%2Fen%2Flatest%2Foauth.html&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=GE3CiB2iPQ32Mbcgug2H68pdMulb7j&prompt=consent&access_type=offline\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Project must be set to initialize BigQuery client. 
Try setting `bigframes.options.bigquery.project` first.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m chunks_df \u001b[38;5;241m=\u001b[39m \u001b[43mbpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_glob_path\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgs://garrettwu_bucket/pdfs/*\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m chunks_df\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muri\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 3\u001b[0m bq_connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigframes-dev.us.bigframes-default-connection\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/pandas/io/api.py:606\u001b[0m, in \u001b[0;36mfrom_glob_path\u001b[0;34m(path, connection, name)\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfrom_glob_path\u001b[39m(\n\u001b[1;32m 604\u001b[0m path: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;241m*\u001b[39m, connection: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, name: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 605\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mdataframe\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m--> 606\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mglobal_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_default_session\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 607\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mSession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_glob_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 608\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 609\u001b[0m \u001b[43m \u001b[49m\u001b[43mconnection\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconnection\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 611\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/global_session.py:114\u001b[0m, in \u001b[0;36mwith_default_session\u001b[0;34m(func_, *args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwith_default_session\u001b[39m(func_: Callable[\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m, _T], \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m _T:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func_(\u001b[43mget_global_session\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/global_session.py:103\u001b[0m, in \u001b[0;36mget_global_session\u001b[0;34m()\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _global_session_lock:\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _global_session \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m _global_session \u001b[38;5;241m=\u001b[39m \u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[43m \u001b[49m\u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbigquery\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _global_session\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/__init__.py:2188\u001b[0m, in \u001b[0;36mconnect\u001b[0;34m(context)\u001b[0m\n\u001b[1;32m 2187\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mconnect\u001b[39m(context: Optional[bigquery_options\u001b[38;5;241m.\u001b[39mBigQueryOptions] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Session:\n\u001b[0;32m-> 2188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSession\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m 
_call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/__init__.py:170\u001b[0m, in \u001b[0;36mSession.__init__\u001b[0;34m(self, context, clients_provider)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_clients_provider \u001b[38;5;241m=\u001b[39m clients_provider\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 170\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_clients_provider \u001b[38;5;241m=\u001b[39m \u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclients\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mClientsProvider\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_regional_endpoints\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muse_regional_endpoints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[43m \u001b[49m\u001b[43mcredentials\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcredentials\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 175\u001b[0m \u001b[43m \u001b[49m\u001b[43mapplication_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapplication_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[43m \u001b[49m\u001b[43mbq_kms_key_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_bq_kms_key_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 177\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mclient_endpoints_override\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient_endpoints_override\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 178\u001b[0m \u001b[43m \u001b[49m\u001b[43mrequests_transport_adapters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequests_transport_adapters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 179\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494\u001b[39;00m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;66;03m# has been fixed. The ibis client changes the default query job config\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;66;03m# so we are going to remember the current config and restore it after\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;66;03m# the ibis client has been created\u001b[39;00m\n\u001b[1;32m 185\u001b[0m original_default_query_job_config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mdefault_query_job_config\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/clients.py:107\u001b[0m, in \u001b[0;36mClientsProvider.__init__\u001b[0;34m(self, project, location, use_regional_endpoints, credentials, application_name, bq_kms_key_name, client_endpoints_override, requests_transport_adapters)\u001b[0m\n\u001b[1;32m 100\u001b[0m project \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 101\u001b[0m project\n\u001b[1;32m 102\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m os\u001b[38;5;241m.\u001b[39mgetenv(_ENV_DEFAULT_PROJECT)\n\u001b[1;32m 103\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m typing\u001b[38;5;241m.\u001b[39mcast(Optional[\u001b[38;5;28mstr\u001b[39m], credentials_project)\n\u001b[1;32m 104\u001b[0m )\n\u001b[1;32m 106\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m project:\n\u001b[0;32m--> 107\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 108\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mProject must be set to initialize BigQuery client. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 109\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mTry setting `bigframes.options.bigquery.project` first.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 110\u001b[0m )\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_application_name \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 113\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m_get_application_names()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mapplication_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m application_name\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m _get_application_names()\n\u001b[1;32m 116\u001b[0m )\n\u001b[1;32m 117\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_project \u001b[38;5;241m=\u001b[39m project\n", - "\u001b[0;31mValueError\u001b[0m: Project must be set to initialize BigQuery client. Try setting `bigframes.options.bigquery.project` first." 
- ]
- }
- ],
- "source": [
- "chunks_df = bpd.from_glob_path(\"gs://garrettwu_bucket/pdfs/*\")\n",
- "chunks_df.columns = [\"uri\"]\n",
- "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
- "chunks_df[\"chunk_text\"] = chunks_df[\"uri\"].blob.pdf_chunk(\n",
- " connection=bq_connection, chunk_size=2000, overlap_size=200,\n",
- " max_batching_rows=1\n",
- ")\n",
- "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()\n",
- "chunk_df_exploded.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
- " return func(get_global_session(), *args, **kwargs)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "Query job 4b6facd8-54f1-4a58-a2cb-db230bfc1388 is DONE. 0 Bytes processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Query job 9c194a18-b5ff-425f-9312-4dce3e21f4bf is DONE. 0 Bytes processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\", name=\"pdf\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "# copy files to generate more inputs, now we have 1000 PDF files\n",
- "#copies = [chunks_df] * 20\n",
- "#chunks_df = bpd.concat(copies, ignore_index=True)\n",
- "#chunks_df = chunks_df.cache()\n",
- "chunks_df = chunks_df.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# copy files to generate more inputs, now we have 10,000 PDF files\n",
- "copies = [chunks_df] * 100\n",
- "chunks_df = bpd.concat(copies, ignore_index=True)\n",
- "chunks_df = chunks_df.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# copy files again, now we have 100,000 PDF files\n",
- "copies = [chunks_df] * 10\n",
- "chunks_df = bpd.concat(copies, ignore_index=True)\n",
- "chunks_df = chunks_df.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job 58e48da6-87fb-4d17-97e0-d3d7e02ee58f is DONE. 0 Bytes processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4162: PreviewWarning: axis=1 scenario is in preview.\n",
- " warnings.warn(msg, category=bfe.PreviewWarning)\n"
- ]
- }
- ],
- "source": [
- "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
- "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk(\n",
- " connection=bq_connection)\n",
- "# notes: using a connection is not necessary; we can use the default connection.\n",
- "# However, at the current stage, using a specific connection will grant more quota"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Explode column for future processing."
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save to a temporary table" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job c0c05fe3-c1cb-4d59-a1a1-c2a3d8582c94 is DONE. 49.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "chunk_df_exploded = chunk_df_exploded.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 2145211c-d60f-45d2-acf8-c19c9176298f is DONE. 457.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 77319c66-4155-48c2-af63-cf5a558e3cf5 is DONE. 455.2 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 Integrating Reinforcement Learning, Action Mod...\n", - "0 Benyamin)\n", - "Preprint submitted to Artificial Int...\n", - "0 classical, discrete, environments.\n", - "Therefore, ...\n", - "0 setting we consider in this work isoffline lea...\n", - "0 more complex\n", - "problems that required longer-ter...\n", - "0 domain models for planning, and RL. We also pr...\n", - "0 means that a planning domain defines parameter...\n", - "0 which actions to perform in\n", - "order to collect n...\n", - "0 these\n", - "assumptions, NSAM is guaranteed to retur...\n", - "0 policy.\n", - "Off-policy algorithms are algorithms t...\n", - "0 the\n", - "environment, mining resources, collecting ...\n", - "0 must:\n", - "1. Harvest at least one wood block from ...\n", - "0 irreversible and the amount of resources in a ...\n", - "0 created by observing an expert solve different...\n", - "0 Moreover, most actions are TP TO actions, whic...\n", - "0 our RL models. Moreover, our gym environment i...\n", - "0 within that time limit,\n", - "we consider the run as...\n", - "0 length.\n", - "4https://imitation.readthedocs.io\n", - "5htt...\n", - "0 planning lies in its capacity to generalize ac...\n", - "0 for the simpler\n", - "Craft Wooden Sword, BC is actu...\n", - "0 the number of episodes\n", - "in which the agent succ...\n", - "0 policy using higher-quality examples. Figure 1...\n", - "0 search\n", - "processes may require higher computatio...\n", - "0 right, (1, 1), move up, (1, 2), move right, (2...\n", - "0 methodological tool to solve problems when pla...\n", - "Name: chunk_text, dtype: string" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chunk_df_exploded" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generate Embeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generation of embeddings within BigFrames." 
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from bigframes.ml import llm\n",
- "\n",
- "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")\n",
- "embeddings_df = text_embedding_model.predict(chunk_df_exploded)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create the embedding table in BigQuery if it does not exist."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "test_project_id = \"bigframes-dev\"\n",
- "test_dataset_id = \"shuowei_test_us\"\n",
- "test_table_id = \"pdf_chunk_embedding\"\n",
- "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\""
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Save embedding into a BigQuery table for downstream processing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "embeddings_df.to_gbq(destination_table=embedding_table_id, if_exists=\"replace\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Create vector search index"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Construction of an index over these embeddings."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import bigframes.bigquery as bbq\n",
- "bbq.create_vector_index(\n",
- " table_id=embedding_table_id,\n",
- " column_name=\"ml_generate_embedding_result\",\n",
- " distance_type=\"cosine\",\n",
- " index_type=\"ivf\",\n",
- " ivf_options={\"num_lists\": 100},\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Search with pointers to the original PDF"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Execution of vector search, with results linked back to the original PDFs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# generate the embedding of the words for search\n",
- "searched_words = [\"reinforce\"]\n",
- "searched_words_embeddings = text_embedding_model.predict(searched_words)\n",
- "embedding_result_column = \"ml_generate_embedding_result\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# perform vector search\n",
- "search_result = (\n",
- " bbq.vector_search(\n",
- " base_table=embedding_table_id,\n",
- " column_to_search=embedding_result_column,\n",
- " query=searched_words_embeddings,\n",
- " query_column_to_search=embedding_result_column,\n",
- " top_k=3,\n",
- " )\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "search_result"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.15"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/e2e_RAG_prod_1M.ipynb b/notebooks/e2e_RAG_prod_1M.ipynb
deleted file mode 100644
index 5154aa804c..0000000000
--- a/notebooks/e2e_RAG_prod_1M.ipynb
+++ /dev/null
@@ -1,661 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
-
"execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2025 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, import the BigFrames modules." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Make sure the BigFrames version is at least `1.38.0`" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from packaging.version import Version\n", - "\n", - "assert Version(bigframes.__version__) >= Version(\"1.38.0\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set blob to true for testing" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:55: PreviewWarning: \u001b[93mBigFrames Blob is still under experiments. It may not work and subject\n", - "to change in the future.\u001b[0m\n", - " warnings.warn(msg, category=bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "bigframes.options.experiments.blob = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# PDF chunk" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Retrieval of PDF URLs, text extraction, and chunking." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: \u001b[93mNo explicit location is set, so using location US for the session.\u001b[0m\n", - " return func(get_global_session(), *args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job b4323a3d-e7f7-41b6-a122-59e5c1a5e6ba is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5455fdcd-3102-451d-bead-20356689285f is DONE. 0 Bytes processed. 
Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\", name=\"pdf\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# copy files to generate more inputs, now we have 1000 PDF files\n",
- "#copies = [chunks_df] * 20\n",
- "#chunks_df = bpd.concat(copies, ignore_index=True)\n",
- "#chunks_df = chunks_df.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job 0ab3a77c-24ea-4bac-8cd4-a29d6b489151 is DONE. 1.8 MB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# copy files to generate more inputs, now we have 10,000 PDF files\n",
- "copies = [chunks_df] * 100\n",
- "chunks_df = bpd.concat(copies, ignore_index=True)\n",
- "chunks_df = chunks_df.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job 23bfff29-3e86-4eaf-a8e3-c6323e5e41de is DONE. 1.6 MB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# copy files again, now we have 1,000,000 PDF files\n",
- "copies = [chunks_df] * 100\n",
- "chunks_df = bpd.concat(copies, ignore_index=True)\n",
- "chunks_df = chunks_df.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "#chunks_df"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job 6c1cc2ff-b0df-4678-84ba-153592077591 is DONE. 0 Bytes processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4117: PreviewWarning: \u001b[93maxis=1 scenario is in preview.\u001b[0m\n",
- " warnings.warn(msg, category=bfe.PreviewWarning)\n"
- ]
- }
- ],
- "source": [
- "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
- "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk(\n",
- " connection=bq_connection, chunk_size=2000, overlap_size=200,\n",
- " max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\n",
- "# notes: using a connection is not necessary; we can use the default connection.\n",
- "# However, at the current stage, using a specific connection will grant more quota"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Explode column for future processing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Save to a temporary table"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job 03167820-aa2b-4499-9d85-2ffae2770c82 is DONE. 158.0 MB processed. 
Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "chunk_df_exploded = chunk_df_exploded.cache()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job f03d7a4d-2977-43fe-82ab-a40341355a7d is DONE. 86.3 GB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Query job bde70e30-312b-4fa5-ba2e-35b441988e4c is DONE. 10.1 kB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "0 Integrating Reinforcement Learning, Action Mod...\n",
- "0 Benyamin)\n",
- "Preprint submitted to Artificial Int...\n",
- "0 classical, discrete, environments.\n",
- "Therefore, ...\n",
- "0 setting we consider in this work isoffline lea...\n",
- "0 more complex\n",
- "problems that required longer-ter...\n",
- "Name: chunk_text, dtype: string"
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chunk_df_exploded.head(5)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Generate Embeddings"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Generation of embeddings within BigFrames."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "Query job 741a86ed-0b1b-4c69-ad07-6f9859c6ec9f is DONE. 0 Bytes processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Query job fd427249-d3e1-42bd-86a0-a1952965effd is DONE. 85.7 GB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:114: PreviewWarning: \u001b[93mInterpreting JSON column(s) as the `db_dtypes.dbjson` extension type\n",
- "is in preview; this behavior may change in future versions.\u001b[0m\n",
- " warnings.warn(msg, bfe.PreviewWarning)\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "Query job a8fba082-2b9b-434c-95fa-c50644d03e03 is DONE. 4.2 GB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/html": [
- "Query job 9fc6a16b-03b7-4ea6-a69d-902576e0b251 is DONE. 4.2 GB processed. Open Job"
- ],
- "text/plain": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/ml/base.py:289: RuntimeWarning: \u001b[93mSome predictions failed. Check column ml_generate_embedding_status for\n",
- "detailed status. You may want to filter the failed rows and retry.\u001b[0m\n",
- " warnings.warn(msg, category=RuntimeWarning)\n"
- ]
- }
- ],
- "source": [
- "from bigframes.ml import llm\n",
- "\n",
- "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")\n",
- "embeddings_df = text_embedding_model.predict(chunk_df_exploded)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create the embedding table in BigQuery if it does not exist."
- ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "test_project_id = \"bigframes-dev\"\n", - "test_dataset_id = \"shuowei_test_us\"\n", - "test_table_id = \"pdf_chunk_embedding\"\n", - "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save embedding into a BigQuery table for downstream processing." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 7f8ac705-f32c-447a-ac5a-57a0c165dde0 is DONE. 104.8 GB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "'bigframes-dev.shuowei_test_us.pdf_chunk_embedding'" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "embeddings_df.to_gbq(destination_table=embedding_table_id,if_exists=\"replace\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create vector search index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Construction of an index over these embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job acfac823-c809-4928-8b1c-132f7f84ea11 is RUNNING. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "BadRequest", - "evalue": "400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/acfac823-c809-4928-8b1c-132f7f84ea11?maxResults=0&location=US&prettyPrint=false: Column 'ml_generate_embedding_result' must have the same array length, while the minimum length is 0 and the maximum length is 768.\n\nLocation: US\nJob ID: acfac823-c809-4928-8b1c-132f7f84ea11\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.39.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[INVALID_INPUT] message=QUERY_ERROR: [Column \\'ml_generate_embedding_result\\' must have the same array length, while the minimum length is 0 and the maximum length is 768.] 
debug details and Java stack trace omitted for brevity.'}]",
-     "output_type": "error",
-     "traceback": [
-      "BadRequest: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/acfac823-c809-4928-8b1c-132f7f84ea11?maxResults=0&location=US&prettyPrint=false: Column 'ml_generate_embedding_result' must have the same array length, while the minimum length is 0 and the maximum length is 768. Location: US. Job ID: acfac823-c809-4928-8b1c-132f7f84ea11. (Running BigFrames version 1.39.0; full traceback through read_gbq_query and the google-api-core retry helpers omitted for brevity.)"
-     ]
-    }
-   ],
-   "source": [
-    "import bigframes.bigquery as bbq\n",
-    "bbq.create_vector_index(\n",
-    "    table_id=embedding_table_id,\n",
-    "    column_name=\"ml_generate_embedding_result\",\n",
-    "    distance_type=\"cosine\",\n",
-    "    index_type=\"ivf\",\n",
-    "    ivf_options={\"num_lists\": 100},\n",
-    ")"
-   ]
-  },
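The BadRequest above shows that some rows of the embedding table carry an empty array (length 0) rather than a full 768-dimensional vector, which is what makes index creation fail. A minimal sketch of one possible guard before calling create_vector_index, assuming an `embeddings_df` DataFrame that holds the `ml_generate_embedding_result` column (the filtering step is an assumption, not something the notebook does):

    import bigframes.bigquery as bbq

    # Keep only rows whose embedding has the full 768 dimensions; empty
    # arrays typically correspond to rows where embedding generation failed.
    mask = bbq.array_length(embeddings_df["ml_generate_embedding_result"]) == 768
    embeddings_df = embeddings_df[mask]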
"metadata": {}, - "outputs": [], - "source": [ - "# generate the embedding of the words for search\n", - "searched_words = [\"reinforce\"]\n", - "searched_words_embeddings = text_embedding_model.predict(searched_words)\n", - "embedding_result_column = \"ml_generate_embedding_result\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# perform vector search\n", - "search_result = (\n", - " bbq.vector_search(\n", - " base_table=embedding_table_id,\n", - " column_to_search=embedding_result_column,\n", - " query=searched_words_embeddings,\n", - " query_column_to_search=embedding_result_column,\n", - " top_k=3,\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "search_result" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/e2e_RAG_test.ipynb b/notebooks/e2e_RAG_test.ipynb deleted file mode 100644 index 5a707bca3c..0000000000 --- a/notebooks/e2e_RAG_test.ipynb +++ /dev/null @@ -1,712 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Copyright 2025 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get hands-on experience with Gemini-powered AI operator APIs in this notebook. We'll start with clear examples of API syntax, ensuring you understand how to use these operators. Then, we'll dive into a real-world application, showcasing their performance on a large dataset and providing key statistics. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, import the BigFrames modules." 
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import bigframes\n",
-    "import bigframes.pandas as bpd"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Make sure the BigFrames version is at least `1.38.0`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from packaging.version import Version\n",
-    "\n",
-    "assert Version(bigframes.__version__) >= Version(\"1.38.0\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Connect to the test environment."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:54: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject to change in the future.\n",
-      "  warnings.warn(msg, category=bfe.PreviewWarning)\n",
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/bigquery_options.py:362: UserWarning: This is an advanced configuration option for directly setting endpoints. Incorrect use may lead to unexpected behavior or system instability. Proceed only if you fully understand its implications.\n",
-      "  warnings.warn(msg)\n"
-     ]
-    }
-   ],
-   "source": [
-    "bigframes.options.experiments.blob = True\n",
-    "bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\",\n",
-    "                                                                 \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\",\n",
-    "                                                                 \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# PDF chunk"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Retrieval of PDF URLs, text extraction, and chunking."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n",
-      "  return func(get_global_session(), *args, **kwargs)\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 8bb45a5c-8c84-42a2-945e-c82ded85fb31 is DONE. 0 Bytes processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job 85c66232-0901-46b3-a2b5-69f5a11ff85e is DONE. 0 Bytes processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "chunks_df = bpd.from_glob_path(\"gs://shuowei_bucket/pdf/*\", name=\"pdf\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# copy files to generate more inputs, now we have 1000 PDF files\n",
-    "#copies = [chunks_df] * 20\n",
-    "#chunks_df = bpd.concat(copies, ignore_index=True)\n",
-    "#chunks_df = chunks_df.cache()"
-   ]
-  },
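A quick check of the arithmetic in the copy cells that follow, assuming the glob matched roughly 100 source PDFs (the base count is our inference from the comments, not stated in the notebook):

    # Two self-concatenation passes of 100 copies each:
    base_rows = 100                      # PDFs matched by the glob (assumed)
    after_pass_1 = base_rows * 100       # 10,000 rows
    after_pass_2 = after_pass_1 * 100    # 1,000,000 rows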
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job f055935b-00cc-40b3-8631-eab065130596 is DONE. 734.8 kB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# copy files to generate more inputs, now we have 10,000 PDF files\n",
-    "copies = [chunks_df] * 100\n",
-    "chunks_df = bpd.concat(copies, ignore_index=True)\n",
-    "chunks_df = chunks_df.cache()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job 66f5550a-e645-4fc7-87d7-4e0eee7ff08b is DONE. 1.6 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "# copy files again, now we have 1,000,000 PDF files\n",
-    "copies = [chunks_df] * 100\n",
-    "chunks_df = bpd.concat(copies, ignore_index=True)\n",
-    "chunks_df = chunks_df.cache()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job 28e709fc-2cd0-4512-aac2-b22872a3b84f is DONE. 158.0 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "Query job b9a9479e-bb47-4a2e-afe1-b8fd099dcb88 is DONE. 158.0 MB processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pdf
0uri: gs://shuowei_bucket/pdf/NeurIPS-2024-hydra-bidirectional-state-space-models-through-generalized-matrix-mixers-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
1uri: gs://shuowei_bucket/pdf/NeurIPS-2023-neural-latent-geometry-search-product-manifold-inference-via-gromov-hausdorff-informed-bayesian-optimization-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
2uri: gs://shuowei_bucket/pdf/NeurIPS-2024-a-robust-inlier-identification-algorithm-for-point-cloud-registration-via-mathbfell_0-minimization-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
3uri: gs://shuowei_bucket/pdf/NeurIPS-2024-can-an-ai-agent-safely-run-a-government-existence-of-probably-approximately-aligned-policies-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
4uri: gs://shuowei_bucket/pdf/2502.12961v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
5uri: gs://shuowei_bucket/pdf/NeurIPS-2024-inexact-augmented-lagrangian-methods-for-conic-optimization-quadratic-growth-and-linear-convergence-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
6uri: gs://shuowei_bucket/pdf/NeurIPS-2024-predicting-the-performance-of-foundation-models-via-agreement-on-the-line-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
7uri: gs://shuowei_bucket/pdf/NeurIPS-2024-prediction-with-action-visual-policy-learning-via-joint-denoising-process-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
8uri: gs://shuowei_bucket/pdf/NeurIPS-2023-look-ma-no-hands-agent-environment-factorization-of-egocentric-videos-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
9uri: gs://shuowei_bucket/pdf/NeurIPS-2024-cross-scale-self-supervised-blind-image-deblurring-via-implicit-neural-representation-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
10uri: gs://shuowei_bucket/pdf/NeurIPS-2023-two-stage-learning-to-defer-with-multiple-experts-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
11uri: gs://shuowei_bucket/pdf/NeurIPS-2023-on-separate-normalization-in-self-supervised-transformers-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
12uri: gs://shuowei_bucket/pdf/NeurIPS-2024-decrl-a-deep-evolutionary-clustering-jointed-temporal-knowledge-graph-representation-learning-approach-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
13uri: gs://shuowei_bucket/pdf/NeurIPS-2023-demystifying-the-optimal-performance-of-multi-class-classification-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
14uri: gs://shuowei_bucket/pdf/2502.12926v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
15uri: gs://shuowei_bucket/pdf/2502.13069v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
16uri: gs://shuowei_bucket/pdf/NeurIPS-2024-a-scalable-generative-model-for-dynamical-system-reconstruction-from-neuroimaging-data-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
17uri: gs://shuowei_bucket/pdf/NeurIPS-2024-disentangling-interpretable-factors-with-supervised-independent-subspace-principal-component-analysis-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
18uri: gs://shuowei_bucket/pdf/NeurIPS-2023-deliffas-deformable-light-fields-for-fast-avatar-synthesis-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
19uri: gs://shuowei_bucket/pdf/NeurIPS-2024-diffusion-actor-critic-with-entropy-regulator-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
20uri: gs://shuowei_bucket/pdf/NeurIPS-2023-accurate-interpolation-for-scattered-data-through-hierarchical-residual-refinement-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
21uri: gs://shuowei_bucket/pdf/NeurIPS-2023-expressive-sign-equivariant-networks-for-spectral-geometric-learning-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
22uri: gs://shuowei_bucket/pdf/NeurIPS-2024-flexible-task-abstractions-emerge-in-linear-networks-with-fast-and-bounded-units-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
23uri: gs://shuowei_bucket/pdf/NeurIPS-2023-h3t-efficient-integration-of-memory-optimization-and-parallelism-for-large-scale-transformer-training-Paper-Conference.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
24uri: gs://shuowei_bucket/pdf/2502.12224v1.pdf, authorizer: bigframes-dev.us.bigframes-default-connection
\n", - "

25 rows × 1 columns

\n", - "
[1000000 rows x 1 columns in total]"
-      ],
-      "text/plain": [
-       "                                                  pdf\n",
-       "0   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "1   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "2   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "3   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "4   {'uri': 'gs://shuowei_bucket/pdf/2502.12961v1....\n",
-       "5   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "6   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "7   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "8   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "9   {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "10  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "11  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "12  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "13  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "14  {'uri': 'gs://shuowei_bucket/pdf/2502.12926v1....\n",
-       "15  {'uri': 'gs://shuowei_bucket/pdf/2502.13069v1....\n",
-       "16  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "17  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "18  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "19  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "20  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "21  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "22  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2024-...\n",
-       "23  {'uri': 'gs://shuowei_bucket/pdf/NeurIPS-2023-...\n",
-       "24  {'uri': 'gs://shuowei_bucket/pdf/2502.12224v1....\n",
-       "...\n",
-       "\n",
-       "[1000000 rows x 1 columns]"
-      ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "chunks_df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job 4081bd0d-0d54-4c70-96d0-1f55b497cb5d is DONE. 0 Bytes processed. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4098: PreviewWarning: axis=1 scenario is in preview.\n",
-      "  warnings.warn(msg, category=bfe.PreviewWarning)\n"
-     ]
-    }
-   ],
-   "source": [
-    "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
-    "chunks_df[\"chunk_text\"] = chunks_df[\"pdf\"].blob.pdf_chunk(\n",
-    "    connection=bq_connection, chunk_size=2000, overlap_size=200,\n",
-    "    max_batching_rows=1, container_cpu=2, container_memory=\"1Gi\")\n",
-    "# note: specifying a connection is not necessary; we can use the default connection.\n",
-    "# However, at the current stage, using a specific connection will grant more quota."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Explode the chunk column for further processing."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Save to a temporary table."
-   ]
-  },
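For context, `Series.cache()` materializes the exploded chunks into a BigQuery temporary table, so downstream steps reuse the stored chunking results instead of re-running the PDF extraction; a minimal sketch of the step the next cell performs:

    # cache() runs the deferred query once and returns the same Series,
    # now backed by a session-owned temporary table.
    chunk_df_exploded = chunk_df_exploded.cache()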
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "Query job e671bba2-377c-45b9-9947-44f1914fae4e is RUNNING. Open Job"
-      ],
-      "text/plain": [
-       ""
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "ename": "BadRequest",
-     "evalue": "400 GET https://test-bigquery.sandbox.google.com/bigquery/v2/projects/bigframes-dev/queries/e671bba2-377c-45b9-9947-44f1914fae4e?maxResults=0&location=US&prettyPrint=false: The job encountered an error during execution. Retrying the job may solve the problem. Location: US. Job ID: e671bba2-377c-45b9-9947-44f1914fae4e. (Running BigFrames version 1.38.0; [CONNECTION_ERROR] Dremel returned an error: generic::UNAVAILABLE: Reached maximum number of retriable errors. Java stack trace omitted for brevity.)",
-     "output_type": "error",
-     "traceback": [
-      "BadRequest                                Traceback (most recent call last)",
-      "Cell In[12], line 1: chunk_df_exploded = chunk_df_exploded.cache()",
-      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:147, in method_logger.<locals>.wrapper: return method(self, *args, **kwargs)",
-      "File ~/src/python-bigquery-dataframes/bigframes/series.py:2135, in Series.cache: # Do not use session-aware caching if user-requested; return self._cached(force=True, session_aware=False)",
-      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:147, in method_logger.<locals>.wrapper: # Logging is currently supported only when we can access the bqclient through self._block.expr.session.bqclient. 
Also, to avoid generating multiple queries\u001b[39;00m\n\u001b[1;32m 153\u001b[0m \u001b[38;5;66;03m# because of internal calls, we log only when the method is directly invoked.\u001b[39;00m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_block\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/series.py:2138\u001b[0m, in \u001b[0;36mSeries._cached\u001b[0;34m(self, force, session_aware)\u001b[0m\n\u001b[1;32m 2137\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_cached\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m, force: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m, session_aware: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Series:\n\u001b[0;32m-> 2138\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_block\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msession_aware\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msession_aware\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2139\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/blocks.py:2445\u001b[0m, in \u001b[0;36mBlock.cached\u001b[0;34m(self, force, session_aware)\u001b[0m\n\u001b[1;32m 2443\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Write the block to a session table.\"\"\"\u001b[39;00m\n\u001b[1;32m 2444\u001b[0m \u001b[38;5;66;03m# use a heuristic for whether something needs to be cached\u001b[39;00m\n\u001b[0;32m-> 2445\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_executor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2446\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2447\u001b[0m \u001b[43m \u001b[49m\u001b[43mforce\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mforce\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2448\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_session\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msession_aware\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2449\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mindex_columns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2450\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:456\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor.cached\u001b[0;34m(self, array_value, force, use_session, cluster_cols)\u001b[0m\n\u001b[1;32m 454\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cache_with_session_awareness(array_value)\n\u001b[1;32m 455\u001b[0m 
\u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 456\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cache_with_cluster_cols\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray_value\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcluster_cols\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:532\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._cache_with_cluster_cols\u001b[0;34m(self, array_value, cluster_cols)\u001b[0m\n\u001b[1;32m 527\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Executes the query and uses the resulting table to rewrite future executions.\"\"\"\u001b[39;00m\n\u001b[1;32m 529\u001b[0m sql, schema, ordering_info \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompiler\u001b[38;5;241m.\u001b[39mcompile_raw(\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreplace_cached_subtrees(array_value\u001b[38;5;241m.\u001b[39mnode)\n\u001b[1;32m 531\u001b[0m )\n\u001b[0;32m--> 532\u001b[0m tmp_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_as_cached_temp_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m \u001b[49m\u001b[43mschema\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mselect_cluster_cols\u001b[49m\u001b[43m(\u001b[49m\u001b[43mschema\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 536\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 537\u001b[0m cached_replacement \u001b[38;5;241m=\u001b[39m array_value\u001b[38;5;241m.\u001b[39mas_cached(\n\u001b[1;32m 538\u001b[0m cache_table\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mget_table(tmp_table),\n\u001b[1;32m 539\u001b[0m ordering\u001b[38;5;241m=\u001b[39mordering_info,\n\u001b[1;32m 540\u001b[0m )\u001b[38;5;241m.\u001b[39mnode\n\u001b[1;32m 541\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cached_executions[array_value\u001b[38;5;241m.\u001b[39mnode] \u001b[38;5;241m=\u001b[39m cached_replacement\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:626\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._sql_as_cached_temp_table\u001b[0;34m(self, sql, schema, cluster_cols)\u001b[0m\n\u001b[1;32m 621\u001b[0m job_config \u001b[38;5;241m=\u001b[39m cast(\n\u001b[1;32m 622\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig,\n\u001b[1;32m 623\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig\u001b[38;5;241m.\u001b[39mfrom_api_repr({}),\n\u001b[1;32m 624\u001b[0m )\n\u001b[1;32m 625\u001b[0m job_config\u001b[38;5;241m.\u001b[39mdestination \u001b[38;5;241m=\u001b[39m temp_table\n\u001b[0;32m--> 626\u001b[0m _, query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_execute_query\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 627\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 628\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 629\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcached\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 630\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 631\u001b[0m query_job\u001b[38;5;241m.\u001b[39mdestination\n\u001b[1;32m 632\u001b[0m query_job\u001b[38;5;241m.\u001b[39mresult()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/executor.py:492\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._run_execute_query\u001b[0;34m(self, sql, job_config, api_name, page_size, max_results)\u001b[0m\n\u001b[1;32m 490\u001b[0m bq_io\u001b[38;5;241m.\u001b[39madd_and_trim_labels(job_config, api_name\u001b[38;5;241m=\u001b[39mapi_name)\n\u001b[1;32m 491\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 492\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 493\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 494\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 495\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 496\u001b[0m \u001b[43m \u001b[49m\u001b[43mapi_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mapi_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 497\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 499\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmetrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 500\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m google\u001b[38;5;241m.\u001b[39mapi_core\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mBadRequest \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 503\u001b[0m \u001b[38;5;66;03m# Unfortunately, this error type does not have a separate error code or exception type\u001b[39;00m\n\u001b[1;32m 504\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mResources exceeded during query execution\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m e\u001b[38;5;241m.\u001b[39mmessage:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:253\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, max_results, page_size, timeout, api_name, metrics)\u001b[0m\n\u001b[1;32m 251\u001b[0m opts \u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 
252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m opts\u001b[38;5;241m.\u001b[39mprogress_bar \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mconfiguration\u001b[38;5;241m.\u001b[39mdry_run:\n\u001b[0;32m--> 253\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m \u001b[43mformatting_helpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_query_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 254\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_job\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 255\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 256\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mopts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 257\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 258\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 260\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m query_job\u001b[38;5;241m.\u001b[39mresult(\n\u001b[1;32m 261\u001b[0m max_results\u001b[38;5;241m=\u001b[39mmax_results, page_size\u001b[38;5;241m=\u001b[39mpage_size\n\u001b[1;32m 262\u001b[0m )\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:139\u001b[0m, in \u001b[0;36mwait_for_query_job\u001b[0;34m(query_job, max_results, page_size, progress_bar)\u001b[0m\n\u001b[1;32m 137\u001b[0m loading_bar \u001b[38;5;241m=\u001b[39m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job))\n\u001b[1;32m 138\u001b[0m display\u001b[38;5;241m.\u001b[39mdisplay(loading_bar, display_id\u001b[38;5;241m=\u001b[39mdisplay_id)\n\u001b[0;32m--> 139\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[43mquery_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 140\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 142\u001b[0m query_job\u001b[38;5;241m.\u001b[39mreload()\n\u001b[1;32m 143\u001b[0m display\u001b[38;5;241m.\u001b[39mupdate_display(\n\u001b[1;32m 144\u001b[0m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job)),\n\u001b[1;32m 145\u001b[0m display_id\u001b[38;5;241m=\u001b[39mdisplay_id,\n\u001b[1;32m 146\u001b[0m )\n", - "File \u001b[0;32m~/src/python-bigquery/google/cloud/bigquery/job/query.py:1681\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1676\u001b[0m remaining_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1678\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining_timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1679\u001b[0m \u001b[38;5;66;03m# Since is_job_done() calls 
jobs.getQueryResults, which is a\u001b[39;00m\n\u001b[1;32m 1680\u001b[0m \u001b[38;5;66;03m# long-running API, don't delay the next request at all.\u001b[39;00m\n\u001b[0;32m-> 1681\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[43mis_job_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1682\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 1683\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Use a monotonic clock since we don't actually care about\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;66;03m# daylight savings or similar, just the elapsed time.\u001b[39;00m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m 
\u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", - "File \u001b[0;32m~/src/python-bigquery/google/cloud/bigquery/job/query.py:1650\u001b[0m, in \u001b[0;36mQueryJob.result..is_job_done\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1644\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 1646\u001b[0m \u001b[38;5;66;03m# Call jobs.getQueryResults with max results set to 0 just to\u001b[39;00m\n\u001b[1;32m 1647\u001b[0m \u001b[38;5;66;03m# wait for the query to finish. 
Unlike most methods,\u001b[39;00m\n\u001b[1;32m 1648\u001b[0m \u001b[38;5;66;03m# jobs.getQueryResults hangs as long as it can to ensure we\u001b[39;00m\n\u001b[1;32m 1649\u001b[0m \u001b[38;5;66;03m# know when the query has finished as soon as possible.\u001b[39;00m\n\u001b[0;32m-> 1650\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reload_query_results\u001b[49m\u001b[43m(\u001b[49m\u001b[43mretry\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mreload_query_results_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1652\u001b[0m \u001b[38;5;66;03m# Even if the query is finished now according to\u001b[39;00m\n\u001b[1;32m 1653\u001b[0m \u001b[38;5;66;03m# jobs.getQueryResults, we'll want to reload the job status if\u001b[39;00m\n\u001b[1;32m 1654\u001b[0m \u001b[38;5;66;03m# it's not already DONE.\u001b[39;00m\n\u001b[1;32m 1655\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", - "File \u001b[0;32m~/src/python-bigquery/google/cloud/bigquery/job/query.py:1448\u001b[0m, in \u001b[0;36mQueryJob._reload_query_results\u001b[0;34m(self, retry, timeout, page_size)\u001b[0m\n\u001b[1;32m 1445\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(transport_timeout, (\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mint\u001b[39m)):\n\u001b[1;32m 1446\u001b[0m transport_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1448\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_query_results \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_client\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_query_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1449\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjob_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1450\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1451\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1452\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout_ms\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout_ms\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1453\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlocation\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1454\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtransport_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1455\u001b[0m \u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1456\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery/google/cloud/bigquery/client.py:2028\u001b[0m, in \u001b[0;36mClient._get_query_results\u001b[0;34m(self, job_id, retry, project, timeout_ms, location, timeout, page_size)\u001b[0m\n\u001b[1;32m 2024\u001b[0m \u001b[38;5;66;03m# This call is typically made in a polling loop that 
checks whether the\u001b[39;00m\n\u001b[1;32m 2025\u001b[0m \u001b[38;5;66;03m# job is complete (from QueryJob.done(), called ultimately from\u001b[39;00m\n\u001b[1;32m 2026\u001b[0m \u001b[38;5;66;03m# QueryJob.result()). So we don't need to poll here.\u001b[39;00m\n\u001b[1;32m 2027\u001b[0m span_attributes \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpath\u001b[39m\u001b[38;5;124m\"\u001b[39m: path}\n\u001b[0;32m-> 2028\u001b[0m resource \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_api\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2029\u001b[0m \u001b[43m \u001b[49m\u001b[43mretry\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2030\u001b[0m \u001b[43m \u001b[49m\u001b[43mspan_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mBigQuery.getQueryResults\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2031\u001b[0m \u001b[43m \u001b[49m\u001b[43mspan_attributes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspan_attributes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2032\u001b[0m \u001b[43m \u001b[49m\u001b[43mmethod\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mGET\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2033\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2034\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextra_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2035\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 2036\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2037\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _QueryResults\u001b[38;5;241m.\u001b[39mfrom_api_repr(resource)\n", - "File \u001b[0;32m~/src/python-bigquery/google/cloud/bigquery/client.py:837\u001b[0m, in \u001b[0;36mClient._call_api\u001b[0;34m(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\u001b[0m\n\u001b[1;32m 833\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m span_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 834\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m create_span(\n\u001b[1;32m 835\u001b[0m name\u001b[38;5;241m=\u001b[39mspan_name, attributes\u001b[38;5;241m=\u001b[39mspan_attributes, client\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m, job_ref\u001b[38;5;241m=\u001b[39mjob_ref\n\u001b[1;32m 836\u001b[0m ):\n\u001b[0;32m--> 837\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mcall\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 839\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m call()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 
291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc 
\u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494\u001b[0m, in \u001b[0;36mJSONConnection.api_request\u001b[0;34m(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\u001b[0m\n\u001b[1;32m 482\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[1;32m 483\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 484\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 490\u001b[0m extra_api_info\u001b[38;5;241m=\u001b[39mextra_api_info,\n\u001b[1;32m 491\u001b[0m )\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m200\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m300\u001b[39m:\n\u001b[0;32m--> 494\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mfrom_http_response(response)\n\u001b[1;32m 496\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m expect_json \u001b[38;5;129;01mand\u001b[39;00m response\u001b[38;5;241m.\u001b[39mcontent:\n\u001b[1;32m 497\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\u001b[38;5;241m.\u001b[39mjson()\n", - "\u001b[0;31mBadRequest\u001b[0m: 400 GET https://test-bigquery.sandbox.google.com/bigquery/v2/projects/bigframes-dev/queries/e671bba2-377c-45b9-9947-44f1914fae4e?maxResults=0&location=US&prettyPrint=false: The job encountered an error during execution. 
Retrying the job may solve the problem.\n\nLocation: US\nJob ID: e671bba2-377c-45b9-9947-44f1914fae4e\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.38.0 [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[CONNECTION_ERROR] debug=Dremel returned an error: generic::UNAVAILABLE: Reached maximum number of retriable errors. errorProto=code: \"CONNECTION_ERROR\"\\n\\n\\tat com.google.cloud.helix.common.Exceptions$Public.connectionError(Exceptions.java:776)\\n\\tat com.google.cloud.helix.common.Exceptions$Public.connectionError(Exceptions.java:780)\\n\\tat com.google.cloud.helix.server.job.DremelErrorUtil.createHelixErrorFromDremelRpcException(DremelErrorUtil.java:60)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:783)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:697)\\n\\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:123)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1839)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2877)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2801)\\n\\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\\n\\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:901)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:893)\\n\\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1320)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1211)\\n\\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\\n\\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:2000)\\n\\tSuppressed: java.lang.Exception: Including call stack from HelixFutures\\n\\t\\tat com.google.cloud.helix.common.HelixFutures.getHelixException(HelixFutures.java:76)\\n\\t\\tat com.google.cloud.helix.common.HelixFutures.getDone(HelixFutures.java:55)\\n\\t\\tat com.google.cloud.helix.server.job.LocalQueryJobController.handleQueryDone(LocalQueryJobController.java:2626)\\n\\t\\tat com.google.cloud.helix.server.job.LocalQueryJobController.lambda$runJob$1(LocalQueryJobController.java:2539)\\n\\t\\tat com.google.common.util.concurrent.CombinedFuture$CallableInterruptibleTask.runInterruptibly(CombinedFuture.java:196)\\n\\t\\tat com.google.common.util.concurrent.InterruptibleTask.run(InterruptibleTask.java:74)\\n\\t\\tat com.google.common.context.ContextRunnable.runInContext(ContextRunnable.java:83)\\n\\t\\tat io.grpc.Context.run(Context.java:536)\\n\\t\\tat com.google.tracing.GenericContextCallback.runInInheritedContext(GenericContextCallback.java:78)\\n\\t\\tat com.google.common.context.ContextRunnable.run(ContextRunnable.java:74)\\n\\t\\tat java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)\\n\\t\\tat java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)\\n\\t\\tat 
java.base/java.lang.Thread.run(Unknown Source)\\n'}]" - ] - } - ], - "source": [ - "chunk_df_exploded = chunk_df_exploded.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "chunk_df_exploded" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Generate Embeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generation of embeddings within BigFrames." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from bigframes.ml import llm\n", - "\n", - "text_embedding_model = llm.TextEmbeddingGenerator(model_name=\"text-embedding-005\")\n", - "embeddings_df = text_embedding_model.predict(chunk_df_exploded)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create the embedding table in BigQuery if it does not exist." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "test_project_id = \"bigframes-dev\"\n", - "test_dataset_id = \"shuowei_test_us\"\n", - "test_table_id = \"pdf_chunk_embedding\"\n", - "embedding_table_id = f\"{test_project_id}.{test_dataset_id}.{test_table_id}\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save the embeddings into a BigQuery table for downstream processing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embeddings_df.to_gbq(destination_table=embedding_table_id, if_exists=\"replace\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create vector search index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Construction of an index over these embeddings." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.bigquery as bbq\n", - "bbq.create_vector_index(\n", - " table_id=embedding_table_id,\n", - " column_name=\"ml_generate_embedding_result\",\n", - " distance_type=\"cosine\",\n", - " index_type=\"ivf\",\n", - " ivf_options={\"num_lists\": 100},\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Search with pointers to the original PDFs" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Execution of vector search, with results linked back to the original PDFs." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# generate embeddings for the search words\n", - "searched_words = [\"reinforce\"]\n", - "searched_words_embeddings = text_embedding_model.predict(searched_words)\n", - "embedding_result_column = \"ml_generate_embedding_result\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# perform vector search\n", - "search_result = (\n", - " bbq.vector_search(\n", - " base_table=embedding_table_id,\n", - " column_to_search=embedding_result_column,\n", - " query=searched_words_embeddings,\n", - " query_column_to_search=embedding_result_column,\n", - " top_k=3,\n", - " )\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "search_result" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - 
"version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/google_sql_notebook.ipynb b/notebooks/google_sql_notebook.ipynb deleted file mode 100644 index 5b6fd2d7b3..0000000000 --- a/notebooks/google_sql_notebook.ipynb +++ /dev/null @@ -1,54 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "bpd.options.bigquery.project = 'bigquery-public-data'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gender_filter = 'M'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%%bigquery --params {\"gender_filter\": gender_filter}\n", - "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE gender = @gender_filter" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/multimodal/transcribe_partial_mode.ipynb b/notebooks/multimodal/transcribe_partial_mode.ipynb deleted file mode 100644 index 4d3598df8d..0000000000 --- a/notebooks/multimodal/transcribe_partial_mode.ipynb +++ /dev/null @@ -1,153 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "6d77cb8d", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 536d38d9-59d8-49ac-9247-f8d66dccabdc is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d6f4bc43-015f-427c-97a4-4005fcd5dc37 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 3dc5742c-35a4-45ec-9b1d-a08fd072b7b8 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0b18e95b-0728-448e-b27a-8094d27d8135 is RUNNING. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "Forbidden", - "evalue": "403 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/0b18e95b-0728-448e-b27a-8094d27d8135?maxResults=0&location=US&prettyPrint=false: Access Denied: BigQuery BigQuery: Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist). 
Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.\n\nLocation: US\nJob ID: 0b18e95b-0728-448e-b27a-8094d27d8135\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.10.0. [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[ACCESS_DENIED] message=ACCESS_DENIED: [BigQuery, BigQuery, Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\'storage.objects.list\\' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.] [... remainder of backend DebugInfo omitted: it repeats the same storage.objects.list permission-denied message at increasing escape depth, plus internal Bigstore/Dremel stack frames and RPC latency metadata ...]
'}]",
- "output_type": "error",
- "traceback": [
- "---------------------------------------------------------------------------",
- "Forbidden                                 Traceback (most recent call last)",
- "Cell In[1], line 20\n      9 flattened = bpd.from_glob_path(\n     10     \"gs://your-bucket/audio-files/*\",\n     11     name=\"GCS Blob\"\n     12 )\n     14 # Alternatively, create from URI strings with null index\n     15 # df = bpd.DataFrame({\"uri\": [\"gs://bucket/audio1.wav\", \"gs://bucket/audio2.wav\"]})\n     16 # df[\"GCS Blob\"] = df[\"uri\"].str.to_blob()\n     17 # flattened = bpd.read_gbq_table(\"your_table\", index_col=bigframes.enums.DefaultIndexKind.NULL)\n     19 # 3. This will trigger the NullIndexError\n---> 20 flattened[\"Transcription\"] = flattened[\"GCS Blob\"].blob.audio_transcribe(\n     21     model_name=\"gemini-2.0-flash-001\",\n     22     verbose=True,\n     23 )",
- "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)",
- "File ~/src/python-bigquery-dataframes/bigframes/operations/blob.py:822, in BlobAccessor.audio_transcribe(self, engine, connection, model_name, verbose)\n--> 822 transcribed_results = llm_model.predict(X=audio_series, prompt=[prompt_text, audio_series], temperature=0.0)",
- "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)",
- "File ~/src/python-bigquery-dataframes/bigframes/ml/llm.py:745, in GeminiTextGenerator.predict(self, X, temperature, max_output_tokens, top_k, top_p, ground_with_google_search, max_retries, prompt, output_schema)\n--> 745 return self._predict_and_retry(core.BqmlModel.generate_text_tvf, X, options=options, max_retries=max_retries)",
- "File ~/src/python-bigquery-dataframes/bigframes/ml/base.py:266, in RetriableRemotePredictor._predict_and_retry(self, bqml_model_predict_tvf, X, options, max_retries)\n--> 266 df = bqml_model_predict_tvf.tvf(self._bqml_model, df_fail, options)",
- "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:179, in BqmlModel.generate_text(self, input_data, options)\n--> 179 return self._apply_ml_tvf(input_data, lambda source_sql: self._sql_generator.ml_generate_text(source_sql=source_sql, struct_options=options))",
- "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:98, in BqmlModel._apply_ml_tvf(self, input_data, apply_sql_tvf)\n---> 98 df = self._session.read_gbq(result_sql, index_col=index_col_ids)",
- "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)",
- "File ~/src/python-bigquery-dataframes/bigframes/session/__init__.py:439, in Session.read_gbq(self, query_or_table, index_col, columns, configuration, max_results, filters, use_cache, col_order, dry_run)\n--> 439 return self._loader.read_gbq_query(query_or_table, index_col=index_col, columns=columns, configuration=configuration, max_results=max_results, use_cache=use_cache, filters=filters, dry_run=dry_run)  # type: ignore # for dry_run overload",
- "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:996, in GbqDataLoader.read_gbq_query(self, query, index_col, columns, configuration, max_results, use_cache, filters, dry_run, force_total_order, allow_large_results)\n--> 996 destination, query_job = self._query_to_destination(query, cluster_candidates=[], configuration=configuration)  # No cluster candidates as user query might not be clusterable (eg because of ORDER BY clause)",
- "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:1120, in GbqDataLoader._query_to_destination(self, query, cluster_candidates, configuration, do_clustering)\n-> 1120 query_job = self._start_query_with_job(query, job_config=job_config, timeout=timeout)",
- "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:1186, in GbqDataLoader._start_query_with_job(self, sql, job_config, timeout)\n-> 1186 _, query_job = bf_io_bigquery.start_query_with_client(self._bqclient, sql, job_config=job_config, timeout=timeout, location=None, project=None, metrics=None, query_with_job=True)",
- "File ~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314, in start_query_with_client(bq_client, sql, job_config, location, project, timeout, metrics, query_with_job)\n--> 314 results_iterator = formatting_helpers.wait_for_query_job(query_job, progress_bar=opts.progress_bar)",
- "File ~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149, in wait_for_query_job(query_job, max_results, page_size, progress_bar)\n--> 149 query_result = query_job.result(max_results=max_results, page_size=page_size)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1696, in QueryJob.result(self, page_size, max_results, retry, timeout, start_index, job_retry)\n-> 1696 while not is_job_done():",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:294, in Retry.__call__.<locals>.retry_wrapped_func(*args, **kwargs)\n--> 294 return retry_target(target, self._predicate, sleep_generator, timeout=self._timeout, on_error=on_error)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:156, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\n--> 156 next_sleep = _retry_error_helper(exc, deadline, sleep_iter, error_list, predicate, on_error, exception_factory, timeout)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:214, in _retry_error_helper(exc, deadline, sleep_iterator, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\n--> 214 raise final_exc from source_exc",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:147, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\n--> 147 result = target()",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1665, in QueryJob.result.<locals>.is_job_done()\n-> 1665 self._reload_query_results(retry=retry, **reload_query_results_kwargs)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1463, in QueryJob._reload_query_results(self, retry, timeout, page_size)\n-> 1463 self._query_results = self._client._get_query_results(self.job_id, retry, project=self.project, timeout_ms=timeout_ms, location=self.location, timeout=transport_timeout, page_size=page_size)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:2060, in Client._get_query_results(self, job_id, retry, project, timeout_ms, location, timeout, page_size)\n-> 2060 resource = self._call_api(retry, span_name=\"BigQuery.getQueryResults\", span_attributes=span_attributes, method=\"GET\", path=path, query_params=extra_params, timeout=timeout)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:858, in Client._call_api(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\n--> 858 return call()",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:294, in Retry.__call__.<locals>.retry_wrapped_func(*args, **kwargs)\n--> 294 return retry_target(target, self._predicate, sleep_generator, timeout=self._timeout, on_error=on_error)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:156, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\n--> 156 next_sleep = _retry_error_helper(exc, deadline, sleep_iter, error_list, predicate, on_error, exception_factory, timeout)",
- "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:214, in _retry_error_helper(exc, deadline, sleep_iterator, error_list, predicate_fn,
on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 209\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 210\u001b[0m error_list,\n\u001b[1;32m 211\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 212\u001b[0m original_timeout,\n\u001b[1;32m 213\u001b[0m )\n\u001b[0;32m--> 214\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 215\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 216\u001b[0m on_error_fn(exc)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:147\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 147\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 149\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494\u001b[0m, in \u001b[0;36mJSONConnection.api_request\u001b[0;34m(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\u001b[0m\n\u001b[1;32m 482\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_make_request(\n\u001b[1;32m 483\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 484\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 490\u001b[0m extra_api_info\u001b[38;5;241m=\u001b[39mextra_api_info,\n\u001b[1;32m 491\u001b[0m )\n\u001b[1;32m 493\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m200\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m response\u001b[38;5;241m.\u001b[39mstatus_code \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m300\u001b[39m:\n\u001b[0;32m--> 494\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mfrom_http_response(response)\n\u001b[1;32m 496\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m expect_json \u001b[38;5;129;01mand\u001b[39;00m response\u001b[38;5;241m.\u001b[39mcontent:\n\u001b[1;32m 497\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\u001b[38;5;241m.\u001b[39mjson()\n", - "\u001b[0;31mForbidden\u001b[0m: 403 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/0b18e95b-0728-448e-b27a-8094d27d8135?maxResults=0&location=US&prettyPrint=false: Access Denied: BigQuery BigQuery: Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. 
Permission 'storage.objects.list' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.\n\nLocation: US\nJob ID: 0b18e95b-0728-448e-b27a-8094d27d8135\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.10.0. [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[ACCESS_DENIED] message=ACCESS_DENIED: [BigQuery, BigQuery, Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\'storage.objects.list\\' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.] debug=code: \\t AUTHORIZATION_ERROR\\ndescription: \"Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\'storage.objects.list\\\\\\' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator. message: \\\\\"GCS aka Bigstore is not approved for storing Google user data (go/blobstore-gcs-getting-started). If you have obtained security exceptions for Bigstore instead of Blobstore, Please make sure Dremel has access to the files and all directories on the path to the files (http://go/dremel-access).\\\\\"\\\\npermission_denied_error {\\\\n accessed_resource_uri: \\\\\"/bigstore/your-bucket/audio-files/*\\\\\"\\\\n system: BIGSTORE\\\\n}\\\\nunderlying_status {\\\\n code: 7\\\\n space: \\\\\"generic\\\\\"\\\\n message: \\\\\"Calling Match with file \\\\\\\\\\\\\"/bigstore/your-bucket/audio-files/**\\\\\\\\\\\\\": cloud.bigstore.ResponseCode.ErrorCode::ACCESS_DENIED: bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist). [google.rpc.error_details_ext] { message: \\\\\\\\\\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\\\\\\\\\" details { [type.googleapis.com/google.rpc.DebugInfo] { stack_entries: \\\\\\\\\\\\\"com.google.net.rpc3.client.RpcClientException: APPLICATION_ERROR;cloud.bigstore/FrontendObjectsService.List;bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist). 
bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist).;AppErrorCode=1;StartTimeMs=1752257982548;tcp;Deadline(sec)=29.991333608;ResFormat=uncompressed;interceptors={[com.google.prod.fireaxe.filters.FireaxeRpcClientInterceptorImpl;com.google.cloud.bigstore.common.LatencyCollectingInterceptor;com.google.frameworks.debug.sherlog.core.rpcutil.Stubby3ClientInterceptor];overrides={}};ServerTimeSec=0.021769909;LogBytes=256;Non-FailFast;EffSecLevel=strong_privacy_and_integrity;ReqFormat=uncompressed;ReqID=f0298bdab819b7ef;GlobalID=632dd03b377a3319;Server=[::1]:14003\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.net.rpc3.client.RpcStub.startBlockingRpcInternal(RpcStub.java:571)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.net.rpc3.client.RpcStub.startBlockingRpc(RpcStub.java:471)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.proto.BackendObjectsService$Stub.list(BackendObjectsService.java:734)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.common.LampreyServiceBase.call(LampreyServiceBase.java:37)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.common.BigstoreFrontendObjectsServiceImpl.call(BigstoreFrontendObjectsServiceImpl.java:40)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.common.BigstoreFrontendObjectsServiceImpl.list(BigstoreFrontendObjectsServiceImpl.java:97)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.stubby.BigstoreStubbyImpl.lambda$listObjects$0(BigstoreStubbyImpl.java:832)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.common.CallbackRequestHandler.handleCallbackRequest(CallbackRequestHandler.java:288)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.common.CallbackRequestHandler.handleCallbackRequest(CallbackRequestHandler.java:158)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.stubby.BigstoreStubbyImpl.listObjects(BigstoreStubbyImpl.java:821)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.common.ProxyDelegatorBase.callAndRecordLatency(ProxyDelegatorBase.java:109)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.stubby.StubbyProxyDelegator.call(StubbyProxyDelegator.java:184)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.api.stubby.BigstoreStubbyProxy.listObjects(BigstoreStubbyProxy.java:334)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.isolation.RpcReceiver.lambda$processRequestAsync$0(RpcReceiver.java:198)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.cloud.bigstore.isolation.AsyncExecutor.lambda$submit$0(AsyncExecutor.java:213)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.common.context.ContextRunnable.runInContext(ContextRunnable.java:83)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"io.grpc.Context.run(Context.java:536)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.tracing.GenericContextCallback.runInInheritedContext(GenericContextCallback.java:58)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"com.google.common.context.ContextRunnable.run(ContextRunnable.java:74)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)\\\\\\\\\\\\\" stack_entries: 
\\\\\\\\\\\\\"java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)\\\\\\\\\\\\\" stack_entries: \\\\\\\\\\\\\"java.base/java.lang.Thread.run(Unknown Source)\\\\\\\\\\\\\" } } } [blobstore2.GcsErrorDetails] { xml_code: \\\\\\\\\\\\\"AccessDenied\\\\\\\\\\\\\" msg: \\\\\\\\\\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\\\\\\\\\" http_code: 403 details { first: \\\\\\\\\\\\\"Details\\\\\\\\\\\\\" second: \\\\\\\\\\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\\\\\\\\\" } debug_info: \\\\\\\\\\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\\\\\\\\\" } [cloud.bigstore.GcsLatencyInfo] { requests { method: \\\\\\\\\\\\\"/BackendObjectsService.List\\\\\\\\\\\\\" deadline { seconds: 29 nanos: 979569783 } start { seconds: 1752257982 nanos: 549935579 } end { seconds: 1752257982 nanos: 570470964 } status { code: 1 space: \\\\\\\\\\\\\"cloud.bigstore.ResponseCode.ErrorCode\\\\\\\\\\\\\" message: \\\\\\\\\\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. 
Permission \\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\\\\\\\\\" canonical_code: 7 } } requests { method: \\\\\\\\\\\\\"/BucketMdService.LookupBucket\\\\\\\\\\\\\" deadline { seconds: 4 nanos: 999967650 } start { seconds: 1752257982 nanos: 551314439 } end { seconds: 1752257982 nanos: 553833108 } status { } metadata_spanner_stats { read_walltime_millis: 0 read_cpu_millis: 0 read_scheduler_delay_millis: 0 read_throttle_delay_millis: 0 read_per_service_limit_queue_delay_millis: 0 read_locking_delay_millis: 0 read_client_overhead_delay_millis: 0 read_client_flow_control_delay_millis: 0 read_io_delay_millis: 0 } elapsed_time_isolator_metrics { } sunspot_verdict: VERDICT_CAT_UNKNOWN } requests { method: \\\\\\\\\\\\\"/AccessService.CheckListObjects\\\\\\\\\\\\\" deadline { seconds: 29 nanos: 969642965 } start { seconds: 1752257982 nanos: 553970188 } end { seconds: 1752257982 nanos: 569602062 } status { } } }\\\\\"\\\\n message_set {\\\\n [google.rpc.error_details_ext] {\\\\n message: \\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\"\\\\n }\\\\n }\\\\n}\\\\nerror_context {\\\\n table_name: \\\\\"bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52.bqdf20250711_sessionee050e_bb3969eb60f548be8a4b6f5e2564f69f\\\\\"\\\\n}\\\\n\"\\ncause: USER_ERROR\\naddress: \"http://jfs5.prod.google.com:4901/task?handle=logs.2071.serving.shard-mals.cloud-dataengine.11785653463719 Partition description: __SHUFFLE0/0 TableDef \\\\\\'bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52.bqdf20250711_sessionee050e_bb3969eb60f548be8a4b6f5e2564f69f\\\\\\' of type \\\\\\'object-meta\\\\\\': /bigstore/your-bucket/audio-files/*\"\\nstatus_proto {\\n code: 7\\n space: \"generic\"\\n message: \"Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\'storage.objects.list\\\\\\' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. Storage Object Viewer or Storage Object Creator.\"\\n}\\nerror_details {\\n message: \"GCS aka Bigstore is not approved for storing Google user data (go/blobstore-gcs-getting-started). If you have obtained security exceptions for Bigstore instead of Blobstore, Please make sure Dremel has access to the files and all directories on the path to the files (http://go/dremel-access).\"\\n permission_denied_error {\\n accessed_resource_uri: \"/bigstore/your-bucket/audio-files/*\"\\n system: BIGSTORE\\n }\\n underlying_status {\\n code: 7\\n space: \"generic\"\\n message: \"Calling Match with file \\\\\"/bigstore/your-bucket/audio-files/**\\\\\": cloud.bigstore.ResponseCode.ErrorCode::ACCESS_DENIED: bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\'storage.objects.list\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\'storage.objects.list\\\\\\' denied on resource (or it may not exist). 
[google.rpc.error_details_ext] { message: \\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\" details { [type.googleapis.com/google.rpc.DebugInfo] { stack_entries: \\\\\"com.google.net.rpc3.client.RpcClientException: APPLICATION_ERROR;cloud.bigstore/FrontendObjectsService.List;bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).;AppErrorCode=1;StartTimeMs=1752257982548;tcp;Deadline(sec)=29.991333608;ResFormat=uncompressed;interceptors={[com.google.prod.fireaxe.filters.FireaxeRpcClientInterceptorImpl;com.google.cloud.bigstore.common.LatencyCollectingInterceptor;com.google.frameworks.debug.sherlog.core.rpcutil.Stubby3ClientInterceptor];overrides={}};ServerTimeSec=0.021769909;LogBytes=256;Non-FailFast;EffSecLevel=strong_privacy_and_integrity;ReqFormat=uncompressed;ReqID=f0298bdab819b7ef;GlobalID=632dd03b377a3319;Server=[::1]:14003\\\\\" stack_entries: \\\\\"com.google.net.rpc3.client.RpcStub.startBlockingRpcInternal(RpcStub.java:571)\\\\\" stack_entries: \\\\\"com.google.net.rpc3.client.RpcStub.startBlockingRpc(RpcStub.java:471)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.proto.BackendObjectsService$Stub.list(BackendObjectsService.java:734)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.common.LampreyServiceBase.call(LampreyServiceBase.java:37)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.common.BigstoreFrontendObjectsServiceImpl.call(BigstoreFrontendObjectsServiceImpl.java:40)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.common.BigstoreFrontendObjectsServiceImpl.list(BigstoreFrontendObjectsServiceImpl.java:97)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.stubby.BigstoreStubbyImpl.lambda$listObjects$0(BigstoreStubbyImpl.java:832)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.common.CallbackRequestHandler.handleCallbackRequest(CallbackRequestHandler.java:288)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.common.CallbackRequestHandler.handleCallbackRequest(CallbackRequestHandler.java:158)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.stubby.BigstoreStubbyImpl.listObjects(BigstoreStubbyImpl.java:821)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.common.ProxyDelegatorBase.callAndRecordLatency(ProxyDelegatorBase.java:109)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.stubby.StubbyProxyDelegator.call(StubbyProxyDelegator.java:184)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.api.stubby.BigstoreStubbyProxy.listObjects(BigstoreStubbyProxy.java:334)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.isolation.RpcReceiver.lambda$processRequestAsync$0(RpcReceiver.java:198)\\\\\" stack_entries: \\\\\"com.google.cloud.bigstore.isolation.AsyncExecutor.lambda$submit$0(AsyncExecutor.java:213)\\\\\" stack_entries: \\\\\"com.google.common.context.ContextRunnable.runInContext(ContextRunnable.java:83)\\\\\" stack_entries: 
\\\\\"io.grpc.Context.run(Context.java:536)\\\\\" stack_entries: \\\\\"com.google.tracing.GenericContextCallback.runInInheritedContext(GenericContextCallback.java:58)\\\\\" stack_entries: \\\\\"com.google.common.context.ContextRunnable.run(ContextRunnable.java:74)\\\\\" stack_entries: \\\\\"java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)\\\\\" stack_entries: \\\\\"java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)\\\\\" stack_entries: \\\\\"java.base/java.lang.Thread.run(Unknown Source)\\\\\" } } } [blobstore2.GcsErrorDetails] { xml_code: \\\\\"AccessDenied\\\\\" msg: \\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\" http_code: 403 details { first: \\\\\"Details\\\\\" second: \\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\" } debug_info: \\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\" } [cloud.bigstore.GcsLatencyInfo] { requests { method: \\\\\"/BackendObjectsService.List\\\\\" deadline { seconds: 29 nanos: 979569783 } start { seconds: 1752257982 nanos: 549935579 } end { seconds: 1752257982 nanos: 570470964 } status { code: 1 space: \\\\\"cloud.bigstore.ResponseCode.ErrorCode\\\\\" message: \\\\\"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist). bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. 
Permission \\\\\\\\\\\\\\'storage.objects.list\\\\\\\\\\\\\\' denied on resource (or it may not exist).\\\\\" canonical_code: 7 } } requests { method: \\\\\"/BucketMdService.LookupBucket\\\\\" deadline { seconds: 4 nanos: 999967650 } start { seconds: 1752257982 nanos: 551314439 } end { seconds: 1752257982 nanos: 553833108 } status { } metadata_spanner_stats { read_walltime_millis: 0 read_cpu_millis: 0 read_scheduler_delay_millis: 0 read_throttle_delay_millis: 0 read_per_service_limit_queue_delay_millis: 0 read_locking_delay_millis: 0 read_client_overhead_delay_millis: 0 read_client_flow_control_delay_millis: 0 read_io_delay_millis: 0 } elapsed_time_isolator_metrics { } sunspot_verdict: VERDICT_CAT_UNKNOWN } requests { method: \\\\\"/AccessService.CheckListObjects\\\\\" deadline { seconds: 29 nanos: 969642965 } start { seconds: 1752257982 nanos: 553970188 } end { seconds: 1752257982 nanos: 569602062 } status { } } }\"\\n message_set {\\n [google.rpc.error_details_ext] {\\n message: \"bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\'storage.objects.list\\\\\\' denied on resource (or it may not exist).\"\\n }\\n }\\n }\\n error_context {\\n table_name: \"bigframes-dev._8b037bfb7316dddf9d92b12dcf93e008906bfe52.bqdf20250711_sessionee050e_bb3969eb60f548be8a4b6f5e2564f69f\"\\n }\\n}\\n errorProto=code: \"ACCESS_DENIED\"\\nargument: \"BigQuery\"\\nargument: \"BigQuery\"\\nargument: \"Permission denied while globbing file pattern. bqcx-1084210331973-pcbl@gcp-sa-bigquery-condel.iam.gserviceaccount.com does not have storage.objects.list access to the Google Cloud Storage bucket. Permission \\\\\\'storage.objects.list\\\\\\' denied on resource (or it may not exist). Please make sure gs://your-bucket/audio-files/* is accessible via appropriate IAM roles, e.g. 
Storage Object Viewer or Storage Object Creator.\"\\nlocation_type: OTHER\\nlocation: \"gs://your-bucket/audio-files/*\"\\n\\n\\tat com.google.cloud.helix.common.Exceptions.fromProto(Exceptions.java:2016)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl.mapDremelErrorsTohelixException(QueryExecutorImpl.java:1194)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:769)\\n\\tat com.google.cloud.helix.common.dremel.QueryExecutorImpl$ConfiguredQueryMigration$StreamHandler.onMessage(QueryExecutorImpl.java:695)\\n\\tat com.google.net.rpc3.stream.RpcMessageCallback$ForwardingRpcMessageCallback.onMessage(RpcMessageCallback.java:128)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.processMessageUnlocked(RpcStreamInternalContext.java:1852)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksInternalUnlocked(RpcStreamInternalContext.java:2904)\\n\\tat com.google.net.rpc3.impl.RpcStreamInternalContext.invokeCallbacksUnlocked(RpcStreamInternalContext.java:2830)\\n\\tat com.google.net.eventmanager.AbstractFutureTask$Sync.innerRun(AbstractFutureTask.java:259)\\n\\tat com.google.net.eventmanager.AbstractFutureTask.run(AbstractFutureTask.java:120)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTaskTraced(EventManagerImpl.java:900)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runTask(EventManagerImpl.java:892)\\n\\tat com.google.net.eventmanager.EventManagerImpl.internalRunWorkerLoop(EventManagerImpl.java:1319)\\n\\tat com.google.net.eventmanager.EventManagerImpl.runWorkerLoop(EventManagerImpl.java:1210)\\n\\tat com.google.net.eventmanager.WorkerThreadInfo.runWorkerLoop(WorkerThreadInfo.java:153)\\n\\tat com.google.net.eventmanager.EventManagerImpl$WorkerThread.run(EventManagerImpl.java:1999)\\n'}]" - ] - } - ], - "source": [ - "import bigframes.pandas as bpd \n", - "import bigframes.enums \n", - " \n", - "# 1. Enable partial ordering mode \n", - "bpd.options.bigquery.ordering_mode = \"partial\" \n", - " \n", - "# 2. Create a DataFrame with blob data and null index \n", - "# Using from_glob_path which creates multimodal DataFrames \n", - "flattened = bpd.from_glob_path( \n", - " \"gs://your-bucket/audio-files/*\", \n", - " name=\"GCS Blob\" \n", - ") \n", - " \n", - "# Alternatively, create from URI strings with null index \n", - "# df = bpd.DataFrame({\"uri\": [\"gs://bucket/audio1.wav\", \"gs://bucket/audio2.wav\"]}) \n", - "# df[\"GCS Blob\"] = df[\"uri\"].str.to_blob() \n", - "# flattened = bpd.read_gbq_table(\"your_table\", index_col=bigframes.enums.DefaultIndexKind.NULL) \n", - " \n", - "# 3. 
This will trigger the NullIndexError \n", - "flattened[\"Transcription\"] = flattened[\"GCS Blob\"].blob.audio_transcribe( \n", - " model_name=\"gemini-2.0-flash-001\", \n", - " verbose=True, \n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/test.ipynb b/notebooks/test.ipynb deleted file mode 100644 index 85202fc76e..0000000000 --- a/notebooks/test.ipynb +++ /dev/null @@ -1,225 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "import pandas as pd\n", - "import numpy as np\n", - "\n", - "data=pd.Series(np.random.randn(8),\n", - " index=[[\"a\",\"a\",\"a\",\"b\",\n", - " \"b\",\"b\",\"c\",\"c\"],\n", - " [1,2,3,1,2,3,1,2]])\n", - "data\n", - "bq_data = bpd.read_pandas(data)\n", - "print(bq_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.loc[\"a\":\"b\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "bq_data.loc[\"a\": \"b\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job c7df462c-e617-4ca3-83a0-6d99f7494ad9 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n" - ] - } - ], - "source": [ - "import bigframes.pandas as bpd\n", - "\n", - "idx = bpd.Index(['Apple', 'Banana', 'Orange'])\n", - "print(idx.get_loc('Banana')) # Output: 1 " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "monotonic_index = bpd.Index(list('abbc'))\n", - "monotonic_index.get_loc('b') # Output: slice(1, 3, None)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d6b32147-67cc-478d-80a4-ad1a0d6615bc is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job b143b4d9-70d2-4845-8d1c-aa1cb2f4e9ed is DONE. 68 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 False\n", - "1 True\n", - "2 False\n", - "3 True\n", - "dtype: boolean" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "non_monotonic_index = bpd.Index(list('abcb'))\n", - "non_monotonic_index.get_loc('b') # Expect array([False, True, False, True])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 9d531a74-26ec-4ede-97e6-f8fc25c00068 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:149: PreviewWarning: udf is in preview.\n", - " return global_session.with_default_session(\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/functions/_function_session.py:971: PreviewWarning: input_types=Series is in preview.\n", - " warnings.warn(msg, stacklevel=1, category=bfe.PreviewWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/functions/_utils.py:86: FunctionPackageVersionWarning: numpy, pandas, and pyarrow versions in the function execution\n", - "environment may not precisely match your local environment.\n", - " warnings.warn(msg, category=bfe.FunctionPackageVersionWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 0d58b789-d658-4dd6-ba9f-3f03fc77895a is RUNNING. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import bigframes.pandas as bpd\n", - "import pandas as pd\n", - "\n", - "bpd.options.bigquery.project = 'bigframes-dev' # project #: 1084210331973\n", - "bpd.options.bigquery.location = \"us\"\n", - "\n", - "@bpd.udf(\n", - " dataset='jialuo_test_us', name='test',\n", - " packages = ['pypdf[crypto]'],\n", - " )\n", - "def func(s: bpd.Series) -> bool:\n", - " return s['a'] + s['b'] > 0\n", - "\n", - "bdf = {'a': [0, 1, 2], 'b': [3, 4, 5]}\n", - "res = bdf.apply(func, axis=1)\n", - "print(res)\n", - "\n", - "res1 = bdf.where(func)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_trancription.ipynb b/notebooks/test_blob_trancription.ipynb deleted file mode 100644 index ea7a64fc0b..0000000000 --- a/notebooks/test_blob_trancription.ipynb +++ /dev/null @@ -1,310 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# 1. 
Enable partial ordering mode\n", - "bpd.options.bigquery.ordering_mode = \"partial\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 1034294a-3478-4a47-8f3c-1db4b8aab29c is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7ff09bc9-bcd4-44bd-bed7-9a5ccc90d23a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = bpd.from_glob_path(\"gs://bigframes_blob_test/audio/*\", name=\"audio\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", - "default model will be removed in BigFrames 3.0. Please supply an\n", - "explicit model to avoid this message.\n", - " return method(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job f7a6f58f-8bd2-4874-a2cd-76b53da0fb4b is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job ca9c9951-4394-4919-b9c0-56d984a56514 is DONE. 364 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "df[\"transcribe_audio\"] = df[\"audio\"].blob.audio_transcribe(\n", - " connection=bq_connection)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:254: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", - " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 52332330-b306-4531-9e3b-fe2b7b9fb1ba is DONE. 487 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", - " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:254: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", - " warnings.warn(msg, category=bfe.AmbiguousWindowWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job b11dbe0b-58a7-4971-a93f-5a4a96838640 is DONE. 223 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:230: AmbiguousWindowWarning: Window ordering may be ambiguous, this can cause unstable results.\n", - " warnings.warn(msg, bfe.AmbiguousWindowWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
audiotranscribe_audio
0uri: gs://bigframes_blob_test/audio/LJ001-0010.wav, authorizer: bigframes-dev.us.bigframes-default-connectionNow, as all books, not primarily intended as picture books, consist principally of types composed to form letter press,
\n", - "

1 rows × 2 columns

\n", - "
[1 rows x 2 columns in total]" - ], - "text/plain": [ - " audio \\\n", - "{'uri': 'gs://bigframes_blob_test/audio/LJ001-0... \n", - "\n", - " transcribe_audio \n", - "Now, as all books, not primarily intended as pi... \n", - "\n", - "[1 rows x 2 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n# Example test\\nimport pandas as pd\\nimport bigframes.pandas as bpd # Assuming you use this for Series creation in tests\\n\\ntest_source_uris = bpd.Series([\\n \"gs://source-bucket/path/audio_001.wav\",\\n \"gs://source-bucket/another/path/to/track_beta.mp3?version=2\"\\n])\\nprint(test_source_uris)\\n# YOUR FILE_FOLDER_REGEX (ensure this matches what\\'s in your class scope)\\nTEST_REGEX = r\"gs://[^/]+/(?:.+/)?([^/?]+)(?:\\\\?.*)?\"\\nfixed_folder = \"gs://bigframes_blob_test/audio/chunked/\"\\n\\ntry:\\n expected_dest_bases = test_source_uris.str.replace(TEST_REGEX, rf\"{fixed_folder}\\x01\", regex=True)\\n print(\"Test Regex Output (destination bases):\")\\n print(expected_dest_bases.to_pandas())\\n # Expected:\\n # gs://bigframes_blob_test/audio/chunked/audio_001.wav\\n # gs://bigframes_blob_test/audio/chunked/track_beta.mp3\\nexcept Exception as e:\\n print(f\"Error in regex test: {e}\")\\n'" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "'''\n", - "# Example test\n", - "import pandas as pd\n", - "import bigframes.pandas as bpd # Assuming you use this for Series creation in tests\n", - "\n", - "test_source_uris = bpd.Series([\n", - " \"gs://source-bucket/path/audio_001.wav\",\n", - " \"gs://source-bucket/another/path/to/track_beta.mp3?version=2\"\n", - "])\n", - "print(test_source_uris)\n", - "# YOUR FILE_FOLDER_REGEX (ensure this matches what's in your class scope)\n", - "TEST_REGEX = r\"gs://[^/]+/(?:.+/)?([^/?]+)(?:\\?.*)?\"\n", - "fixed_folder = \"gs://bigframes_blob_test/audio/chunked/\"\n", - "\n", - "try:\n", - " expected_dest_bases = test_source_uris.str.replace(TEST_REGEX, rf\"{fixed_folder}\\1\", regex=True)\n", - " print(\"Test Regex Output (destination bases):\")\n", - " print(expected_dest_bases.to_pandas())\n", - " # Expected:\n", - " # gs://bigframes_blob_test/audio/chunked/audio_001.wav\n", - " # gs://bigframes_blob_test/audio/chunked/track_beta.mp3\n", - "except Exception as e:\n", - " print(f\"Error in regex test: {e}\")\n", - "'''" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_trans_blur_image.ipynb b/notebooks/test_blob_trans_blur_image.ipynb deleted file mode 100644 index e117db4475..0000000000 --- a/notebooks/test_blob_trans_blur_image.ipynb +++ /dev/null @@ -1,200 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", - " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/bigquery_options.py:363: UserWarning: This is an advanced configuration option for directly setting\n", - "endpoints. Incorrect use may lead to unexpected behavior or system\n", - "instability. Proceed only if you fully understand its implications.\n", - " warnings.warn(msg)\n" - ] - } - ], - "source": [ - "import bigframes\n", - "bigframes.options.experiments.blob = True\n", - "bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - " \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - " \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return func(get_global_session(), *args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 5da10498-1978-42fe-afea-d3b07c933daa is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "#import bigframes.pandas as bpd\n", - "df = bpd.DataFrame({\"s\": [\"gs://shuowei_bucket/images/images.jpeg\", \"gs://shuowei_bucket/images/tree.jpeg\"]})\n", - "df[\"src\"] = df[\"s\"].str.to_blob(connection=\"bigframes-dev.us.bigframes-default-connection\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 3b5d9f0e-0df7-46df-a44c-b07d9d0e96af is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:164: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " return method(self, *args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 4bf61049-4507-4925-817e-cbb8b19d5721 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df[\"result\"] = df[\"src\"].blob.image_blur(ksize=[8,8],dst=\"gs://shuowei_bucket/image_blur_transformed2/\", connection=\"bigframes-dev.us.bigframes-default-connection\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ssrcresult
0gs://shuowei_bucket/images/images.jpeg
1gs://shuowei_bucket/images/tree.jpeg
\n", - "

2 rows × 3 columns

\n", - "
[2 rows x 3 columns in total]" - ], - "text/plain": [ - " s \\\n", - "0 gs://shuowei_bucket/images/images.jpeg \n", - "1 gs://shuowei_bucket/images/tree.jpeg \n", - "\n", - " src \\\n", - "0 {'uri': 'gs://shuowei_bucket/images/images.jpe... \n", - "1 {'uri': 'gs://shuowei_bucket/images/tree.jpeg'... \n", - "\n", - " result \n", - "0 {'uri': 'gs://shuowei_bucket/image_blur_transf... \n", - "1 {'uri': 'gs://shuowei_bucket/image_blur_transf... \n", - "\n", - "[2 rows x 3 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_trans_pdf_extract.ipynb b/notebooks/test_blob_trans_pdf_extract.ipynb deleted file mode 100644 index c05acfdd01..0000000000 --- a/notebooks/test_blob_trans_pdf_extract.ipynb +++ /dev/null @@ -1,445 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:69: PreviewWarning: BigFrames Blob is still under experiments. It may not work and subject\n", - "to change in the future.\n", - " warnings.warn(msg, category=bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "import bigframes\n", - "bigframes.options.experiments.blob = True\n", - "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:114: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " return func(get_global_session(), *args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "Query job d86dc475-9976-46bf-ac41-dd5a726d7093 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job d1dacc17-89b9-4e2b-ac6c-10d550d967bb is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "extract_df = bpd.from_glob_path(\"gs://bigframes_blob_test/pdfs/*\", name=\"pdf\")\n", - "extract_df = extract_df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job a42b5a1c-9f68-4257-a7c5-2680e7036993 is DONE. 0 Bytes processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "extract_df[\"extract_text\"] = extract_df[\"pdf\"].blob.pdf_extract(\n", - " connection=bq_connection, verbose=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job a2306203-146a-4efc-b591-9928a5b6301a is DONE. 228 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 454cca1b-1c8c-493b-8d59-80d08f8a45ce is DONE. 228 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
pdfextract_text
0uri: gs://bigframes_blob_test/pdfs/test-protected.pdf, authorizer: bigframes-dev.us.bigframes-default-connection{'status': 'File has not been decrypted', 'content': ''}
1uri: gs://bigframes_blob_test/pdfs/sample-local-pdf.pdf, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': 'Sample PDF This is a testing file. Some dummy messages are used for testing purposes. '}
\n", - "

2 rows × 2 columns

\n", - "
[2 rows x 2 columns in total]" - ], - "text/plain": [ - " pdf \\\n", - "0 {'uri': 'gs://bigframes_blob_test/pdfs/test-pr... \n", - "1 {'uri': 'gs://bigframes_blob_test/pdfs/sample-... \n", - "\n", - " extract_text \n", - "0 {'status': 'File has not been decrypted', 'con... \n", - "1 {'status': '', 'content': 'Sample PDF This... \n", - "\n", - "[2 rows x 2 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_df" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 609f1624-8658-4b6e-bf9f-da8cc860b97c is DONE. 228 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 5e5306c3-2379-4e87-841a-dfe748afcc7f is DONE. 171 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 {'status': 'File has not been decrypted', 'con...\n", - "1 {'status': '', 'content': 'Sample PDF This...\n", - "Name: extract_text, dtype: struct[pyarrow]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_df[\"extract_text\"].explode()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job ba554f28-afa3-47b0-a4fc-6128e86f7547 is DONE. 367 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 86d8b95f-aa96-4bb0-8a12-da8b5cf5ca20 is DONE. 171 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 {'status': 'File has not been decrypted', 'con...\n", - "1 {'status': '', 'content': 'Sample PDF This...\n", - "Name: extract_text, dtype: struct[pyarrow]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_df[\"extract_text\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 0848e6d2-11b8-4908-b9c3-b7d5e0449b3a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/operations/blob.py:736: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n", - " \n" - ] - }, - { - "data": { - "text/html": [ - "Query job 9edb0395-9cfe-4e6b-8779-643080630518 is DONE. 228 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 \n", - "1 Sample PDF This is a testing file. 
So...\n", - "dtype: string" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_df[\"pdf\"].blob.pdf_chunk(connection=bq_connection, verbose=False).explode().to_pandas()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 0fd80b90-55d2-4827-8547-2a5c36e810ec is DONE. 765 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 9dc25ae3-fe67-4547-886a-612074660749 is DONE. 140 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "0 \n", - "1 Sample PDF This is a testing file. So...\n", - "Name: test, dtype: string" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "extract_df[\"test\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 \n", - "1 Sample PDF This is a testing file.\n", - "2 Some dummy messages are used for testing...\n", - "dtype: object\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "chunk_data = [\n", - " {\"status\": \"File has not been decrypted\", \"content\": []},\n", - " {\n", - " \"status\": \"\",\n", - " \"content\": [\"Sample PDF This is a testing file.\", \"Some dummy messages are used for testing purposes. \"],\n", - " },\n", - "]\n", - "\n", - "\n", - "content_values = []\n", - "for item in chunk_data:\n", - " if not item[\"content\"]:\n", - " content_values.append(pd.NA)\n", - " else:\n", - " content_values.extend(item[\"content\"])\n", - "\n", - "expected = pd.Series(content_values)\n", - "print(expected)\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_transcribe.ipynb b/notebooks/test_blob_transcribe.ipynb deleted file mode 100644 index 4db81494bd..0000000000 --- a/notebooks/test_blob_transcribe.ipynb +++ /dev/null @@ -1,157 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes\n", - "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = 
bigframes.session.connect(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=262006177488-ka1m0ue4fptfmt9siejdd5lom7p39upa.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fpydata-google-auth.readthedocs.io%2Fen%2Flatest%2Foauth.html&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=dACVRKbeOGCf4ooMnkd63GGaampwXy&prompt=consent&access_type=offline\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Please supply either code or authorization_response parameters.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[2], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m chunks_df \u001b[38;5;241m=\u001b[39m \u001b[43mbpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_glob_path\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgs://garrettwu_bucket/pdfs/*\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m chunks_df\u001b[38;5;241m.\u001b[39mcolumns \u001b[38;5;241m=\u001b[39m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124muri\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 3\u001b[0m bq_connection \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbigframes-dev.us.bigframes-default-connection\u001b[39m\u001b[38;5;124m\"\u001b[39m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/pandas/io/api.py:606\u001b[0m, in \u001b[0;36mfrom_glob_path\u001b[0;34m(path, connection, name)\u001b[0m\n\u001b[1;32m 603\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mfrom_glob_path\u001b[39m(\n\u001b[1;32m 604\u001b[0m path: \u001b[38;5;28mstr\u001b[39m, \u001b[38;5;241m*\u001b[39m, connection: 
Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m, name: Optional[\u001b[38;5;28mstr\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 605\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mdataframe\u001b[38;5;241m.\u001b[39mDataFrame:\n\u001b[0;32m--> 606\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mglobal_session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwith_default_session\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 607\u001b[0m \u001b[43m \u001b[49m\u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mSession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_glob_path\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 608\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 609\u001b[0m \u001b[43m \u001b[49m\u001b[43mconnection\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconnection\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 610\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 611\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/global_session.py:114\u001b[0m, in \u001b[0;36mwith_default_session\u001b[0;34m(func_, *args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mwith_default_session\u001b[39m(func_: Callable[\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;241m.\u001b[39m, _T], \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m _T:\n\u001b[0;32m--> 114\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func_(\u001b[43mget_global_session\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/global_session.py:103\u001b[0m, in \u001b[0;36mget_global_session\u001b[0;34m()\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m _global_session_lock:\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m _global_session \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 103\u001b[0m _global_session \u001b[38;5;241m=\u001b[39m \u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconnect\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 104\u001b[0m \u001b[43m \u001b[49m\u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_config\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbigquery\u001b[49m\n\u001b[1;32m 105\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _global_session\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/__init__.py:2188\u001b[0m, in \u001b[0;36mconnect\u001b[0;34m(context)\u001b[0m\n\u001b[1;32m 2187\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m 
\u001b[39m\u001b[38;5;21mconnect\u001b[39m(context: Optional[bigquery_options\u001b[38;5;241m.\u001b[39mBigQueryOptions] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Session:\n\u001b[0;32m-> 2188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSession\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/__init__.py:170\u001b[0m, in \u001b[0;36mSession.__init__\u001b[0;34m(self, context, clients_provider)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_clients_provider \u001b[38;5;241m=\u001b[39m clients_provider\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 170\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_clients_provider \u001b[38;5;241m=\u001b[39m \u001b[43mbigframes\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msession\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclients\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mClientsProvider\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mproject\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_location\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_regional_endpoints\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43muse_regional_endpoints\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcredentials\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcredentials\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 175\u001b[0m \u001b[43m \u001b[49m\u001b[43mapplication_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapplication_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[43m \u001b[49m\u001b[43mbq_kms_key_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_bq_kms_key_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 177\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient_endpoints_override\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient_endpoints_override\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 178\u001b[0m \u001b[43m \u001b[49m\u001b[43mrequests_transport_adapters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mrequests_transport_adapters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 179\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;66;03m# TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494\u001b[39;00m\n\u001b[1;32m 182\u001b[0m \u001b[38;5;66;03m# has been fixed. The ibis client changes the default query job config\u001b[39;00m\n\u001b[1;32m 183\u001b[0m \u001b[38;5;66;03m# so we are going to remember the current config and restore it after\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;66;03m# the ibis client has been created\u001b[39;00m\n\u001b[1;32m 185\u001b[0m original_default_query_job_config \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mdefault_query_job_config\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/clients.py:91\u001b[0m, in \u001b[0;36mClientsProvider.__init__\u001b[0;34m(self, project, location, use_regional_endpoints, credentials, application_name, bq_kms_key_name, client_endpoints_override, requests_transport_adapters)\u001b[0m\n\u001b[1;32m 89\u001b[0m credentials_project \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m credentials \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 91\u001b[0m credentials, credentials_project \u001b[38;5;241m=\u001b[39m \u001b[43m_get_default_credentials_with_project\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 93\u001b[0m \u001b[38;5;66;03m# Ensure an access token is available.\u001b[39;00m\n\u001b[1;32m 94\u001b[0m credentials\u001b[38;5;241m.\u001b[39mrefresh(google\u001b[38;5;241m.\u001b[39mauth\u001b[38;5;241m.\u001b[39mtransport\u001b[38;5;241m.\u001b[39mrequests\u001b[38;5;241m.\u001b[39mRequest())\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/clients.py:54\u001b[0m, in \u001b[0;36m_get_default_credentials_with_project\u001b[0;34m()\u001b[0m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21m_get_default_credentials_with_project\u001b[39m():\n\u001b[0;32m---> 54\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mpydata_google_auth\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdefault\u001b[49m\u001b[43m(\u001b[49m\u001b[43mscopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m_SCOPES\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_local_webserver\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/pydata_google_auth/auth.py:151\u001b[0m, in \u001b[0;36mdefault\u001b[0;34m(scopes, client_id, client_secret, credentials_cache, use_local_webserver, auth_local_webserver, redirect_uri)\u001b[0m\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m credentials \u001b[38;5;129;01mand\u001b[39;00m credentials\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[1;32m 149\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m credentials, default_project\n\u001b[0;32m--> 151\u001b[0m credentials \u001b[38;5;241m=\u001b[39m \u001b[43mget_user_credentials\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 152\u001b[0m \u001b[43m \u001b[49m\u001b[43mscopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 153\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclient_id\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mclient_secret\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mclient_secret\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mcredentials_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcredentials_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_local_webserver\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_local_webserver\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43mredirect_uri\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mredirect_uri\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m credentials \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m credentials\u001b[38;5;241m.\u001b[39mvalid:\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mPyDataCredentialsError(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCould not get any valid credentials.\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/pydata_google_auth/auth.py:400\u001b[0m, in \u001b[0;36mget_user_credentials\u001b[0;34m(scopes, client_id, client_secret, credentials_cache, use_local_webserver, auth_local_webserver, redirect_uri)\u001b[0m\n\u001b[1;32m 398\u001b[0m credentials \u001b[38;5;241m=\u001b[39m _webserver\u001b[38;5;241m.\u001b[39mrun_local_server(app_flow, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mAUTH_URI_KWARGS)\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 400\u001b[0m credentials \u001b[38;5;241m=\u001b[39m \u001b[43m_run_webapp\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43mapp_flow\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mredirect_uri\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mredirect_uri\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mAUTH_URI_KWARGS\u001b[49m\n\u001b[1;32m 402\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m oauthlib\u001b[38;5;241m.\u001b[39moauth2\u001b[38;5;241m.\u001b[39mrfc6749\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mOAuth2Error \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exceptions\u001b[38;5;241m.\u001b[39mPyDataCredentialsError(\n\u001b[1;32m 406\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUnable to get valid credentials: \u001b[39m\u001b[38;5;132;01m{}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mformat(exc)\n\u001b[1;32m 407\u001b[0m )\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/pydata_google_auth/auth.py:59\u001b[0m, in \u001b[0;36m_run_webapp\u001b[0;34m(flow, redirect_uri, **kwargs)\u001b[0m\n\u001b[1;32m 56\u001b[0m authorization_code_message \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnter the authorization code: \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 58\u001b[0m code \u001b[38;5;241m=\u001b[39m \u001b[38;5;28minput\u001b[39m(authorization_code_message)\n\u001b[0;32m---> 59\u001b[0m \u001b[43mflow\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch_token\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m flow\u001b[38;5;241m.\u001b[39mcredentials\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google_auth_oauthlib/flow.py:285\u001b[0m, in \u001b[0;36mFlow.fetch_token\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 283\u001b[0m kwargs\u001b[38;5;241m.\u001b[39msetdefault(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_secret\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclient_config[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mclient_secret\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 284\u001b[0m kwargs\u001b[38;5;241m.\u001b[39msetdefault(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcode_verifier\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcode_verifier)\n\u001b[0;32m--> 285\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moauth2session\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfetch_token\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient_config\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mtoken_uri\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/requests_oauthlib/oauth2_session.py:278\u001b[0m, in \u001b[0;36mOAuth2Session.fetch_token\u001b[0;34m(self, token_url, code, authorization_response, body, auth, username, password, method, force_querystring, timeout, headers, verify, proxies, include_client_id, client_secret, cert, **kwargs)\u001b[0m\n\u001b[1;32m 276\u001b[0m code 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_client\u001b[38;5;241m.\u001b[39mcode\n\u001b[1;32m 277\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m code:\n\u001b[0;32m--> 278\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 279\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease supply either code or \u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mauthorization_response parameters.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 280\u001b[0m )\n\u001b[1;32m 282\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pkce:\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_code_verifier \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "\u001b[0;31mValueError\u001b[0m: Please supply either code or authorization_response parameters." - ] - } - ], - "source": [ - "chunks_df = bpd.from_glob_path(\"gs://garrettwu_bucket/pdfs/*\")\n", - "chunks_df.columns = [\"uri\"]\n", - "bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "chunks_df[\"chunk_text\"] = chunks_df[\"uri\"].blob.pdf_chunk(\n", - " connection=bq_connection, chunk_size=2000, overlap_size=200,\n", - " max_batching_rows=1\n", - ")\n", - "chunk_df_exploded = chunks_df[\"chunk_text\"].explode()\n", - "chunk_df_exploded.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = bpd.from_glob_path(\"gs://bigframes_blob_test/audio/*\", name=\"audio\")\n", - "\n", - "#df[\"audio\"] = \"gs://bigframes_blob_test/audio/LJ001-0010.wav\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# copy files again, now we have 1000 audio files\n", - "copies = [df] * 3\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()\n", - "df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# copy files again, now we have 1,000,000 audio files\n", - "#copies = [df] * 2 * 100\n", - "#df = bpd.concat(copies, ignore_index=True)\n", - "#df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n", - "# gemini-2.5-pro-preview-05-20" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_transcribe_1M_short_audio.ipynb b/notebooks/test_blob_transcribe_1M_short_audio.ipynb deleted file mode 100644 index f494133f53..0000000000 --- a/notebooks/test_blob_transcribe_1M_short_audio.ipynb +++ /dev/null @@ 
-1,450 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", - " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" - ] - } - ], - "source": [ - "import bigframes\n", - "bigframes.options.experiments.blob = True\n", - "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 7d45f84a-01f6-413a-a8f5-04fe41cf60c7 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 79486ecb-ac6d-4e05-b348-809d4baa6da4 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = bpd.from_glob_path(\"gs://shuowei_bucket/audio/*\", name=\"audio\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job cc70ecf3-5095-4f3f-8c49-796dfc68af69 is DONE. 354.8 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# copy files again, now we have 1000 audio files\n", - "copies = [df] * 2 * 100\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job d3b7a0c7-0359-4aa9-846d-f99250cbe37b is DONE. 96.0 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# copy files again, now we have 1,000,000 audio files\n", - "copies = [df] * 2 * 100\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 5ab7d996-8290-4526-954f-64e31a3f3d81 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job fcfd94e4-9e14-46b0-a187-6cc65885d559 is DONE. 19.2 MB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "Load job 586c243f-92c4-43cd-871e-09add370f1c9 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "df[\"text\"] = df[\"audio\"].blob.transcribe(df=df, audio_column=\"audio\", model_name=\"gemini-2.0-flash-001\", verbose=True)\n", - "# gemini-2.5-pro-preview-05-06" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job b3ca3d9c-b359-4e8d-8675-7ad6d5609762 is DONE. 164.0 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 0405effe-08f2-4d4d-ad7f-6e8876e23e94 is DONE. 64.5 MB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
audiotext
0uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
1uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
2uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
3uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"was rather interesting just to watch them gathering their materials and bouncing around. Yeah, they are what they call it kangaroo walk or or something. Really? Uh, something like that. They named it that. I don't know. I bet those men are going to get quite a reception when they get back to earth. Oh, yes. I'll be so glad when they land back now, but I think that's uh pretty well uh fact because they've landed so many safely now that I I feel relieved. Just getting off of the moon was the thing that was. Have they met with the um one that was circling? Yes, they've rendezvoused. So I understand. I that wasn't shown either, so I but so they say they have rendezvoused, so that's a matter of making the circles and then coming down. What do you sort of imagine for the future? Do you imagine them sending a I think they will. I think they will do some more exploring up there. Right. Positive, you know, to continue with this. Uh-huh. Because that was such a very small area when you think of it that they just gathered uh rocks and and the oh samples of soil and all and they did uh probe for some samples. And just what is going to come of that, I don't know. I'd be glad to read it.\"}
4uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Say this is entered customs. And uh the next step is to prepare a Mexican customs entry. Mexico call them pedimentos. It's really an appraisal sheet. It's it's a breakdown of the sheet. You use the same information. That's right. But here you give them all the information, for example, let me see, I found one right here. You give them all the information on on what the invoice contains. Here's one. On what the invoice contains, what uh what what the material is itself. If you hear, for example, this is some bottling equipment going over there and on the appraisal sheet, so you tell them who it's for, who made the invoice out of the states. This came from St. Louis, Missouri. Yeah. How much the freight was. And on the reverse of this of this thing, but the everything has to be marked. So complete identification. This identification number is that traffic number I spoke to you a little while ago. Number of boxes, what they contain, the tariff duty, the tariff number, the ad valorem rate, and what the actual duties would be paid. For example, here would be 996 pesos and 73 cents. And as you can see, this is a form that has already been approved by the appraiser. These appraisers are called Vistas. It's very different from the Vistas we have nowadays here.\"}
5uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"I'll tell you something, when the first revelation Joseph Smith received about marriage of any kind was that you couldn't get anywhere, you into the celestial kingdom. No man could get into the celestial kingdom without a wife, because he's only half there if he hasn't got a wife. And no woman, of course, could get there without a husband either, because she's only half there. Well, now at that time, the Mormons were very strict, uh, uh, religious people, you see. Sure. Sure. And, uh, thought that a man had to live clean all his life as well as a woman did in order to be worthy to be a man. Well, this is still true. Well, it's still true certainly. But I said, in that those circumstances, at that time, there were, uh, about three times as many women that joined the church and were faithful Latter-day Saints than there were men. When Joseph Smith comes out with the revelation that, uh, marriage was for eternity. And if if you're going to be separated from death do you part, then you can't get into the celestial kingdom at all to become as God is. Now, they got that doctrine. And that was, the prophet said that, they believed that. What about these here two-thirds of the women? Yeah, that's a good question. What is going to become of us then? It won't do us any good to go out here and marry one of these scallywags that spends his time drinking and boozing around and mining and maybe works in the mine a while and does all kind of skullduggery. That won't do us any good to marry that kind of a man. Besides, we don't have a chance to raise a respectable family with that kind of a man. So, what are we going to do about it now? What's going to become of us? And Joseph Smith always prayed about every problem that came to him. And there's where he got the revelation that if they live thoroughly clean and faithful, if they can understand the purity and sacredness of sex, so they don't desecrate it, then one man can have a number of wives and and permit them to raise a respectable, honorable family as well as have those wives throughout eternity, have their husband to take them into the celestial kingdom. Now, there was the origin of polygamy.\"}
6uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
7uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
8uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
9uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
10uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
11uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
12uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
13uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
14uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
15uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
16uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
17uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
18uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
19uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
20uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
21uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Uh, I hear that there's a Hawaiian custom of something called a Hukilau. Hukilau, Hukilau is a they have a rope and on the rope they have a net there. Mhm. They surround at a certain place and they drag the net. They surround it, you mean they carry the net out? Carry the net, uh-huh. Uh, how do they is there something to keep the net floating? Yeah, some they have those floaters uh floating. Mhm. Some they have a lead down at the bottom, then they drag that uh net in. What do the floats what are the floats made of? The float made of those uh some coconut uh dry coconut I guess. Oh, dry coconut. That's a good use for coconut. Uh, and that floats the top of the net. Top of the net. Now, the word lau means leaf, doesn't it? Lau, lau is a yeah, yeah, lau is a leaf. All right, well now the the where does the leaf come in in this Hukilau? Hukilau, the leaves, you know, they stay on at the top top of the rope. So that the when they pull, the fish going to not to jump over the rope. The ropes, the the uh the leaves keep the fish from jumping over to the Jump over. They scare they scare them away so they want to go underneath the water, see. When moment she go underneath the water there, they will go to the net. They have a pocket on the net. They have a pocket on the net. Now the bottom of the net then Bottom of the net they have a pocket in. Mhm. The the bottom must have some kind of things to make it sink. Sink is a lead, they have a lead. Oh, they have lead on it. I see. Then uh uh somebody goes out and puts the net out around. Yeah, they throw it with a boat. With a boat? Two two boats they goes out. Throw it all around. And then uh who pulls it in? Well, they have a lots of uh from the shore about 40, 50 guys they pull that rope up. Ah, I see. Yeah. And uh they must have a pretty big school of fish there. Yeah, once in a while they have a nice school school, right? And then they pull them all in and the fish don't are scared to jump out so they get caught. No, they all in up on the net already. Mhm. They get caught in the net. I suppose the top of the net comes in sooner than the bottom so they if they do try to go out there, that's pocket. Yeah, they have to they going to, you know, about six or seven guys is swimming on it around the net to they're watching how the fish going. Oh, I see. They they're chasing at the hole. Uh-huh. Well, about how much fish do they get out of a good haul of those? Once I saw up Waimea they caught about about a ton I guess, you know, those Akule. Mhm. Oh, they have a nice crew up there. What's Akule? Mm, the Akule some kind of, you know, that's Hawaiian used to call Akule, Akule. They have been calling it Aji in Japan. \"}
22uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
23uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
24uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': None}
\n", - "

25 rows × 2 columns

\n", - "
[200000 rows x 2 columns in total]" - ], - "text/plain": [ - " audio \\\n", - "0 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "1 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "2 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "3 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "4 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "5 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "6 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "7 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "8 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "9 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "10 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "11 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "12 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "13 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "14 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "15 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "16 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "17 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "18 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "19 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "20 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "21 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "22 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "23 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "24 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "\n", - " text \n", - "0 {'status': '', 'content': None} \n", - "1 {'status': '', 'content': None} \n", - "2 {'status': '', 'content': None} \n", - "3 {'status': '', 'content': \"was rather interest... \n", - "4 {'status': '', 'content': \"Say this is entered... \n", - "5 {'status': '', 'content': \"I'll tell you somet... \n", - "6 {'status': '', 'content': None} \n", - "7 {'status': '', 'content': None} \n", - "8 {'status': '', 'content': None} \n", - "9 {'status': '', 'content': None} \n", - "10 {'status': '', 'content': None} \n", - "11 {'status': '', 'content': None} \n", - "12 {'status': '', 'content': None} \n", - "13 {'status': '', 'content': None} \n", - "14 {'status': '', 'content': None} \n", - "15 {'status': '', 'content': None} \n", - "16 {'status': '', 'content': None} \n", - "17 {'status': '', 'content': None} \n", - "18 {'status': '', 'content': None} \n", - "19 {'status': '', 'content': None} \n", - "20 {'status': '', 'content': None} \n", - "21 {'status': '', 'content': \"Uh, I hear that the... 
\n", - "22 {'status': '', 'content': None} \n", - "23 {'status': '', 'content': None} \n", - "24 {'status': '', 'content': None} \n", - "...\n", - "\n", - "[200000 rows x 2 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb b/notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb deleted file mode 100644 index 94ecbccd24..0000000000 --- a/notebooks/test_blob_transcribe_1M_short_audio_v1.ipynb +++ /dev/null @@ -1,345 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", - " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" - ] - } - ], - "source": [ - "import bigframes\n", - "bigframes.options.experiments.blob = True\n", - "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 111af7c8-313c-4884-b630-632a4b1ea5fe is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 3a680a1a-208b-4c9f-8637-eb048b0a651d is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = bpd.from_glob_path(\"gs://shuowei_bucket/audio/*\", name=\"audio\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job ce7097d7-b758-49d4-af1b-3963f44c7df3 is DONE. 354.8 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# copy files again, now we have 1000 audio files\n", - "copies = [df] * 2 * 100\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job f94ca3c8-fc8b-4102-ada0-c0db8c4716e1 is DONE. 96.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# copy files again, now we have 1,000,000 audio files\n", - "copies = [df] * 2 * 100\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 0a889fd5-0cc6-4f1d-8154-30fb00b394be is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 853cc347-ae70-4d1a-aa20-c5801153217f is RUNNING. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requested cancellation for Query job 853cc347-ae70-4d1a-aa20-c5801153217f in location US...\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[5], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maudio\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mblob\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtranscribe\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maudio_column\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43maudio\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmodel_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgemini-2.0-flash-001\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mverbose\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;66;03m# gemini-2.5-pro-preview-05-06\u001b[39;00m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that 
are implemented in pandas but either missing (TypeError) or not fully supported (NotImplementedError) in BigFrames; logging is currently supported only when we can access the bqclient through _block.session.bqclient.\n",
-    "File bigframes/operations/blob.py:820, in BlobAccessor.transcribe\n",
-    "    results = model.predict(X=df_prompt, prompt=[df_prompt[\"prompt\"], df_prompt[audio_column]], temperature=temperature, output_schema=output_schema)\n",
-    "File bigframes/ml/llm.py:738, in GeminiTextGenerator.predict\n",
-    "    return self._predict_and_retry(core.BqmlModel.generate_table_tvf, X, options=options, max_retries=max_retries)\n",
-    "File bigframes/ml/base.py:266, in RetriableRemotePredictor._predict_and_retry\n",
-    "    df = bqml_model_predict_tvf.tvf(self._bqml_model, df_fail, options)\n",
-    "File bigframes/ml/core.py:202, in BqmlModel.generate_table -> self._apply_ml_tvf(..., ai_generate_table(source_sql=source_sql, struct_options=options))\n",
-    "File bigframes/ml/core.py:88, in BqmlModel._apply_ml_tvf -> df = self._session.read_gbq(result_sql, index_col=index_col_ids)\n",
-    "File bigframes/session/__init__.py:445, in Session.read_gbq -> self._loader.read_gbq_query(...)\n",
-    "File bigframes/session/loader.py:824, in GbqDataLoader.read_gbq_query -> self._query_to_destination(query, cluster_candidates=[], configuration=configuration)\n",
-    "File bigframes/session/loader.py:906, in GbqDataLoader._query_to_destination -> self._start_query(query, job_config=job_config, timeout=timeout)\n",
-    "File bigframes/session/loader.py:937, in GbqDataLoader._start_query -> bf_io_bigquery.start_query_with_client(self._bqclient, sql, job_config=job_config, timeout=timeout)\n",
-    "File bigframes/session/_io/bigquery/__init__.py:280, in start_query_with_client -> formatting_helpers.wait_for_query_job(query_job, progress_bar=opts.progress_bar)\n",
-    "File bigframes/formatting_helpers.py:149, in wait_for_query_job -> query_result = query_job.result(max_results=max_results, page_size=page_size)\n",
-    "File google/cloud/bigquery/job/query.py:1681, in QueryJob.result -> while not is_job_done(): pass\n",
-    "[... retry and transport frames elided: google.api_core.retry -> QueryJob._reload_query_results -> Client._get_query_results -> google.cloud._http -> requests -> urllib3 -> http.client -> ssl -> socket ...]\n",
-    "KeyboardInterrupt: "
-   ]
-  }
- ],
- "source": [
-  "# bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
-  "df[\"text\"] = df[\"audio\"].blob.transcribe(df=df, audio_column=\"audio\", model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
-  "# gemini-2.5-pro-preview-05-06"
- ]
- },
- {
-  "cell_type": "code",
-  "execution_count": null,
-  "metadata": {},
-  "outputs": [
-   {
-    "data": {
-     "text/html": [
-      "Query job ed3d84ac-2e1b-42ab-b867-58560e1a3167 is DONE. 8.8 kB processed. Open Job"
-     ],
-     "text/plain": [""]
-    },
-    "metadata": {},
-    "output_type": "display_data"
-   },
-   {
-    "data": {
-     "text/html": [
-      "Query job a65b6550-f250-4052-92e2-492ab58da692 is DONE. 8.8 kB processed. Open Job"
-     ],
-     "text/plain": [""]
-    },
-    "metadata": {},
-    "output_type": "display_data"
-   },
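The traceback above was captured after interrupting the cell by hand: the call chain blocks in QueryJob.result(), polling jobs.getQueryResults until the BigQuery job finishes. A minimal sketch of bounding such a run before it starts, using the compute option the traceback itself passes through (bigframes/session/loader.py:932); the 10 GB cap is an arbitrary illustration, not a recommendation:

```python
import bigframes.pandas as bpd

# Queries whose bytes billed would exceed the cap fail fast at submission
# instead of blocking in QueryJob.result(); the value here is illustrative.
bpd.options.compute.maximum_bytes_billed = 10 * 1024**3  # ~10 GB
```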
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
audiotext
0uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"I'll tell you something, when the first revelation Joseph Smith received about marriage of any kind was that you couldn't get anywhere, you into the celestial kingdom. No man could get into the celestial kingdom without a wife, because he's only half there if he hasn't got a wife. And no woman, of course, could get there without a husband either, because she's only half there. Well, now at that time, the Mormons were very strict, uh, uh, religious people, you see. Sure. Sure. And, uh, thought that a man had to live clean all his life as well as a woman did in order to be worthy to be a man. Well, this is still true. Well, it's still true certainly. But I said, in that those circumstances, at that time, there were, uh, about three times as many women that joined the church and were faithful Latter-day Saints than there were men. When Joseph Smith comes out with the revelation that, uh, marriage was for eternity. And if if you're going to be separated from death do you part, then you can't get into the celestial kingdom at all to become as God is. Now, they got that doctrine. And that was, the prophet said that, they believed that. What about these here two-thirds of the women? Yeah, that's a good question. What is going to become of us then? It won't do us any good to go out here and marry one of these scallywags that spends his time drinking and boozing around and mining and maybe works in the mine a while and does all kind of skullduggery. That won't do us any good to marry that kind of a man. Besides, we don't have a chance to raise a respectable family with that kind of a man. So, what are we going to do about it now? What's going to become of us? And Joseph Smith always prayed about every problem that came to him. And there's where he got the revelation that if they live thoroughly clean and faithful, if they can understand the purity and sacredness of sex, so they don't desecrate it, then one man can have a number of wives and and permit them to raise a respectable, honorable family as well as have those wives throughout eternity, have their husband to take them into the celestial kingdom. Now, there was the origin of polygamy.\"}
1uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Uh, I hear that there's a Hawaiian custom of something called a hukilau. Hukilau. Hukilau is they have a rope and on the rope they have a net there. Mhm. They surround it a certain place and they drag the net. They surround it, you mean they carry the net out? Carry the net, uh-huh. Uh, how do they, is there something to keep the net floating? Yeah, some they have those floaters uh floating. Mhm. Some they have a lead down on the bottom, then they drag that uh net in. What do the floats, what are the floats made of? The float made of those uh some coconut uh dry coconut, I guess. Oh, dry coconut. Yeah. That's a good use for coconut. Uh, and that floats the top of the net. Top of the net. Now, the word lau means leaf, doesn't it? Lau, lau is a yeah, yeah, lau is a leaf. All right. Well now, the the where does the leaf come in in this hukilau? Hukilau, the leaves, you know, they stay on the top, top of the rope. So that the when they pull, the fish going to not to jump over the rope. The ropes, the the uh the leaves keep the fish from jumping over to the scared, they scared them away. They don't want to go underneath the water, see. When some moment she go underneath the water there, they will go to the net. They have a pocket on the net. They have a pocket on the net. Now the bottom of the net then. Bottom of the net they have a pocket in. Mhm. The water must have some kind of things to make it sink. Sink is a lead, they have a lead. Oh, they have lead on it. I see. Then uh uh somebody goes out and puts the net out around. Yeah, they tow it with a boat. With a boat? Two, two boats they goes out. Tow it all around. And then uh who pulls it in? Well, they have a lots of uh from the shore about 40, 50 guys they pull that rope up. Ah, I see. Yeah. And uh they must have a pretty big school of fish there. Yeah, once in a while they have a nice school, right? And then they pull them all in and the fish don't are scared to jump out so they get caught. No, they all in up on the net already. Mhm. They get caught in the net. I suppose the top of the net comes in sooner than the bottom so they if they do try to go out there, that's pocket. Yeah, they have to take on uh, you know, about six or seven guys is swimming on it around the net to they're watching how the fish going. Oh, I see. They they're chasing at the hole. Uh-huh. Well, about how much fish do they get out of a good haul of those? Once I saw up Waimea, they caught about about a ton, I guess, you know, those akule. Mhm. Oh, they have a nice crew up there. What's akule? Mm, the akule some kind of, you know, that's Hawaiian used to call akule, akule. They have been calling it Aji in Japan.\"}
2uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"the soapstone carvings are those done, have you done any of those? Yes. Yeah. It's pretty much like ivory carving. Only it takes more care. What? To work on soapstone. Why? Why is it? Because it is brittle. Oh, it is. And very soft. Mhm. Uh, you you can hack on ivory, but you can't do that on soapstone. Chip too easily. Uh-huh. But then you have to use your files on those. Yes. And then once you've done the filing and so on, how do you smooth it down? Uh, we I use uh fine file. Mhm. To finish it up and then use uh sandpaper or emery cloth. Mhm. I think you said that um quite often the thing that is carved is determined by the shape of the piece of stone that one starts carving. Yes, yeah. Sometimes uh uh an ivory carver or a soapstone carver will take advantage of the shape of the stone. Mhm. And uh try to visualize what it'd look like. Uh, maybe a polar bear or or Mhm. He makes it just the way it is.\"}
3uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"was rather interesting just to watch them gathering their materials and bouncing around. Yeah, they they are what they call it kangaroo walk or or something. Really? Uh, something like that. They named it that. I don't know. I bet those men are going to get quite a reception when they get back to earth. Oh, yes. I'll be so glad when they land back now, but I think that's uh pretty well uh fact because they've landed so many safely now that I I feel really relieved. Just getting off of the moon was the thing that was. Have they met with the um one that was circling? Yes, they've rendezvoused. So I understand. I that wasn't shown either, so I but uh they say they have rendezvoused, so that's a matter of making the circles and then coming down. What do you sort of imagine for the future? Do you imagine them sending a I think they will. I think they will will do some more exploring up there. Right. Positive, you know, to continue with this. Uh-huh. Because that was such a very small area when you think of it that they just gathered uh rocks and and uh oh samples of soil and all and they did uh probe for some samples. And just what is going to come of that, I don't know. I'd be glad to read it.\"}
4uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Say this is entered customs. And uh the next step is to prepare a Mexican customs entry. Mexico call them pedimentos. It's really an appraisal sheet. It's it's a breakdown of the sheet. You use the same information. That's right. But here you give them all the information, for example, let me see, I found one right here. You give them all the information on on what the invoice contains. Here's one. On what the invoice contains, what uh what what the material is itself. If you hear, for example, this is some bottling equipment going over there and on the appraisal sheet, so you tell them who it's for, who made the invoice out of the states. This came from St. Louis, Missouri. Yeah. How much the freight was. And on the reverse of this of this thing, but the everything has to be marked. So complete identification. This identification number is that traffic number I spoke to you a little while ago. Number of boxes, what they contain, the tariff duty, the tariff number, the ad valorem rate, and what the actual duties would be paid. For example, here would be 996 pesos and 73 cents. And as you can see, this is a form that has already been approved by the appraiser. These appraisers are called Vistas. It's very different from the Vistas we have nowadays here.\"}
\n", - "

5 rows × 2 columns

\n", - "
[5 rows x 2 columns in total]" - ], - "text/plain": [ - " audio \\\n", - "0 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "1 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "2 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "3 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "4 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "\n", - " text \n", - "0 {'status': '', 'content': \"I'll tell you somet... \n", - "1 {'status': '', 'content': \"Uh, I hear that the... \n", - "2 {'status': '', 'content': \"the soapstone carvi... \n", - "3 {'status': '', 'content': \"was rather interest... \n", - "4 {'status': '', 'content': \"Say this is entered... \n", - "\n", - "[5 rows x 2 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_blob_transcribe_long_audio.ipynb b/notebooks/test_blob_transcribe_long_audio.ipynb deleted file mode 100644 index 7292981d7d..0000000000 --- a/notebooks/test_blob_transcribe_long_audio.ipynb +++ /dev/null @@ -1,315 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. This flag is no longer needed.\n", - " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" - ] - } - ], - "source": [ - "import bigframes\n", - "bigframes.options.experiments.blob = True\n", - "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 5bbcf8fc-1e44-46c1-84b8-0629383cc1e7 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7f3923d3-999f-45d3-9b86-dc8595de53ac is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = bpd.from_glob_path(\"gs://shuowei_bucket/long_audio/*\", name=\"audio\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 05eae664-16d7-4d12-a4f0-94ffcd81dde6 is DONE. 533.0 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# copy files again, now we have 1000 audio files\n", - "copies = [df] * 2 * 100\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job eb3e5b47-54c5-4ec4-9edf-8ef0b2d0629d is DONE. 183.8 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# copy files again, now we have 1,000,000 audio files\n", - "copies = [df] * 2 * 100\n", - "df = bpd.concat(copies, ignore_index=True)\n", - "df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 42a64933-a2dc-4b59-9198-2e746f7fa38f is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 475b4580-c2ae-4c22-8cb6-0aa06f419c69 is RUNNING. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job b1947d40-947f-4693-bf90-24c48564df5d is RUNNING. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "BadRequest", - "evalue": "400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/b1947d40-947f-4693-bf90-24c48564df5d?maxResults=0&location=US&prettyPrint=false: Operation timed out after 6.0 hours. Consider reducing the amount of work performed by your operation so that it can complete within this limit.\n\nLocation: US\nJob ID: b1947d40-947f-4693-bf90-24c48564df5d\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0. [{'@type': 'type.googleapis.com/google.rpc.DebugInfo', 'detail': '[TIMEOUT] errorProto=code: \"TIMEOUT\"\\nargument: \"Operation timed out after 6.0 hours. 
Consider reducing the amount of work performed by your operation so that it can complete within this limit.\"'}]",
-     "output_type": "error",
-     "traceback": [
-      "---------------------------------------------------------------------------",
-      "BadRequest                                Traceback (most recent call last)",
-      "Cell In[7], line 2\n      1 #bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n----> 2 df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n      3 # gemini-2.5-pro-preview-05-06\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175, in method_logger.<locals>.wrapper(*args, **kwargs)\n--> 175 return method(*args, **kwargs)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/operations/blob.py:789, in audio_transcribe(self, connection, model_name, verbose)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/ml/llm.py:745, in GeminiTextGenerator.predict(self, X, temperature, max_output_tokens, top_k, top_p, ground_with_google_search, max_retries, prompt, output_schema)\n--> 745 return self._predict_and_retry(core.BqmlModel.generate_text_tvf, X, options=options, max_retries=max_retries)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/ml/base.py:266, in RetriableRemotePredictor._predict_and_retry(self, bqml_model_predict_tvf, X, options, max_retries)\n--> 266 df = bqml_model_predict_tvf.tvf(self._bqml_model, df_fail, options)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:171, in BqmlModel.generate_text(self, input_data, options)\n--> 171 return self._apply_ml_tvf(input_data, lambda source_sql: self._model_manipulation_sql_generator.ml_generate_text(source_sql=source_sql, struct_options=options))\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/ml/core.py:88, in BqmlModel._apply_ml_tvf(self, input_data, apply_sql_tvf)\n---> 88 df = self._session.read_gbq(result_sql, index_col=index_col_ids)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/session/__init__.py:446, in Session.read_gbq(self, query_or_table, index_col, columns, configuration, max_results, filters, use_cache, col_order, dry_run)\n--> 446 return self._loader.read_gbq_query(query_or_table, index_col=index_col, columns=columns, configuration=configuration, max_results=max_results, use_cache=use_cache, filters=filters, dry_run=dry_run)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:837, in GbqDataLoader.read_gbq_query(self, query, index_col, columns, configuration, max_results, use_cache, filters, dry_run, force_total_order, allow_large_results)\n--> 837 destination, query_job = self._query_to_destination(query, cluster_candidates=[], configuration=configuration)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:972, in GbqDataLoader._query_to_destination(self, query, cluster_candidates, configuration, do_clustering)\n--> 972 query_job = self._start_query_with_job(query, timeout=timeout)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/session/loader.py:1027, in GbqDataLoader._start_query_with_job(self, sql, job_config, timeout)\n--> 1027 _, query_job = bf_io_bigquery.start_query_with_client(self._bqclient, sql, job_config=job_config, timeout=timeout, location=None, project=None, metrics=None, query_with_job=True)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314, in start_query_with_client(bq_client, sql, job_config, location, project, timeout, metrics, query_with_job)\n--> 314 results_iterator = formatting_helpers.wait_for_query_job(query_job, progress_bar=opts.progress_bar)\n",
-      "File ~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149, in wait_for_query_job(query_job, max_results, page_size, progress_bar)\n--> 149 query_result = query_job.result(max_results=max_results, page_size=page_size)\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1681, in QueryJob.result(self, page_size, max_results, retry, timeout, start_index, job_retry)\n-> 1681 while not is_job_done():\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1650, in QueryJob.result.<locals>.is_job_done()\n-> 1650 self._reload_query_results(retry=retry, **reload_query_results_kwargs)\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1448, in QueryJob._reload_query_results(self, retry, timeout, page_size)\n-> 1448 self._query_results = self._client._get_query_results(self.job_id, retry, project=self.project, timeout_ms=timeout_ms, location=self.location, timeout=transport_timeout, page_size=page_size)\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:2034, in Client._get_query_results(self, job_id, retry, project, timeout_ms, location, timeout, page_size)\n-> 2034 resource = self._call_api(retry, span_name=\"BigQuery.getQueryResults\", span_attributes=span_attributes, method=\"GET\", path=path, query_params=extra_params, timeout=timeout)\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/client.py:843, in Client._call_api(self, retry, span_name, span_attributes, job_ref, headers, **kwargs)\n--> 843 return call()\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\n--> 153 _retry_error_helper(exc, deadline, sleep, error_list, predicate, on_error, exception_factory, timeout)\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212, in _retry_error_helper(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\n--> 212 raise final_exc from source_exc\n",
-      "File ~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/_http/__init__.py:494, in JSONConnection.api_request(self, method, path, query_params, data, content_type, headers, api_base_url, api_version, expect_json, _target_object, timeout, extra_api_info)\n--> 494 raise exceptions.from_http_response(response)\n",
-      "BadRequest: 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/bigframes-dev/queries/b1947d40-947f-4693-bf90-24c48564df5d?maxResults=0&location=US&prettyPrint=false: Operation timed out after 6.0 hours. Consider reducing the amount of work performed by your operation so that it can complete within this limit.\n\nLocation: US\nJob ID: b1947d40-947f-4693-bf90-24c48564df5d\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0."
-     ]
-    }
-   ],
-   "source": [
-    "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n",
-    "df[\"text\"] = df[\"audio\"].blob.audio_transcribe(model_name=\"gemini-2.0-flash-001\", verbose=True)\n",
-    "# gemini-2.5-pro-preview-05-06"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   
"metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 310b7d0c-1369-4f83-b3ab-0627540e8c66 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "NotFound", - "evalue": "404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 310b7d0c-1369-4f83-b3ab-0627540e8c66\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNotFound\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/core/formatters.py:770\u001b[0m, in \u001b[0;36mPlainTextFormatter.__call__\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 763\u001b[0m stream \u001b[38;5;241m=\u001b[39m StringIO()\n\u001b[1;32m 764\u001b[0m printer \u001b[38;5;241m=\u001b[39m pretty\u001b[38;5;241m.\u001b[39mRepresentationPrinter(stream, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose,\n\u001b[1;32m 765\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_width, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnewline,\n\u001b[1;32m 766\u001b[0m max_seq_length\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmax_seq_length,\n\u001b[1;32m 767\u001b[0m singleton_pprinters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msingleton_printers,\n\u001b[1;32m 768\u001b[0m type_pprinters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtype_printers,\n\u001b[1;32m 769\u001b[0m deferred_pprinters\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdeferred_printers)\n\u001b[0;32m--> 770\u001b[0m \u001b[43mprinter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpretty\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 771\u001b[0m printer\u001b[38;5;241m.\u001b[39mflush()\n\u001b[1;32m 772\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m stream\u001b[38;5;241m.\u001b[39mgetvalue()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/lib/pretty.py:419\u001b[0m, in \u001b[0;36mRepresentationPrinter.pretty\u001b[0;34m(self, obj)\u001b[0m\n\u001b[1;32m 408\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m meth(obj, \u001b[38;5;28mself\u001b[39m, cycle)\n\u001b[1;32m 409\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28mcls\u001b[39m \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mobject\u001b[39m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# check if cls defines __repr__\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 417\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mcallable\u001b[39m(_safe_getattr(\u001b[38;5;28mcls\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__repr__\u001b[39m\u001b[38;5;124m\"\u001b[39m, 
\u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 418\u001b[0m ):\n\u001b[0;32m--> 419\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_repr_pprint\u001b[49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcycle\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m _default_pprint(obj, \u001b[38;5;28mself\u001b[39m, cycle)\n\u001b[1;32m 422\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/IPython/lib/pretty.py:794\u001b[0m, in \u001b[0;36m_repr_pprint\u001b[0;34m(obj, p, cycle)\u001b[0m\n\u001b[1;32m 792\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"A pprint that just redirects to the normal repr function.\"\"\"\u001b[39;00m\n\u001b[1;32m 793\u001b[0m \u001b[38;5;66;03m# Find newlines and replace them with p.break_()\u001b[39;00m\n\u001b[0;32m--> 794\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mrepr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mobj\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 795\u001b[0m lines \u001b[38;5;241m=\u001b[39m output\u001b[38;5;241m.\u001b[39msplitlines()\n\u001b[1;32m 796\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m p\u001b[38;5;241m.\u001b[39mgroup():\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/log_adapter.py:175\u001b[0m, in \u001b[0;36mmethod_logger..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 172\u001b[0m _call_stack\u001b[38;5;241m.\u001b[39mappend(full_method_name)\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mmethod\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\u001b[38;5;167;01mNotImplementedError\u001b[39;00m, \u001b[38;5;167;01mTypeError\u001b[39;00m) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[1;32m 177\u001b[0m \u001b[38;5;66;03m# Log method parameters that are implemented in pandas but either missing (TypeError)\u001b[39;00m\n\u001b[1;32m 178\u001b[0m \u001b[38;5;66;03m# or not fully supported (NotImplementedError) in BigFrames.\u001b[39;00m\n\u001b[1;32m 179\u001b[0m \u001b[38;5;66;03m# Logging is currently supported only when we can access the bqclient through\u001b[39;00m\n\u001b[1;32m 180\u001b[0m \u001b[38;5;66;03m# _block.session.bqclient.\u001b[39;00m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(_call_stack) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/dataframe.py:742\u001b[0m, in \u001b[0;36mDataFrame.__repr__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m formatter\u001b[38;5;241m.\u001b[39mrepr_query_job(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compute_dry_run())\n\u001b[1;32m 739\u001b[0m \u001b[38;5;66;03m# TODO(swast): pass max_columns and get the true column count back. Maybe\u001b[39;00m\n\u001b[1;32m 740\u001b[0m \u001b[38;5;66;03m# get 1 more column than we have requested so that pandas can add the\u001b[39;00m\n\u001b[1;32m 741\u001b[0m \u001b[38;5;66;03m# ... 
for us?\u001b[39;00m\n\u001b[0;32m--> 742\u001b[0m pandas_df, row_count, query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_block\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mretrieve_repr_request_results\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 743\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\n\u001b[1;32m 744\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 746\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_internal_query_job(query_job)\n\u001b[1;32m 748\u001b[0m column_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(pandas_df\u001b[38;5;241m.\u001b[39mcolumns)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/core/blocks.py:1515\u001b[0m, in \u001b[0;36mBlock.retrieve_repr_request_results\u001b[0;34m(self, max_results)\u001b[0m\n\u001b[1;32m 1513\u001b[0m \u001b[38;5;66;03m# head caches full underlying expression, so row_count will be free after\u001b[39;00m\n\u001b[1;32m 1514\u001b[0m executor \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\n\u001b[0;32m-> 1515\u001b[0m \u001b[43mexecutor\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcached\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1516\u001b[0m \u001b[43m \u001b[49m\u001b[43marray_value\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1517\u001b[0m \u001b[43m \u001b[49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mexecutors\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mCacheConfig\u001b[49m\u001b[43m(\u001b[49m\u001b[43moptimize_for\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mhead\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mif_cached\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mreuse-strict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1518\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1519\u001b[0m head_result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\u001b[38;5;241m.\u001b[39mexecute(\n\u001b[1;32m 1520\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39mslice(start\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m, stop\u001b[38;5;241m=\u001b[39mmax_results, step\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 1521\u001b[0m )\n\u001b[1;32m 1522\u001b[0m row_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msession\u001b[38;5;241m.\u001b[39m_executor\u001b[38;5;241m.\u001b[39mexecute(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexpr\u001b[38;5;241m.\u001b[39mrow_count())\u001b[38;5;241m.\u001b[39mto_py_scalar()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:363\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor.cached\u001b[0;34m(self, array_value, config)\u001b[0m\n\u001b[1;32m 361\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cache_with_session_awareness(array_value)\n\u001b[1;32m 362\u001b[0m 
\u001b[38;5;28;01melif\u001b[39;00m config\u001b[38;5;241m.\u001b[39moptimize_for \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhead\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[0;32m--> 363\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cache_with_offsets\u001b[49m\u001b[43m(\u001b[49m\u001b[43marray_value\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 364\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 365\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(config\u001b[38;5;241m.\u001b[39moptimize_for, executor\u001b[38;5;241m.\u001b[39mHierarchicalKey)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:475\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._cache_with_offsets\u001b[0;34m(self, array_value)\u001b[0m\n\u001b[1;32m 468\u001b[0m w_offsets, offset_column \u001b[38;5;241m=\u001b[39m array_value\u001b[38;5;241m.\u001b[39mpromote_offsets()\n\u001b[1;32m 469\u001b[0m compiled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mcompile\u001b[39m\u001b[38;5;241m.\u001b[39mcompile_sql(\n\u001b[1;32m 470\u001b[0m \u001b[38;5;28mcompile\u001b[39m\u001b[38;5;241m.\u001b[39mCompileRequest(\n\u001b[1;32m 471\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlogical_plan(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_substitute_large_local_sources(w_offsets\u001b[38;5;241m.\u001b[39mnode)),\n\u001b[1;32m 472\u001b[0m sort_rows\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 473\u001b[0m )\n\u001b[1;32m 474\u001b[0m )\n\u001b[0;32m--> 475\u001b[0m tmp_table_ref \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sql_as_cached_temp_table\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 476\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 477\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompiled\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msql_schema\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 478\u001b[0m \u001b[43m \u001b[49m\u001b[43mcluster_cols\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43moffset_column\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 479\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 480\u001b[0m tmp_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient\u001b[38;5;241m.\u001b[39mget_table(tmp_table_ref)\n\u001b[1;32m 481\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m compiled\u001b[38;5;241m.\u001b[39mrow_order \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:550\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._sql_as_cached_temp_table\u001b[0;34m(self, sql, schema, cluster_cols)\u001b[0m\n\u001b[1;32m 545\u001b[0m job_config \u001b[38;5;241m=\u001b[39m cast(\n\u001b[1;32m 546\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig,\n\u001b[1;32m 547\u001b[0m bigquery\u001b[38;5;241m.\u001b[39mQueryJobConfig\u001b[38;5;241m.\u001b[39mfrom_api_repr({}),\n\u001b[1;32m 548\u001b[0m )\n\u001b[1;32m 549\u001b[0m job_config\u001b[38;5;241m.\u001b[39mdestination \u001b[38;5;241m=\u001b[39m temp_table\n\u001b[0;32m--> 550\u001b[0m _, query_job 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_run_execute_query\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 551\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 552\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 553\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 554\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m query_job \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 555\u001b[0m query_job\u001b[38;5;241m.\u001b[39mresult()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/bq_caching_executor.py:392\u001b[0m, in \u001b[0;36mBigQueryCachingExecutor._run_execute_query\u001b[0;34m(self, sql, job_config, query_with_job)\u001b[0m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 390\u001b[0m \u001b[38;5;66;03m# Trick the type checker into thinking we got a literal.\u001b[39;00m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m query_with_job:\n\u001b[0;32m--> 392\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mbq_io\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstart_query_with_client\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 393\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mbqclient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 394\u001b[0m \u001b[43m \u001b[49m\u001b[43msql\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 395\u001b[0m \u001b[43m \u001b[49m\u001b[43mjob_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mjob_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 396\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetrics\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmetrics\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[43m \u001b[49m\u001b[43mproject\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 398\u001b[0m \u001b[43m \u001b[49m\u001b[43mlocation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 399\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 400\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_with_job\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 401\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 402\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 403\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m bq_io\u001b[38;5;241m.\u001b[39mstart_query_with_client(\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbqclient,\n\u001b[1;32m 405\u001b[0m sql,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 411\u001b[0m query_with_job\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 412\u001b[0m )\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/session/_io/bigquery/__init__.py:314\u001b[0m, in \u001b[0;36mstart_query_with_client\u001b[0;34m(bq_client, sql, job_config, location, project, timeout, metrics, 
query_with_job)\u001b[0m\n\u001b[1;32m 312\u001b[0m opts \u001b[38;5;241m=\u001b[39m bigframes\u001b[38;5;241m.\u001b[39moptions\u001b[38;5;241m.\u001b[39mdisplay\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m opts\u001b[38;5;241m.\u001b[39mprogress_bar \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m query_job\u001b[38;5;241m.\u001b[39mconfiguration\u001b[38;5;241m.\u001b[39mdry_run:\n\u001b[0;32m--> 314\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m \u001b[43mformatting_helpers\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait_for_query_job\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 315\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_job\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 316\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_bar\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mopts\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprogress_bar\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 317\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 319\u001b[0m results_iterator \u001b[38;5;241m=\u001b[39m query_job\u001b[38;5;241m.\u001b[39mresult()\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/bigframes/formatting_helpers.py:149\u001b[0m, in \u001b[0;36mwait_for_query_job\u001b[0;34m(query_job, max_results, page_size, progress_bar)\u001b[0m\n\u001b[1;32m 147\u001b[0m loading_bar \u001b[38;5;241m=\u001b[39m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job))\n\u001b[1;32m 148\u001b[0m display\u001b[38;5;241m.\u001b[39mdisplay(loading_bar, display_id\u001b[38;5;241m=\u001b[39mdisplay_id)\n\u001b[0;32m--> 149\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[43mquery_job\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mresult\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 150\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_results\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_results\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpage_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpage_size\u001b[49m\n\u001b[1;32m 151\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 152\u001b[0m query_job\u001b[38;5;241m.\u001b[39mreload()\n\u001b[1;32m 153\u001b[0m display\u001b[38;5;241m.\u001b[39mupdate_display(\n\u001b[1;32m 154\u001b[0m display\u001b[38;5;241m.\u001b[39mHTML(get_query_job_loading_html(query_job)),\n\u001b[1;32m 155\u001b[0m display_id\u001b[38;5;241m=\u001b[39mdisplay_id,\n\u001b[1;32m 156\u001b[0m )\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1681\u001b[0m, in \u001b[0;36mQueryJob.result\u001b[0;34m(self, page_size, max_results, retry, timeout, start_index, job_retry)\u001b[0m\n\u001b[1;32m 1676\u001b[0m remaining_timeout \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1678\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m remaining_timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1679\u001b[0m \u001b[38;5;66;03m# Since is_job_done() calls jobs.getQueryResults, which is a\u001b[39;00m\n\u001b[1;32m 1680\u001b[0m \u001b[38;5;66;03m# long-running API, don't delay the next request at all.\u001b[39;00m\n\u001b[0;32m-> 1681\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m 
\u001b[43mis_job_done\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[1;32m 1682\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 1683\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Use a monotonic clock since we don't actually care about\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;66;03m# daylight savings or similar, just the elapsed time.\u001b[39;00m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:293\u001b[0m, in \u001b[0;36mRetry.__call__..retry_wrapped_func\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 289\u001b[0m target \u001b[38;5;241m=\u001b[39m functools\u001b[38;5;241m.\u001b[39mpartial(func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 290\u001b[0m sleep_generator \u001b[38;5;241m=\u001b[39m exponential_sleep_generator(\n\u001b[1;32m 291\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_initial, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_maximum, multiplier\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_multiplier\n\u001b[1;32m 292\u001b[0m )\n\u001b[0;32m--> 293\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mretry_target\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 294\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 295\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_predicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 296\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep_generator\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 297\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_timeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 298\u001b[0m \u001b[43m \u001b[49m\u001b[43mon_error\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 299\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:153\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 149\u001b[0m \u001b[38;5;66;03m# pylint: disable=broad-except\u001b[39;00m\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# This function explicitly must deal with broad exceptions.\u001b[39;00m\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 152\u001b[0m \u001b[38;5;66;03m# defer to shared logic for handling errors\u001b[39;00m\n\u001b[0;32m--> 153\u001b[0m \u001b[43m_retry_error_helper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[43m \u001b[49m\u001b[43mexc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdeadline\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 156\u001b[0m \u001b[43m \u001b[49m\u001b[43msleep\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 157\u001b[0m \u001b[43m \u001b[49m\u001b[43merror_list\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 158\u001b[0m \u001b[43m \u001b[49m\u001b[43mpredicate\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 159\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mon_error\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 160\u001b[0m \u001b[43m \u001b[49m\u001b[43mexception_factory\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 161\u001b[0m \u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 162\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[38;5;66;03m# if exception not raised, sleep before next attempt\u001b[39;00m\n\u001b[1;32m 164\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(sleep)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_base.py:212\u001b[0m, in \u001b[0;36m_retry_error_helper\u001b[0;34m(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)\u001b[0m\n\u001b[1;32m 206\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m predicate_fn(exc):\n\u001b[1;32m 207\u001b[0m final_exc, source_exc \u001b[38;5;241m=\u001b[39m exc_factory_fn(\n\u001b[1;32m 208\u001b[0m error_list,\n\u001b[1;32m 209\u001b[0m RetryFailureReason\u001b[38;5;241m.\u001b[39mNON_RETRYABLE_ERROR,\n\u001b[1;32m 210\u001b[0m original_timeout,\n\u001b[1;32m 211\u001b[0m )\n\u001b[0;32m--> 212\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m final_exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01msource_exc\u001b[39;00m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m on_error_fn \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m on_error_fn(exc)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/retry/retry_unary.py:144\u001b[0m, in \u001b[0;36mretry_target\u001b[0;34m(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m sleep \u001b[38;5;129;01min\u001b[39;00m sleep_generator:\n\u001b[1;32m 143\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 144\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mtarget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39misawaitable(result):\n\u001b[1;32m 146\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(_ASYNC_RETRY_WARNING)\n", - "File \u001b[0;32m~/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/cloud/bigquery/job/query.py:1630\u001b[0m, in \u001b[0;36mQueryJob.result..is_job_done\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1607\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m job_failed_exception \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1608\u001b[0m \u001b[38;5;66;03m# Only try to restart the query job if the job failed for\u001b[39;00m\n\u001b[1;32m 1609\u001b[0m \u001b[38;5;66;03m# a retriable reason. 
For example, don't restart the query\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1627\u001b[0m \u001b[38;5;66;03m# into an exception that can be processed by the\u001b[39;00m\n\u001b[1;32m 1628\u001b[0m \u001b[38;5;66;03m# `job_retry` predicate.\u001b[39;00m\n\u001b[1;32m 1629\u001b[0m restart_query_job \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m-> 1630\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m job_failed_exception\n\u001b[1;32m 1631\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1632\u001b[0m \u001b[38;5;66;03m# Make sure that the _query_results are cached so we\u001b[39;00m\n\u001b[1;32m 1633\u001b[0m \u001b[38;5;66;03m# can return a complete RowIterator.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1639\u001b[0m \u001b[38;5;66;03m# making any extra API calls if the previous loop\u001b[39;00m\n\u001b[1;32m 1640\u001b[0m \u001b[38;5;66;03m# iteration fetched the finished job.\u001b[39;00m\n\u001b[1;32m 1641\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reload_query_results(\n\u001b[1;32m 1642\u001b[0m retry\u001b[38;5;241m=\u001b[39mretry, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mreload_query_results_kwargs\n\u001b[1;32m 1643\u001b[0m )\n", - "\u001b[0;31mNotFound\u001b[0m: 404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 310b7d0c-1369-4f83-b3ab-0627540e8c66\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0." - ] - }, - { - "data": { - "text/html": [ - "Query job 31d9c7a6-23a1-4a87-abf1-724f2f8995d5 is DONE. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "NotFound", - "evalue": "404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound, message: Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US\n\nLocation: US\nJob ID: 31d9c7a6-23a1-4a87-abf1-724f2f8995d5\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. 
-    {
-     "ename": "NotFound",
-     "evalue": "404 Not found: Table bigframes-dev:_8b037bfb7316dddf9d92b12dcf93e008906bfe52._641b89a4_f2cc_4eee_a38e_a79c7c92293b_bqdf_5b24fca6-6b41-4342-8733-6fd57a90e0e3 was not found in location US; reason: notFound\n\nLocation: US\nJob ID: 31d9c7a6-23a1-4a87-abf1-724f2f8995d5\n Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey. You are currently running BigFrames version 2.5.0.",
-     "output_type": "error",
-     "traceback": [
-      "[... DataFrame._repr_html_ traceback, otherwise identical to the frames elided above ...]"
-     ]
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.15"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/notebooks/test_blob_transcribe_long_audio_2p5.ipynb b/notebooks/test_blob_transcribe_long_audio_2p5.ipynb
deleted file mode 100644
index e156232b51..0000000000
--- a/notebooks/test_blob_transcribe_long_audio_2p5.ipynb
+++ /dev/null
@@ -1,271 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/_config/experiment_options.py:71: ApiDeprecationWarning: BigFrames Blob is in preview now. 
This flag is no longer needed.\n", - " warnings.warn(msg, category=bfe.ApiDeprecationWarning)\n" - ] - } - ], - "source": [ - "import bigframes\n", - "bigframes.options.experiments.blob = True\n", - "#bigframes.options._bigquery_options.client_endpoints_override = {\"bqclient\": \"https://test-bigquery.sandbox.google.com\", \n", - "# \"bqconnectionclient\": \"test-bigqueryconnection.sandbox.googleapis.com\", \n", - "# \"bqstoragereadclient\": \"test-bigquerystorage-grpc.sandbox.googleapis.com\"}\n", - "import bigframes.pandas as bpd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "Query job 9553647a-5a71-43af-902d-043df6a62387 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 7b3ca9c5-5664-4973-b062-3bb304730dd6 is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "df = bpd.from_glob_path(\"gs://shuowei_bucket/long_audio/*\", name=\"audio\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# copy files again, now we have 1000 audio files\n", - "#copies = [df] * 2 * 100\n", - "#df = bpd.concat(copies, ignore_index=True)\n", - "#df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# copy files again, now we have 1,000,000 audio files\n", - "#copies = [df] * 2 * 100\n", - "#df = bpd.concat(copies, ignore_index=True)\n", - "#df = df.cache()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job 848201eb-0342-40a4-8de8-621ad180184a is DONE. 0 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job 094c794c-9283-4a12-9c6a-c7aaa559e791 is DONE. 480 Bytes processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/array_value.py:109: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n", - "`db_dtypes` is a preview feature and subject to change.\n", - " warnings.warn(msg, bfe.PreviewWarning)\n" - ] - } - ], - "source": [ - "#bq_connection = \"bigframes-dev.us.bigframes-default-connection\"\n", - "df[\"text\"] = df[\"audio\"].blob.transcribe(df=df, audio_column=\"audio\", model_name=\"gemini-2.5-pro-preview-05-06\", verbose=True)\n", - "# gemini-2.0-flash-001" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "Query job ed3d84ac-2e1b-42ab-b867-58560e1a3167 is DONE. 8.8 kB processed. 
Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "Query job a65b6550-f250-4052-92e2-492ab58da692 is DONE. 8.8 kB processed. Open Job" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
audiotext
0uri: gs://shuowei_bucket/audio/ID014clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"I'll tell you something, when the first revelation Joseph Smith received about marriage of any kind was that you couldn't get anywhere, you into the celestial kingdom. No man could get into the celestial kingdom without a wife, because he's only half there if he hasn't got a wife. And no woman, of course, could get there without a husband either, because she's only half there. Well, now at that time, the Mormons were very strict, uh, uh, religious people, you see. Sure. Sure. And, uh, thought that a man had to live clean all his life as well as a woman did in order to be worthy to be a man. Well, this is still true. Well, it's still true certainly. But I said, in that those circumstances, at that time, there were, uh, about three times as many women that joined the church and were faithful Latter-day Saints than there were men. When Joseph Smith comes out with the revelation that, uh, marriage was for eternity. And if if you're going to be separated from death do you part, then you can't get into the celestial kingdom at all to become as God is. Now, they got that doctrine. And that was, the prophet said that, they believed that. What about these here two-thirds of the women? Yeah, that's a good question. What is going to become of us then? It won't do us any good to go out here and marry one of these scallywags that spends his time drinking and boozing around and mining and maybe works in the mine a while and does all kind of skullduggery. That won't do us any good to marry that kind of a man. Besides, we don't have a chance to raise a respectable family with that kind of a man. So, what are we going to do about it now? What's going to become of us? And Joseph Smith always prayed about every problem that came to him. And there's where he got the revelation that if they live thoroughly clean and faithful, if they can understand the purity and sacredness of sex, so they don't desecrate it, then one man can have a number of wives and and permit them to raise a respectable, honorable family as well as have those wives throughout eternity, have their husband to take them into the celestial kingdom. Now, there was the origin of polygamy.\"}
1uri: gs://shuowei_bucket/audio/HI009clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Uh, I hear that there's a Hawaiian custom of something called a hukilau. Hukilau. Hukilau is they have a rope and on the rope they have a net there. Mhm. They surround it a certain place and they drag the net. They surround it, you mean they carry the net out? Carry the net, uh-huh. Uh, how do they, is there something to keep the net floating? Yeah, some they have those floaters uh floating. Mhm. Some they have a lead down on the bottom, then they drag that uh net in. What do the floats, what are the floats made of? The float made of those uh some coconut uh dry coconut, I guess. Oh, dry coconut. Yeah. That's a good use for coconut. Uh, and that floats the top of the net. Top of the net. Now, the word lau means leaf, doesn't it? Lau, lau is a yeah, yeah, lau is a leaf. All right. Well now, the the where does the leaf come in in this hukilau? Hukilau, the leaves, you know, they stay on the top, top of the rope. So that the when they pull, the fish going to not to jump over the rope. The ropes, the the uh the leaves keep the fish from jumping over to the scared, they scared them away. They don't want to go underneath the water, see. When some moment she go underneath the water there, they will go to the net. They have a pocket on the net. They have a pocket on the net. Now the bottom of the net then. Bottom of the net they have a pocket in. Mhm. The water must have some kind of things to make it sink. Sink is a lead, they have a lead. Oh, they have lead on it. I see. Then uh uh somebody goes out and puts the net out around. Yeah, they tow it with a boat. With a boat? Two, two boats they goes out. Tow it all around. And then uh who pulls it in? Well, they have a lots of uh from the shore about 40, 50 guys they pull that rope up. Ah, I see. Yeah. And uh they must have a pretty big school of fish there. Yeah, once in a while they have a nice school, right? And then they pull them all in and the fish don't are scared to jump out so they get caught. No, they all in up on the net already. Mhm. They get caught in the net. I suppose the top of the net comes in sooner than the bottom so they if they do try to go out there, that's pocket. Yeah, they have to take on uh, you know, about six or seven guys is swimming on it around the net to they're watching how the fish going. Oh, I see. They they're chasing at the hole. Uh-huh. Well, about how much fish do they get out of a good haul of those? Once I saw up Waimea, they caught about about a ton, I guess, you know, those akule. Mhm. Oh, they have a nice crew up there. What's akule? Mm, the akule some kind of, you know, that's Hawaiian used to call akule, akule. They have been calling it Aji in Japan.\"}
2uri: gs://shuowei_bucket/audio/AK012clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"the soapstone carvings are those done, have you done any of those? Yes. Yeah. It's pretty much like ivory carving. Only it takes more care. What? To work on soapstone. Why? Why is it? Because it is brittle. Oh, it is. And very soft. Mhm. Uh, you you can hack on ivory, but you can't do that on soapstone. Chip too easily. Uh-huh. But then you have to use your files on those. Yes. And then once you've done the filing and so on, how do you smooth it down? Uh, we I use uh fine file. Mhm. To finish it up and then use uh sandpaper or emery cloth. Mhm. I think you said that um quite often the thing that is carved is determined by the shape of the piece of stone that one starts carving. Yes, yeah. Sometimes uh uh an ivory carver or a soapstone carver will take advantage of the shape of the stone. Mhm. And uh try to visualize what it'd look like. Uh, maybe a polar bear or or Mhm. He makes it just the way it is.\"}
3uri: gs://shuowei_bucket/audio/CA138clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"was rather interesting just to watch them gathering their materials and bouncing around. Yeah, they they are what they call it kangaroo walk or or something. Really? Uh, something like that. They named it that. I don't know. I bet those men are going to get quite a reception when they get back to earth. Oh, yes. I'll be so glad when they land back now, but I think that's uh pretty well uh fact because they've landed so many safely now that I I feel really relieved. Just getting off of the moon was the thing that was. Have they met with the um one that was circling? Yes, they've rendezvoused. So I understand. I that wasn't shown either, so I but uh they say they have rendezvoused, so that's a matter of making the circles and then coming down. What do you sort of imagine for the future? Do you imagine them sending a I think they will. I think they will will do some more exploring up there. Right. Positive, you know, to continue with this. Uh-huh. Because that was such a very small area when you think of it that they just gathered uh rocks and and uh oh samples of soil and all and they did uh probe for some samples. And just what is going to come of that, I don't know. I'd be glad to read it.\"}
4uri: gs://shuowei_bucket/audio/TX048clip.mp3, authorizer: bigframes-dev.us.bigframes-default-connection{'status': '', 'content': \"Say this is entered customs. And uh the next step is to prepare a Mexican customs entry. Mexico call them pedimentos. It's really an appraisal sheet. It's it's a breakdown of the sheet. You use the same information. That's right. But here you give them all the information, for example, let me see, I found one right here. You give them all the information on on what the invoice contains. Here's one. On what the invoice contains, what uh what what the material is itself. If you hear, for example, this is some bottling equipment going over there and on the appraisal sheet, so you tell them who it's for, who made the invoice out of the states. This came from St. Louis, Missouri. Yeah. How much the freight was. And on the reverse of this of this thing, but the everything has to be marked. So complete identification. This identification number is that traffic number I spoke to you a little while ago. Number of boxes, what they contain, the tariff duty, the tariff number, the ad valorem rate, and what the actual duties would be paid. For example, here would be 996 pesos and 73 cents. And as you can see, this is a form that has already been approved by the appraiser. These appraisers are called Vistas. It's very different from the Vistas we have nowadays here.\"}
\n", - "

5 rows × 2 columns

\n", - "
[5 rows x 2 columns in total]" - ], - "text/plain": [ - " audio \\\n", - "0 {'uri': 'gs://shuowei_bucket/audio/ID014clip.m... \n", - "1 {'uri': 'gs://shuowei_bucket/audio/HI009clip.m... \n", - "2 {'uri': 'gs://shuowei_bucket/audio/AK012clip.m... \n", - "3 {'uri': 'gs://shuowei_bucket/audio/CA138clip.m... \n", - "4 {'uri': 'gs://shuowei_bucket/audio/TX048clip.m... \n", - "\n", - " text \n", - "0 {'status': '', 'content': \"I'll tell you somet... \n", - "1 {'status': '', 'content': \"Uh, I hear that the... \n", - "2 {'status': '', 'content': \"the soapstone carvi... \n", - "3 {'status': '', 'content': \"was rather interest... \n", - "4 {'status': '', 'content': \"Say this is entered... \n", - "\n", - "[5 rows x 2 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/test_notebook.ipynb b/notebooks/test_notebook.ipynb deleted file mode 100644 index ce85a846de..0000000000 --- a/notebooks/test_notebook.ipynb +++ /dev/null @@ -1,58 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "bpd.options.bigquery.project = 'bigquery-public-data'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gender_filter = 'M'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "sql" - ] - }, - "outputs": [], - "source": [ - "%%sql\n", - "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013` WHERE gender = '{gender_filter}'" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tests/system/small/test_loc.py b/tests/system/small/test_loc.py deleted file mode 100644 index 2f0b9df31b..0000000000 --- a/tests/system/small/test_loc.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import pandas as pd -import pytest - -from bigframes.testing.utils import assert_pandas_df_equal - - -def test_loc_select_columns_w_repeats(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, ["string_col", "string_col"]].to_pandas() - pd_result = scalars_pandas_df_index.loc[:, ["string_col", "string_col"]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_select_rows_and_columns_w_repeats( - scalars_df_index, scalars_pandas_df_index -): - bf_result = scalars_df_index.loc[ - [2, 3, 2], ["string_col", "string_col"] - ].to_pandas() - pd_result = scalars_pandas_df_index.loc[[2, 3, 2], ["string_col", "string_col"]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_slice_rows_and_select_columns_w_repeats( - scalars_df_index, scalars_pandas_df_index -): - bf_result = scalars_df_index.loc[2:5, ["string_col", "string_col"]].to_pandas() - pd_result = scalars_pandas_df_index.loc[2:5, ["string_col", "string_col"]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_bool_series(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[scalars_df_index["bool_col"]].to_pandas() - pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index["bool_col"]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_list_select_rows_and_columns(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[[2, 3], ["string_col", "int64_col"]].to_pandas() - pd_result = scalars_pandas_df_index.loc[[2, 3], ["string_col", "int64_col"]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_select_column(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[:, "string_col"].to_pandas() - pd_result = scalars_pandas_df_index.loc[:, "string_col"] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_select_with_column_condition(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[ - scalars_df_index["bool_col"], "string_col" - ].to_pandas() - pd_result = scalars_pandas_df_index.loc[ - scalars_pandas_df_index["bool_col"], "string_col" - ] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_select_with_column_condition_bf_series( - scalars_df_index, scalars_pandas_df_index -): - bf_result = scalars_df_index.loc[ - scalars_df_index["bool_col"], scalars_df_index.columns.to_series() - ].to_pandas() - pd_result = scalars_pandas_df_index.loc[ - scalars_pandas_df_index["bool_col"], - scalars_pandas_df_index.columns.to_series(), - ] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_single_index_with_duplicate(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.set_index("int64_col").loc[2].to_pandas() - pd_result = scalars_pandas_df_index.set_index("int64_col").loc[2] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_single_index_no_duplicate(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.set_index("int64_col").loc[6].to_pandas() - pd_result = scalars_pandas_df_index.set_index("int64_col").loc[6] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_setitem_slice_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[2:5, "int64_col"] = 99 - pd_df.loc[2:5, "int64_col"] = 99 - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -def test_loc_setitem_slice_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = 
scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_series = bf_df["int64_col"] * 2 - pd_series = pd_df["int64_col"] * 2 - bf_df.loc[2:5, "int64_col"] = bf_series - pd_df.loc[2:5, "int64_col"] = pd_series - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -def test_loc_setitem_list_scalar(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[[2, 5], "int64_col"] = 99 - pd_df.loc[[2, 5], "int64_col"] = 99 - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -def test_loc_setitem_list_series(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_series = bf_df["int64_col"] * 2 - pd_series = pd_df["int64_col"] * 2 - bf_df.loc[[2, 5], "int64_col"] = bf_series - pd_df.loc[[2, 5], "int64_col"] = pd_series - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -@pytest.mark.parametrize( - ("col", "value"), - [ - ("new_col", 99), - ("int64_col", -1), - ("string_col", "new_string"), - ("date_col", pd.Timestamp("2024-01-01")), - ], -) -def test_loc_setitem_bool_series_scalar(scalars_dfs, col, value): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - bf_df.loc[bf_df["bool_col"], col] = value - pd_df.loc[pd_df["bool_col"], col] = value - assert_pandas_df_equal(bf_df.to_pandas(), pd_df) - - -def test_loc_setitem_bool_series_scalar_error(scalars_dfs): - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.copy() - pd_df = scalars_pandas_df.copy() - with pytest.raises(TypeError): - bf_df.loc[bf_df["bool_col"], "int64_col"] = "incompatible_string" - with pytest.raises(TypeError): - pd_df.loc[pd_df["bool_col"], "int64_col"] = "incompatible_string" - - -def test_loc_list_string_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.set_index("string_col").loc[["cat", "dog"]].to_pandas() - pd_result = scalars_pandas_df_index.set_index("string_col").loc[["cat", "dog"]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_list_integer_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[[2, 3]].to_pandas() - pd_result = scalars_pandas_df_index.loc[[2, 3]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_list_multiindex(scalars_dfs): - # TODO: supply a reason why this isn't compatible with pandas 1.x - pytest.importorskip("pandas", minversion="2.0.0") - scalars_df, scalars_pandas_df = scalars_dfs - bf_df = scalars_df.set_index(["string_col", "int64_col"]) - pd_df = scalars_pandas_df.set_index(["string_col", "int64_col"]) - bf_result = bf_df.loc[[("cat", 2), ("dog", 2)]].to_pandas() - pd_result = pd_df.loc[[("cat", 2), ("dog", 2)]] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_bf_series_string_index(scalars_df_index, scalars_pandas_df_index): - bf_result = ( - scalars_df_index.set_index("string_col") - .loc[scalars_df_index["string_col"]] - .to_pandas() - ) - pd_result = scalars_pandas_df_index.set_index("string_col").loc[ - scalars_pandas_df_index["string_col"] - ] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_bf_series_multiindex(scalars_df_index, scalars_pandas_df_index): - bf_df = scalars_df_index.set_index(["string_col", "int64_col"]) - pd_df = scalars_pandas_df_index.set_index(["string_col", "int64_col"]) - bf_result = bf_df.loc[bf_df.index.to_series()].to_pandas() - pd_result = pd_df.loc[pd_df.index.to_series()] - assert_pandas_df_equal(bf_result, 
pd_result) - - -def test_loc_bf_index_integer_index(scalars_df_index, scalars_pandas_df_index): - bf_result = scalars_df_index.loc[scalars_df_index.index].to_pandas() - pd_result = scalars_pandas_df_index.loc[scalars_pandas_df_index.index] - assert_pandas_df_equal(bf_result, pd_result) - - -def test_loc_bf_index_integer_index_renamed_col( - scalars_df_index, scalars_pandas_df_index -): - bf_df = scalars_df_index.rename(columns={"int64_col": "new_name"}) - pd_df = scalars_pandas_df_index.rename(columns={"int64_col": "new_name"}) - bf_result = bf_df.loc[bf_df.index].to_pandas() - pd_result = pd_df.loc[pd_df.index] - assert_pandas_df_equal(bf_result, pd_result) diff --git a/tests/unit/ml/test_utils.py b/tests/unit/ml/test_utils.py deleted file mode 100644 index 9b273cb716..0000000000 --- a/tests/unit/ml/test_utils.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from bigframes.ml import utils - - -@pytest.mark.parametrize( - ("input", "expected"), - [ - ("STRING", "string"), - ("str", "string"), - ("Integer", "int64"), - ("int64", "int64"), - ("boolean", "bool"), - ("bool", "bool"), - ("float", "float64"), - ("float64", "float64"), - ], -) -def test_standardize_type(input, expected): - assert utils.standardize_type(input) == expected From 39cf5954a1823e08a67c6457402033aecbef685e Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 20:25:38 +0000 Subject: [PATCH 45/53] revert 1 files to match main branch --- bigframes/bigquery/_operations/ai.py | 7 -- bigframes/core/compile/polars/compiler.py | 30 ++----- bigframes/dataframe.py | 36 +-------- bigframes/display/anywidget.py | 9 +-- bigframes/ml/llm.py | 11 +-- bigframes/operations/output_schemas.py | 5 -- bigframes/series.py | 1 - tests/system/small/test_dataframe.py | 12 --- tests/system/small/test_series.py | 1 + tests/unit/test_dataframe.py | 23 ------ tests/unit/test_polars_compiler.py | 95 ----------------------- 11 files changed, 13 insertions(+), 217 deletions(-) delete mode 100644 tests/unit/test_polars_compiler.py diff --git a/bigframes/bigquery/_operations/ai.py b/bigframes/bigquery/_operations/ai.py index 07f81d87f5..8579f7f298 100644 --- a/bigframes/bigquery/_operations/ai.py +++ b/bigframes/bigquery/_operations/ai.py @@ -123,13 +123,6 @@ def generate( if output_schema is None: output_schema_str = None else: - # Validate output schema types - for col_name, col_type in output_schema.items(): - if col_type.upper() == "JSON": - raise ValueError( - "JSON type is not supported in output_schema. " - "Supported types are: STRING, INT64, FLOAT64, BOOL, ARRAY, and STRUCT." 
- ) output_schema_str = ", ".join( [f"{name} {sql_type}" for name, sql_type in output_schema.items()] ) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index e939f80120..d48ddba0cc 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -45,13 +45,13 @@ polars_installed = True if TYPE_CHECKING: import polars as pl - import pyarrow as pa else: try: import bigframes._importing + # Use import_polars() instead of importing directly so that we check + # the version numbers. pl = bigframes._importing.import_polars() - import pyarrow as pa except Exception: polars_installed = False @@ -427,21 +427,6 @@ def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: assert isinstance(op, json_ops.JSONDecode) return input.str.json_decode(_DTYPE_MAPPING[op.to_type]) - @compile_op.register(json_ops.ToJSONString) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - # Convert JSON to string representation - return input.cast(pl.String()) - - @compile_op.register(json_ops.ParseJSON) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - # In Polars, JSON is stored as string, so no decoding needed - return input - - @compile_op.register(json_ops.JSONExtract) - def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: - assert isinstance(op, json_ops.JSONExtract) - return input.str.json_path_match(op.json_path) - @compile_op.register(arr_ops.ToArrayOp) def _(self, op: ops.ToArrayOp, *inputs: pl.Expr) -> pl.Expr: return pl.concat_list(*inputs) @@ -621,14 +606,9 @@ def compile_readlocal(self, node: nodes.ReadLocalNode): scan_item.source_id: scan_item.id.sql for scan_item in node.scan_list.items } - - if hasattr(node.local_data_source, "to_arrow"): - schema, batches = node.local_data_source.to_arrow(json_type="string") - arrow_data = pa.Table.from_batches(batches, schema) - else: - arrow_data = node.local_data_source.data - - lazy_frame = cast(pl.DataFrame, pl.from_arrow(arrow_data)).lazy() + lazy_frame = cast( + pl.DataFrame, pl.from_arrow(node.local_data_source.data) + ).lazy() lazy_frame = lazy_frame.select(cols_to_read.keys()).rename(cols_to_read) if node.offsets_col: lazy_frame = lazy_frame.with_columns( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f3b78e8218..f016fddd83 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,8 +783,7 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - - # anywidget mode uses the same display logic as the "deferred" mode + # anywdiget mode uses the same display logic as the "deferred" mode # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) @@ -856,21 +855,6 @@ def _repr_html_(self) -> str: from bigframes import display - # The anywidget frontend doesn't support the db_dtypes JSON type, so - # convert to strings for display. - json_cols = [ - series_name - for series_name, series in df.items() - if bigframes.dtypes.contains_db_dtypes_json_dtype(series.dtype) - ] - if json_cols: - warnings.warn( - "Converting JSON columns to strings for display. " - "This is temporary and will be removed when the frontend supports JSON types." 
- ) - for col in json_cols: - df[col] = df[col]._apply_unary_op(ops.json_ops.ToJSONString()) - # Always create a new widget instance for each display call # This ensures that each cell gets its own widget and prevents # unintended sharing between cells @@ -878,6 +862,7 @@ def _repr_html_(self) -> str: ipython_display(widget) return "" # Return empty string since we used display() + except (AttributeError, ValueError, ImportError): # Fallback if anywidget is not available warnings.warn( @@ -1963,22 +1948,7 @@ def _to_pandas_batches( *, allow_large_results: Optional[bool] = None, ) -> blocks.PandasBatches: - # Workaround for PyArrow bug https://github.com/apache/arrow/issues/45262 - # JSON columns are not supported in to_pandas_batches - json_cols = [ - str(col_name) # Cast to string - for col_name, dtype in self.dtypes.items() - if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype) - ] - - df = self - if json_cols: - # Convert JSON columns to strings before materialization - df = df.copy() - for col in json_cols: - df[col] = df[col].astype("string") - - return df._block.to_pandas_batches( + return self._block.to_pandas_batches( page_size=page_size, max_results=max_results, allow_large_results=allow_large_results, diff --git a/bigframes/display/anywidget.py b/bigframes/display/anywidget.py index 8930c611e9..a0b4f809d8 100644 --- a/bigframes/display/anywidget.py +++ b/bigframes/display/anywidget.py @@ -209,12 +209,6 @@ def _reset_batches_for_new_page_size(self) -> None: def _set_table_html(self) -> None: """Sets the current html data based on the current page and page size.""" - # For empty dataframe, render empty table with headers. - if self.row_count == 0: - page_data = self._cached_data - else: - start = self.page * self.page_size - end = start + self.page_size if self._error_message: self.table_html = ( f"
{self._error_message}
" @@ -256,5 +250,8 @@ def _page_size_changed(self, _change: Dict[str, Any]) -> None: # Reset the page to 0 when page size changes to avoid invalid page states self.page = 0 + # Reset batches to use new page size for future data fetching + self._reset_batches_for_new_page_size() + # Update the table display self._set_table_html() diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index edede34e8f..531a043c45 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -731,17 +731,8 @@ def predict( "ground_with_google_search": ground_with_google_search, } if output_schema: - supported_dtypes = ( - "int64", - "float64", - "bool", - "string", - "array", - "struct", - ) output_schema = { - k: utils.standardize_type(v, supported_dtypes=supported_dtypes) - for k, v in output_schema.items() + k: utils.standardize_type(v) for k, v in output_schema.items() } options["output_schema"] = output_schema return self._predict_and_retry( diff --git a/bigframes/operations/output_schemas.py b/bigframes/operations/output_schemas.py index 2a72d4f48f..ff9c9883dc 100644 --- a/bigframes/operations/output_schemas.py +++ b/bigframes/operations/output_schemas.py @@ -14,8 +14,6 @@ import pyarrow as pa -from bigframes import dtypes - def parse_sql_type(sql: str) -> pa.DataType: """ @@ -45,9 +43,6 @@ def parse_sql_type(sql: str) -> pa.DataType: if sql.upper() == "BOOL": return pa.bool_() - if sql.upper() == "JSON": - return dtypes.JSON_ARROW_TYPE - if sql.upper().startswith("ARRAY<") and sql.endswith(">"): inner_type = sql[len("ARRAY<") : -1] return pa.list_(parse_sql_type(inner_type)) diff --git a/bigframes/series.py b/bigframes/series.py index 5177bd0f33..ef0da32dfc 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -610,7 +610,6 @@ def astype( if errors not in ["raise", "null"]: raise ValueError("Argument 'errors' must be one of 'raise' or 'null'") dtype = bigframes.dtypes.bigframes_type(dtype) - return self._apply_unary_op( bigframes.operations.AsTypeOp(to_type=dtype, safe=(errors == "null")) ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index a0c0e41a1b..79f8efd00f 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -6142,15 +6142,3 @@ def test_agg_with_dict_containing_non_existing_col_raise_key_error(scalars_dfs): with pytest.raises(KeyError): bf_df.agg(agg_funcs) - - -def test_to_pandas_batches_with_json_columns(session): - """Test that JSON columns are properly handled in to_pandas_batches.""" - # Create a DataFrame with JSON column - df = session.read_gbq('SELECT JSON \'{"key": "value"}\' as json_col') - - # This should not raise an error - batches = df._to_pandas_batches(page_size=10) - next(batches) - - # TODO diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 63c2f6c498..5ace3f54d8 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -4077,6 +4077,7 @@ def test_json_astype_others(data, to_type, errors): pytest.param(["10.2", None], dtypes.INT_DTYPE, id="to_int"), pytest.param(["false", None], dtypes.FLOAT_DTYPE, id="to_float"), pytest.param(["10.2", None], dtypes.BOOL_DTYPE, id="to_bool"), + pytest.param(["true", None], dtypes.STRING_DTYPE, id="to_string"), ], ) def test_json_astype_others_raise_error(data, to_type): diff --git a/tests/unit/test_dataframe.py b/tests/unit/test_dataframe.py index f223d69a76..2326f2595b 100644 --- a/tests/unit/test_dataframe.py +++ b/tests/unit/test_dataframe.py @@ -181,26 +181,3 @@ def 
test_dataframe_ai_property_future_warning( with pytest.warns(FutureWarning): dataframe.ai - - -@pytest.fixture() -def json_df(polars_session: bigframes.session.Session) -> bigframes.dataframe.DataFrame: - """Create a DataFrame with a JSON column for testing.""" - import bigframes.dtypes - - pandas_df = pd.DataFrame( - { - "a": [1], - "b": ['{"c": 2, "d": 3}'], - } - ) - pandas_df["b"] = pandas_df["b"].astype(bigframes.dtypes.JSON_DTYPE) - return polars_session.read_pandas(pandas_df) - - -def test_to_pandas_batches_with_json_column(json_df: bigframes.dataframe.DataFrame): - """Test that JSON columns are converted to strings in to_pandas_batches.""" - batches = list(json_df._to_pandas_batches(page_size=10)) - assert len(batches) > 0 - # Verify the JSON column is now string type - assert batches[0]["b"].dtype == pd.StringDtype(storage="pyarrow") diff --git a/tests/unit/test_polars_compiler.py b/tests/unit/test_polars_compiler.py deleted file mode 100644 index 95be7d5d00..0000000000 --- a/tests/unit/test_polars_compiler.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright 2025 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pandas as pd -import pytest - -try: - import polars as pl - - POLARS_INSTALLED = True -except ImportError: - POLARS_INSTALLED = False - -if not POLARS_INSTALLED: - pytest.skip("polars is not installed", allow_module_level=True) - - -import bigframes as bf -import bigframes.core.compile.polars.compiler as polars_compiler -import bigframes.core.nodes as nodes -import bigframes.operations.json_ops as json_ops - - -def test_polars_to_json_string(): - """Test ToJSONString operation in Polars compiler.""" - compiler = polars_compiler.PolarsExpressionCompiler() - op = json_ops.ToJSONString() - # Polars doesn't have a native JSON type, it uses strings. - # The operation is a cast to string. 
- input_expr = pl.lit('{"b": 2}', dtype=pl.String) - result = compiler.compile_op(op, input_expr) - - df = pl.DataFrame({"a": ['{"b": 2}']}).lazy() - result_df = df.with_columns(result.alias("b")).collect() - assert result_df["b"][0] == '{"b": 2}' - assert result_df["b"].dtype == pl.String - - -def test_polars_parse_json(): - """Test ParseJSON operation in Polars compiler.""" - compiler = polars_compiler.PolarsExpressionCompiler() - op = json_ops.ParseJSON() - input_expr = pl.lit('{"b": 2}', dtype=pl.String) - result = compiler.compile_op(op, input_expr) - - df = pl.DataFrame({"a": ['{"b": 2}']}).lazy() - result_df = df.with_columns(result.alias("b")).collect() - # The result of json_decode is a struct - assert isinstance(result_df["b"][0], dict) - assert result_df["b"][0]["b"] == 2 - - -def test_polars_json_extract(): - """Test JSONExtract operation in Polars compiler.""" - compiler = polars_compiler.PolarsExpressionCompiler() - op = json_ops.JSONExtract(json_path="$.b") - input_expr = pl.lit('{"a": 1, "b": "hello"}', dtype=pl.String) - result = compiler.compile_op(op, input_expr) - - df = pl.DataFrame({"a": ['{"a": 1, "b": "hello"}']}).lazy() - result_df = df.with_columns(result.alias("b")).collect() - # json_path_match returns the raw string value - assert result_df["b"][0] == "hello" - - -def test_readlocal_with_json_column(polars_session): - """Test ReadLocalNode compilation with JSON columns.""" - pandas_df = pd.DataFrame({"data": ['{"key": "value"}']}) - pandas_df["data"] = pandas_df["data"].astype(bf.dtypes.JSON_DTYPE) - bf_df = polars_session.read_pandas(pandas_df) - - node = bf_df._block.expr.node - # Traverse the node tree to find the ReadLocalNode - while not isinstance(node, nodes.ReadLocalNode): - node = node.child - assert isinstance(node, nodes.ReadLocalNode) - - compiler = polars_compiler.PolarsCompiler() - lazy_frame = compiler.compile_node(node) - result_df = lazy_frame.collect() - - # The compiler should have converted the JSON column to string. 
- assert result_df.schema["column_0"] == pl.String - assert result_df["column_0"][0] == '{"key":"value"}' From 8c3451266c28ec0da6dd57c4f9929ae68a593574 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 21:46:08 +0000 Subject: [PATCH 46/53] Correctly display DataFrames with JSON columns in anywidget --- bigframes/core/blocks.py | 48 +++++++-- bigframes/dataframe.py | 2 - bigframes/session/executor.py | 34 +++++++ mypy.ini | 3 + notebooks/dataframes/anywidget_mode.ipynb | 119 ++++++++++++++++++++-- 5 files changed, 189 insertions(+), 17 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1900b7208a..2dc9d7d898 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -43,6 +43,7 @@ import warnings import bigframes_vendored.constants as constants +import db_dtypes import google.cloud.bigquery as bigquery import numpy import pandas as pd @@ -134,6 +135,21 @@ class MaterializationOptions: ordered: bool = True +def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType: + """Recursively replace JSONArrowType with string type.""" + if isinstance(pa_type, db_dtypes.JSONArrowType): + return pa.string() + if isinstance(pa_type, pa.ListType): + return pa.list_(_replace_json_arrow_with_string(pa_type.value_type)) + if isinstance(pa_type, pa.StructType): + new_fields = [ + field.with_type(_replace_json_arrow_with_string(field.type)) + for field in pa_type + ] + return pa.struct(new_fields) + return pa_type + + class Block: """A immutable 2D data structure.""" @@ -715,12 +731,32 @@ def to_pandas_batches( # To reduce the number of edge cases to consider when working with the # results of this, always return at least one DataFrame. See: # b/428918844. - empty_val = pd.DataFrame( - { - col: pd.Series([], dtype=self.expr.get_column_type(col)) - for col in itertools.chain(self.value_columns, self.index_columns) - } - ) + series_map = {} + for col in itertools.chain(self.value_columns, self.index_columns): + dtype = self.expr.get_column_type(col) + if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype): + # Due to a limitation in Apache Arrow (#45262), JSON columns are not + # natively supported by the to_pandas_batches() method, which is + # used by the anywidget backend. + # Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + # PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType, + # especially when nested. + # Create with string type and then cast. + + # MyPy doesn't automatically narrow the type of 'dtype' here, + # so we add an explicit check. + if isinstance(dtype, pd.ArrowDtype): + safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype) + safe_dtype = pd.ArrowDtype(safe_pa_type) + series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype) + else: + # This branch should ideally not be reached if + # contains_db_dtypes_json_dtype is accurate, + # but it's here for MyPy's sake. 
+ series_map[col] = pd.Series([], dtype=dtype) + else: + series_map[col] = pd.Series([], dtype=dtype) + empty_val = pd.DataFrame(series_map) dfs = map( lambda a: a[0], itertools.zip_longest( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f016fddd83..c954c8eebc 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,8 +783,6 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index d0cfe5f4f7..97ad7f5bb8 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: result_rows = 0 for batch in self._arrow_batches: + # Convert JSON columns to strings before casting + batch = self._convert_json_to_string(batch) batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow()) result_rows += batch.num_rows @@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: yield batch + def _convert_json_to_string( + self, batch: pyarrow.RecordBatch + ) -> pyarrow.RecordBatch: + """Convert JSON arrow extension types to string to avoid PyArrow compatibility issues.""" + import logging + + new_arrays = [] + new_fields = [] + + for i, field in enumerate(batch.schema): + array = batch.column(i) + + # Check if this column should be JSON based on our schema + schema_item = next( + (item for item in self.schema.items if item.column == field.name), None + ) + + if schema_item and schema_item.dtype == bigframes.dtypes.JSON_DTYPE: + logging.info(f"Converting JSON column: {field.name}") + # Convert JSONArrowType to string + if array.type == bigframes.dtypes.JSON_ARROW_TYPE: + array = array.cast(pyarrow.string()) + new_fields.append(pyarrow.field(field.name, pyarrow.string())) + else: + new_fields.append(field) + + new_arrays.append(array) + + return pyarrow.RecordBatch.from_arrays( + new_arrays, schema=pyarrow.schema(new_fields) + ) + def to_arrow_table(self) -> pyarrow.Table: # Need to provide schema if no result rows, as arrow can't infer # If ther are rows, it is safest to infer schema from batches. diff --git a/mypy.ini b/mypy.ini index 7709eb200a..1fbca2498a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -44,3 +44,6 @@ ignore_missing_imports = True [mypy-anywidget] ignore_missing_imports = True + +[mypy-db_dtypes] +ignore_missing_imports = True diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c2af915721..347f57566a 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -35,7 +35,16 @@ "execution_count": 2, "id": "ca22f059", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). 
Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n", + " warnings.warn(message, FutureWarning)\n" + ] + } + ], "source": [ "import bigframes.pandas as bpd" ] @@ -142,9 +151,9 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aafd4f912b5f42e0896aa5f0c2c62620", + "model_id": "473b016aa6b24c86aafc6372352e822d", "version_major": 2, - "version_minor": 0 + "version_minor": 1 }, "text/plain": [ "TableWidget(page_size=10, row_count=5552452, table_html='" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a6d61e48cca642b7a57e6431359b4cc4", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", + " *\n", + " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + " LIMIT 5;\n", + "\"\"\")" + ] } ], "metadata": { From 317358210c72c7c91e5583337248ad6fb152667f Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 21:54:47 +0000 Subject: [PATCH 47/53] add mis-deleted comment back --- bigframes/dataframe.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c954c8eebc..f016fddd83 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,6 +783,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) From 9c109621aad4fc1aab947567f749643f669d71b4 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 23:22:43 +0000 Subject: [PATCH 48/53] revert unnecessary change --- bigframes/session/executor.py | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 97ad7f5bb8..d0cfe5f4f7 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -52,8 +52,6 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: result_rows = 0 for batch in self._arrow_batches: - # Convert JSON columns to strings before casting - batch = self._convert_json_to_string(batch) batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow()) result_rows += batch.num_rows @@ -69,38 +67,6 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: yield batch - def _convert_json_to_string( - self, batch: pyarrow.RecordBatch - ) -> pyarrow.RecordBatch: - """Convert JSON arrow extension types to string to avoid PyArrow compatibility issues.""" - import logging - - new_arrays = [] - new_fields = [] - - for i, field in enumerate(batch.schema): - array = batch.column(i) - - # Check if this column should be JSON based on our schema - schema_item = next( - (item for item in self.schema.items if item.column == field.name), None - ) - - if schema_item and schema_item.dtype == bigframes.dtypes.JSON_DTYPE: - logging.info(f"Converting JSON column: {field.name}") - # Convert JSONArrowType to string - if array.type == bigframes.dtypes.JSON_ARROW_TYPE: - array = array.cast(pyarrow.string()) - new_fields.append(pyarrow.field(field.name, pyarrow.string())) - else: - new_fields.append(field) - - new_arrays.append(array) - - return pyarrow.RecordBatch.from_arrays( - new_arrays, schema=pyarrow.schema(new_fields) - ) - def to_arrow_table(self) -> pyarrow.Table: # Need to provide schema if no result rows, as arrow can't infer # If ther are rows, it is safest to infer schema from batches. 
From cc6dd641f82981c376ab0b434309b0bdea83e06b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 23:35:51 +0000 Subject: [PATCH 49/53] move helper function to dtypes.py --- bigframes/core/blocks.py | 20 ++------- bigframes/dtypes.py | 15 +++++++ notebooks/dataframes/anywidget_mode.ipynb | 55 +++++++++++++++-------- 3 files changed, 54 insertions(+), 36 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2dc9d7d898..f359000d3d 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -43,7 +43,6 @@ import warnings import bigframes_vendored.constants as constants -import db_dtypes import google.cloud.bigquery as bigquery import numpy import pandas as pd @@ -135,21 +134,6 @@ class MaterializationOptions: ordered: bool = True -def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType: - """Recursively replace JSONArrowType with string type.""" - if isinstance(pa_type, db_dtypes.JSONArrowType): - return pa.string() - if isinstance(pa_type, pa.ListType): - return pa.list_(_replace_json_arrow_with_string(pa_type.value_type)) - if isinstance(pa_type, pa.StructType): - new_fields = [ - field.with_type(_replace_json_arrow_with_string(field.type)) - for field in pa_type - ] - return pa.struct(new_fields) - return pa_type - - class Block: """A immutable 2D data structure.""" @@ -746,7 +730,9 @@ def to_pandas_batches( # MyPy doesn't automatically narrow the type of 'dtype' here, # so we add an explicit check. if isinstance(dtype, pd.ArrowDtype): - safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype) + safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string( + dtype.pyarrow_dtype + ) safe_dtype = pd.ArrowDtype(safe_pa_type) series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype) else: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 6c05b6f4a3..2a7db7f86e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -954,6 +954,21 @@ def contains_db_dtypes_json_dtype(dtype): return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype) +def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType: + """Recursively replace JSONArrowType with string type.""" + if isinstance(pa_type, db_dtypes.JSONArrowType): + return pa.string() + if isinstance(pa_type, pa.ListType): + return pa.list_(_replace_json_arrow_with_string(pa_type.value_type)) + if isinstance(pa_type, pa.StructType): + new_fields = [ + field.with_type(_replace_json_arrow_with_string(field.type)) + for field in pa_type + ] + return pa.struct(new_fields) + return pa_type + + def warn_on_db_dtypes_json_dtype(dtypes): """Warn that the JSON dtype is changing. diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c5e2584cc4..3b99bbeae7 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -151,7 +151,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "93dd10072d564a02a0278817d14855a9", + "model_id": "47795eaa10f149aeb99574232c0936eb", "version_major": 2, "version_minor": 1 }, @@ -214,7 +214,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6e2538d446e344ac8505e4706730243e", + "model_id": "8354ce0f82d3495a9b630dfc362f73ee", "version_major": 2, "version_minor": 1 }, @@ -222,8 +222,9 @@ "TableWidget(page_size=10, row_count=5552452, table_html='
SQL
+       "SELECT\n",
+       "`state` AS `state`,\n",
+       "`gender` AS `gender`,\n",
+       "`year` AS `year`,\n",
+       "`name` AS `name`,\n",
+       "`number` AS `number`\n",
+       "FROM\n",
+       "(SELECT\n",
+       "  *\n",
+       "FROM (\n",
+       "  SELECT\n",
+       "    `state`,\n",
+       "    `gender`,\n",
+       "    `year`,\n",
+       "    `name`,\n",
+       "    `number`\n",
+       "  FROM `bigquery-public-data.usa_names.usa_1910_2013` FOR SYSTEM_TIME AS OF TIMESTAMP('2025-10-30T21:48:48.979701+00:00')\n",
+       ") AS `t0`)\n",
+       "ORDER BY `name` ASC NULLS LAST ,`year` ASC NULLS LAST ,`state` ASC NULLS LAST\n",
+       "LIMIT 5\n",
       " "
      ],
      "text/plain": [
@@ -313,7 +333,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "d6faf367ea5d44ad9d275506d870557a",
+       "model_id": "59461286a17d4a42b6be6d9d9c7bf7e3",
        "version_major": 2,
        "version_minor": 1
       },
       "text/plain": [
        "TableWidget(page_size=10, row_count=5, table_html='"
@@ -391,9 +408,9 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "b6d6f3bacc2c43fc9a335e6039db12a5",
+       "model_id": "d1794b42579542a8980bd158e521bd3e",
        "version_major": 2,
-       "version_minor": 0
+       "version_minor": 1
       },
       "text/plain": [
        "TableWidget(page_size=10, row_count=5, table_html='
Date: Thu, 30 Oct 2025 23:38:12 +0000
Subject: [PATCH 50/53] revert unnecessary testcase change

---
 tests/system/small/test_anywidget.py | 198 +++----------------------
 1 file changed, 37 insertions(+), 161 deletions(-)

diff --git a/tests/system/small/test_anywidget.py b/tests/system/small/test_anywidget.py
index 0587e13916..8944ee5365 100644
--- a/tests/system/small/test_anywidget.py
+++ b/tests/system/small/test_anywidget.py
@@ -12,18 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import annotations
-
-from unittest import mock
 
 import pandas as pd
 import pytest
 
 import bigframes as bf
 
+pytest.importorskip("anywidget")
+
 # Test constants to avoid change detector tests
 EXPECTED_ROW_COUNT = 6
 EXPECTED_PAGE_SIZE = 2
+EXPECTED_TOTAL_PAGES = 3
 
 
 @pytest.fixture(scope="module")
@@ -62,7 +62,8 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame):
     Helper fixture to create a TableWidget instance with a fixed page size.
     This reduces duplication across tests that use the same widget configuration.
     """
-    from bigframes.display.anywidget import TableWidget
+
+    from bigframes.display import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
         # Delay context manager cleanup of `max_rows` until after tests finish.
@@ -91,7 +92,7 @@ def small_bf_df(
 @pytest.fixture
 def small_widget(small_bf_df):
     """Helper fixture for tests using a DataFrame smaller than the page size."""
-    from bigframes.display.anywidget import TableWidget
+    from bigframes.display import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5):
         yield TableWidget(small_bf_df)
@@ -110,19 +111,21 @@ def empty_bf_df(
     return session.read_pandas(empty_pandas_df)
 
 
-@pytest.fixture(scope="module")
-def json_df(session: bf.Session) -> bf.dataframe.DataFrame:
-    """Create a DataFrame with a JSON column for testing."""
-    import bigframes.dtypes
+def mock_execute_result_with_params(
+    self, schema, total_rows_val, arrow_batches_val, *args, **kwargs
+):
+    """
+    Mocks an execution result with configurable total_rows and arrow_batches.
+    """
+    from bigframes.session.executor import ExecuteResult
 
-    pandas_df = pd.DataFrame(
-        {
-            "a": [1],
-            "b": ['{"c": 2, "d": 3}'],
-        }
+    return ExecuteResult(
+        iter(arrow_batches_val),
+        schema=schema,
+        query_job=None,
+        total_bytes=None,
+        total_rows=total_rows_val,
     )
-    pandas_df["b"] = pandas_df["b"].astype(bigframes.dtypes.JSON_DTYPE)
-    return session.read_pandas(pandas_df)
 
 
 def _assert_html_matches_pandas_slice(
@@ -151,11 +154,10 @@ def test_widget_initialization_should_calculate_total_row_count(
     paginated_bf_df: bf.dataframe.DataFrame,
 ):
     """A TableWidget should correctly calculate the total row count on creation."""
-    from bigframes.display.anywidget import TableWidget
+    from bigframes.display import TableWidget
 
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
         widget = TableWidget(paginated_bf_df)
-        widget = TableWidget(paginated_bf_df)
 
     assert widget.row_count == EXPECTED_ROW_COUNT
 
@@ -266,7 +268,7 @@ def test_widget_pagination_should_work_with_custom_page_size(
     A widget should paginate correctly with a custom page size of 3.
""" with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 3): - from bigframes.display.anywidget import TableWidget + from bigframes.display import TableWidget widget = TableWidget(paginated_bf_df) assert widget.page_size == 3 @@ -312,7 +314,7 @@ def test_widget_page_size_should_be_immutable_after_creation( by subsequent changes to global options. """ with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2): - from bigframes.display.anywidget import TableWidget + from bigframes.display import TableWidget widget = TableWidget(paginated_bf_df) assert widget.page_size == 2 @@ -331,7 +333,7 @@ def test_widget_page_size_should_be_immutable_after_creation( def test_empty_widget_should_have_zero_row_count(empty_bf_df: bf.dataframe.DataFrame): """Given an empty DataFrame, the widget's row count should be 0.""" with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display.anywidget import TableWidget + from bigframes.display import TableWidget widget = TableWidget(empty_bf_df) @@ -341,7 +343,7 @@ def test_empty_widget_should_have_zero_row_count(empty_bf_df: bf.dataframe.DataF def test_empty_widget_should_render_table_headers(empty_bf_df: bf.dataframe.DataFrame): """Given an empty DataFrame, the widget should still render table headers.""" with bf.option_context("display.repr_mode", "anywidget"): - from bigframes.display.anywidget import TableWidget + from bigframes.display import TableWidget widget = TableWidget(empty_bf_df) @@ -436,112 +438,21 @@ def test_setting_page_size_above_max_should_be_clamped(table_widget): assert table_widget.page_size == expected_clamped_size -@mock.patch("bigframes.display.TableWidget") -def test_sql_anywidget_mode(mock_table_widget, session: bf.Session): - """ - Test that a SQL query runs in anywidget mode. - """ - sql = "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_current` LIMIT 5" - - with bf.option_context("display.repr_mode", "anywidget"): - df = session.read_gbq(sql) - # In a real environment, this would display a widget. - # For testing, we just want to make sure we're in the anywidget code path. - df._repr_html_() - mock_table_widget.assert_called_once() - - -@mock.patch("IPython.display.display") -def test_struct_column_anywidget_mode(mock_display, session: bf.Session): - """ - Test that a DataFrame with a STRUCT column is displayed in anywidget mode - and does not fall back to the deferred representation. This confirms that - anywidget can handle complex types without raising an exception that would - trigger the fallback mechanism. - """ - pandas_df = pd.DataFrame( - { - "a": [1], - "b": [{"c": 2, "d": 3}], - } - ) - bf_df = session.read_pandas(pandas_df) - - with bf.option_context("display.repr_mode", "anywidget"): - with mock.patch( - "bigframes.dataframe.formatter.repr_query_job" - ) as mock_repr_query_job: - # Trigger the display logic. - result = bf_df._repr_html_() - - # Assert that we did NOT fall back to the deferred representation. - mock_repr_query_job.assert_not_called() - - widget = mock_display.call_args[0][0] - from bigframes.display.anywidget import TableWidget - - assert isinstance(widget, TableWidget) - - # Assert that the widget's html contains the struct - html = widget.table_html - assert "{'c': 2, 'd': 3}" in html - - # Assert that _repr_html_ returns an empty string - assert result == "" - - def test_widget_creation_should_load_css_for_rendering(table_widget): """ - Test that the widget's CSS is loaded correctly. 
+    Given a TableWidget is created, when its resources are accessed,
+    it should contain the CSS content required for styling.
     """
-    css_content = table_widget._css
-    assert ".bigframes-widget .footer" in css_content
+    # The table_widget fixture creates the widget.
+    # No additional setup is needed.
 
+    # Access the CSS content.
+    css_content = table_widget._css
 
-@mock.patch("IPython.display.display")
-def test_json_column_anywidget_mode(mock_display, json_df: bf.dataframe.DataFrame):
-    """
-    Test that a DataFrame with a JSON column is displayed in anywidget mode
-    by converting JSON to string, and does not fall back to deferred representation.
-    """
-    with bf.option_context("display.repr_mode", "anywidget"):
-        with mock.patch(
-            "bigframes.dataframe.formatter.repr_query_job"
-        ) as mock_repr_query_job:
-            result = json_df._repr_html_()
-
-            # Assert no fallback
-            mock_repr_query_job.assert_not_called()
-
-            # Assert TableWidget was created and displayed
-            mock_display.assert_called_once()
-            widget = mock_display.call_args[0][0]
-            from bigframes.display.anywidget import TableWidget
-
-            assert isinstance(widget, TableWidget)
-
-            # Assert JSON was converted to string in the HTML
-            html = widget.table_html
-            assert "{&quot;c&quot;:2,&quot;d&quot;:3}" in html
-
-            assert result == ""
-
-
-def mock_execute_result_with_params(
-    self, schema, total_rows_val, arrow_batches_val, *args, **kwargs
-):
-    """
-    Mocks an execution result with configurable total_rows and arrow_batches.
-    """
-    from bigframes.session.executor import ExecuteResult
-
-    return ExecuteResult(
-        iter(arrow_batches_val),
-        schema=schema,
-        query_job=None,
-        total_bytes=None,
-        total_rows=total_rows_val,
-    )
+    # The content is a non-empty string containing a known selector.
+    assert isinstance(css_content, str)
+    assert len(css_content) > 0
+    assert ".bigframes-widget .footer" in css_content
 
 
 def test_widget_row_count_should_be_immutable_after_creation(
@@ -552,7 +463,7 @@ def test_widget_row_count_should_be_immutable_after_creation(
     options are changed later, the widget's original row_count should remain
     unchanged.
     """
-    from bigframes.display.anywidget import TableWidget
+    from bigframes.display import TableWidget
 
     # Use a context manager to ensure the option is reset
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
@@ -592,7 +503,7 @@ def test_widget_should_fallback_to_zero_rows_with_invalid_total_rows(
 
     # Create the TableWidget under the error condition.
     with bf.option_context("display.repr_mode", "anywidget"):
-        from bigframes.display.anywidget import TableWidget
+        from bigframes.display import TableWidget
 
         # The widget should handle the faulty data from the mock without crashing.
         widget = TableWidget(paginated_bf_df)
@@ -611,7 +522,7 @@ def test_widget_row_count_reflects_actual_data_available(
     Test that widget row_count reflects the actual data available,
     regardless of theoretical limits.
     """
-    from bigframes.display.anywidget import TableWidget
+    from bigframes.display import TableWidget
 
     # Set up display options that define a page size.
     with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
@@ -626,38 +537,3 @@ def test_widget_row_count_reflects_actual_data_available(
 # TODO(shuowei): Add tests for custom index and multiindex
 # This may not be necessary for the SQL Cell use case but should be
 # considered for completeness.
-
-
-@pytest.fixture(scope="module")
-def empty_json_df(session: bf.Session) -> bf.dataframe.DataFrame:
-    """Create an empty DataFrame with a JSON column for testing."""
-    import bigframes.dtypes
-
-    pandas_df = pd.DataFrame(
-        {
-            "a": pd.Series(dtype="int64"),
-            "b": pd.Series(dtype=bigframes.dtypes.JSON_DTYPE),
-        }
-    )
-    return session.read_pandas(pandas_df)
-
-
-def test_empty_widget_with_json_column(empty_json_df: bf.dataframe.DataFrame):
-    """Given an empty DataFrame with a JSON column, the widget should render table headers."""
-    with bf.option_context("display.repr_mode", "anywidget"):
-        from bigframes.display.anywidget import TableWidget
-
-        widget = TableWidget(empty_json_df)
-        html = widget.table_html
-
-        assert widget.row_count == 0
-        assert "

Date: Thu, 30 Oct 2025 23:55:53 +0000
Subject: [PATCH 51/53] Improve JSON type handling for to_gbq and to_pandas_batches

---
 bigframes/core/blocks.py                |  6 ++
 tests/system/small/test_dataframe_io.py | 77 +++++++++++++++++++++++++
 2 files changed, 83 insertions(+)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index f359000d3d..3c2b45d193 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -715,6 +715,12 @@ def to_pandas_batches(
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
+        empty_val = pd.DataFrame(
+            {
+                col: pd.Series([], dtype=self.expr.get_column_type(col))
+                for col in itertools.chain(self.value_columns, self.index_columns)
+            }
+        )
         series_map = {}
         for col in itertools.chain(self.value_columns, self.index_columns):
             dtype = self.expr.get_column_type(col)
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 96d7881d67..400af791e8 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -376,6 +376,83 @@ def test_to_pandas_batches_w_empty_dataframe(session):
     pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)
 
 
+def test_to_pandas_batches_w_empty_dataframe_json_in_list(session):
+    """Tests to_pandas_batches() with an empty DataFrame containing a list of JSON.
+
+    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
+    """
+    import db_dtypes
+
+    json_list_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType()))
+    empty_df_with_json_list = bpd.DataFrame(
+        {
+            "idx": pd.Series([], dtype="Int64"),
+            "json_list_col": pd.Series([], dtype=json_list_dtype),
+        },
+        session=session,
+    ).set_index("idx", drop=True)
+
+    results = list(empty_df_with_json_list.to_pandas_batches())
+
+    assert len(results) == 1
+    assert list(results[0].columns) == ["json_list_col"]
+    assert results[0].dtypes["json_list_col"] == json_list_dtype
+    assert len(results[0]) == 0
+
+
+# --- Behavior 2: JSON in Struct ---
+
+
+def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session):
+    """Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON.
+
+    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
+    """
+    import db_dtypes
+
+    json_struct_dtype = pd.ArrowDtype(
+        pa.struct([("json_field", db_dtypes.JSONArrowType())])
+    )
+    empty_df_with_json_struct = bpd.DataFrame(
+        {
+            "idx": pd.Series([], dtype="Int64"),
+            "json_struct_col": pd.Series([], dtype=json_struct_dtype),
+        },
+        session=session,
+    ).set_index("idx", drop=True)
+
+    results = list(empty_df_with_json_struct.to_pandas_batches())
+
+    assert len(results) == 1
+    assert list(results[0].columns) == ["json_struct_col"]
+    assert results[0].dtypes["json_struct_col"] == json_struct_dtype
+    assert len(results[0]) == 0
+
+
+# --- Behavior 3: Simple JSON ---
+
+
+def test_to_pandas_batches_w_empty_dataframe_simple_json(session):
+    """Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column.
+
+    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
+    """
+    empty_df_with_json = bpd.DataFrame(
+        {
+            "idx": pd.Series([], dtype="Int64"),
+            "json_col": pd.Series([], dtype=dtypes.JSON_DTYPE),
+        },
+        session=session,
+    ).set_index("idx", drop=True)
+
+    results = list(empty_df_with_json.to_pandas_batches())
+
+    assert len(results) == 1
+    assert list(results[0].columns) == ["json_col"]
+    assert results[0].dtypes["json_col"] == dtypes.JSON_DTYPE
+    assert len(results[0]) == 0
+
+
 @pytest.mark.parametrize("allow_large_results", (True, False))
 def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
     """Verify to_pandas_batches() API returns the expected page size.

From 21134258858d1210d8a9393a59a5eb9d469d585e Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Fri, 31 Oct 2025 00:10:49 +0000
Subject: [PATCH 52/53] Remove unnecessary comment

---
 tests/system/small/test_dataframe_io.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 400af791e8..944fd27e6c 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -400,9 +400,6 @@ def test_to_pandas_batches_w_empty_dataframe_json_in_list(session):
     assert len(results[0]) == 0
 
 
-# --- Behavior 2: JSON in Struct ---
-
-
 def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session):
     """Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON.
 
@@ -429,9 +426,6 @@ def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session):
     assert len(results[0]) == 0
 
 
-# --- Behavior 3: Simple JSON ---
-
-
 def test_to_pandas_batches_w_empty_dataframe_simple_json(session):
     """Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column.
From e06a9f45c3b1b590506334492127d97bae7ca78a Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Tue, 4 Nov 2025 20:17:30 +0000
Subject: [PATCH 53/53] Revert bigframes/dtypes.py and mypy.ini to main branch version

---
 bigframes/dtypes.py | 15 ---------------
 mypy.ini            |  3 ---
 2 files changed, 18 deletions(-)

diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 37a7c150ca..29e1be1ace 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -972,21 +972,6 @@ def contains_db_dtypes_json_dtype(dtype):
     return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
 
 
-def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
-    """Recursively replace JSONArrowType with string type."""
-    if isinstance(pa_type, db_dtypes.JSONArrowType):
-        return pa.string()
-    if isinstance(pa_type, pa.ListType):
-        return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
-    if isinstance(pa_type, pa.StructType):
-        new_fields = [
-            field.with_type(_replace_json_arrow_with_string(field.type))
-            for field in pa_type
-        ]
-        return pa.struct(new_fields)
-    return pa_type
-
-
 def warn_on_db_dtypes_json_dtype(dtypes):
     """Warn that the JSON dtype is changing.
 
diff --git a/mypy.ini b/mypy.ini
index 1fbca2498a..7709eb200a 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -44,6 +44,3 @@ ignore_missing_imports = True
 
 [mypy-anywidget]
 ignore_missing_imports = True
-
-[mypy-db_dtypes]
-ignore_missing_imports = True
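Taken together, the JSON-handling patches leave `to_pandas_batches()` with a simple contract: even an empty result yields exactly one empty pandas DataFrame whose dtypes, JSON included, match the BigQuery DataFrames schema. A minimal sketch of that contract, mirroring the regression tests above (assumes a default BigQuery session is available):

    import pandas as pd

    import bigframes.dtypes as dtypes
    import bigframes.pandas as bpd

    empty = bpd.DataFrame(
        {
            "idx": pd.Series([], dtype="Int64"),
            "json_col": pd.Series([], dtype=dtypes.JSON_DTYPE),
        }
    ).set_index("idx", drop=True)

    batches = list(empty.to_pandas_batches())

    assert len(batches) == 1  # always at least one (possibly empty) batch
    assert len(batches[0]) == 0
    assert batches[0].dtypes["json_col"] == dtypes.JSON_DTYPE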