From 8c3451266c28ec0da6dd57c4f9929ae68a593574 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 21:46:08 +0000 Subject: [PATCH 01/15] Correctly display DataFrames with JSON columns in anywidget --- bigframes/core/blocks.py | 48 +++++++-- bigframes/dataframe.py | 2 - bigframes/session/executor.py | 34 +++++++ mypy.ini | 3 + notebooks/dataframes/anywidget_mode.ipynb | 119 ++++++++++++++++++++-- 5 files changed, 189 insertions(+), 17 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 1900b7208a..2dc9d7d898 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -43,6 +43,7 @@ import warnings import bigframes_vendored.constants as constants +import db_dtypes import google.cloud.bigquery as bigquery import numpy import pandas as pd @@ -134,6 +135,21 @@ class MaterializationOptions: ordered: bool = True +def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType: + """Recursively replace JSONArrowType with string type.""" + if isinstance(pa_type, db_dtypes.JSONArrowType): + return pa.string() + if isinstance(pa_type, pa.ListType): + return pa.list_(_replace_json_arrow_with_string(pa_type.value_type)) + if isinstance(pa_type, pa.StructType): + new_fields = [ + field.with_type(_replace_json_arrow_with_string(field.type)) + for field in pa_type + ] + return pa.struct(new_fields) + return pa_type + + class Block: """A immutable 2D data structure.""" @@ -715,12 +731,32 @@ def to_pandas_batches( # To reduce the number of edge cases to consider when working with the # results of this, always return at least one DataFrame. See: # b/428918844. - empty_val = pd.DataFrame( - { - col: pd.Series([], dtype=self.expr.get_column_type(col)) - for col in itertools.chain(self.value_columns, self.index_columns) - } - ) + series_map = {} + for col in itertools.chain(self.value_columns, self.index_columns): + dtype = self.expr.get_column_type(col) + if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype): + # Due to a limitation in Apache Arrow (#45262), JSON columns are not + # natively supported by the to_pandas_batches() method, which is + # used by the anywidget backend. + # Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + # PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType, + # especially when nested. + # Create with string type and then cast. + + # MyPy doesn't automatically narrow the type of 'dtype' here, + # so we add an explicit check. + if isinstance(dtype, pd.ArrowDtype): + safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype) + safe_dtype = pd.ArrowDtype(safe_pa_type) + series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype) + else: + # This branch should ideally not be reached if + # contains_db_dtypes_json_dtype is accurate, + # but it's here for MyPy's sake. 
+ series_map[col] = pd.Series([], dtype=dtype) + else: + series_map[col] = pd.Series([], dtype=dtype) + empty_val = pd.DataFrame(series_map) dfs = map( lambda a: a[0], itertools.zip_longest( diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index f016fddd83..c954c8eebc 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,8 +783,6 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows - # anywdiget mode uses the same display logic as the "deferred" mode - # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index d0cfe5f4f7..97ad7f5bb8 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -52,6 +52,8 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: result_rows = 0 for batch in self._arrow_batches: + # Convert JSON columns to strings before casting + batch = self._convert_json_to_string(batch) batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow()) result_rows += batch.num_rows @@ -67,6 +69,38 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: yield batch + def _convert_json_to_string( + self, batch: pyarrow.RecordBatch + ) -> pyarrow.RecordBatch: + """Convert JSON arrow extension types to string to avoid PyArrow compatibility issues.""" + import logging + + new_arrays = [] + new_fields = [] + + for i, field in enumerate(batch.schema): + array = batch.column(i) + + # Check if this column should be JSON based on our schema + schema_item = next( + (item for item in self.schema.items if item.column == field.name), None + ) + + if schema_item and schema_item.dtype == bigframes.dtypes.JSON_DTYPE: + logging.info(f"Converting JSON column: {field.name}") + # Convert JSONArrowType to string + if array.type == bigframes.dtypes.JSON_ARROW_TYPE: + array = array.cast(pyarrow.string()) + new_fields.append(pyarrow.field(field.name, pyarrow.string())) + else: + new_fields.append(field) + + new_arrays.append(array) + + return pyarrow.RecordBatch.from_arrays( + new_arrays, schema=pyarrow.schema(new_fields) + ) + def to_arrow_table(self) -> pyarrow.Table: # Need to provide schema if no result rows, as arrow can't infer # If ther are rows, it is safest to infer schema from batches. diff --git a/mypy.ini b/mypy.ini index 7709eb200a..1fbca2498a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -44,3 +44,6 @@ ignore_missing_imports = True [mypy-anywidget] ignore_missing_imports = True + +[mypy-db_dtypes] +ignore_missing_imports = True diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index c2af915721..347f57566a 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -35,7 +35,16 @@ "execution_count": 2, "id": "ca22f059", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). 
Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n", + " warnings.warn(message, FutureWarning)\n" + ] + } + ], "source": [ "import bigframes.pandas as bpd" ] @@ -142,9 +151,9 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "aafd4f912b5f42e0896aa5f0c2c62620", + "model_id": "473b016aa6b24c86aafc6372352e822d", "version_major": 2, - "version_minor": 0 + "version_minor": 1 }, "text/plain": [ "TableWidget(page_size=10, row_count=5552452, table_html='" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "✅ Completed. " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "a6d61e48cca642b7a57e6431359b4cc4", + "version_major": 2, + "version_minor": 1 + }, + "text/plain": [ + "TableWidget(page_size=10, row_count=5, table_html='
(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", + " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", + " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", + " *\n", + " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + " LIMIT 5;\n", + "\"\"\")" + ] } ], "metadata": { From 05e9b6955125b051c2024bff274d5c2eaaf8e24b Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Thu, 30 Oct 2025 23:55:53 +0000 Subject: [PATCH 02/15] Improve JSON type handling for to_gbq and to_pandas_batches --- bigframes/core/blocks.py | 10 +++- bigframes/dtypes.py | 15 +++++ tests/system/small/test_dataframe_io.py | 77 +++++++++++++++++++++++++ 3 files changed, 101 insertions(+), 1 deletion(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2dc9d7d898..b21b122134 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -731,6 +731,12 @@ def to_pandas_batches( # To reduce the number of edge cases to consider when working with the # results of this, always return at least one DataFrame. See: # b/428918844. + empty_val = pd.DataFrame( + { + col: pd.Series([], dtype=self.expr.get_column_type(col)) + for col in itertools.chain(self.value_columns, self.index_columns) + } + ) series_map = {} for col in itertools.chain(self.value_columns, self.index_columns): dtype = self.expr.get_column_type(col) @@ -746,7 +752,9 @@ def to_pandas_batches( # MyPy doesn't automatically narrow the type of 'dtype' here, # so we add an explicit check. if isinstance(dtype, pd.ArrowDtype): - safe_pa_type = _replace_json_arrow_with_string(dtype.pyarrow_dtype) + safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string( + dtype.pyarrow_dtype + ) safe_dtype = pd.ArrowDtype(safe_pa_type) series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype) else: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 6c05b6f4a3..2a7db7f86e 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -954,6 +954,21 @@ def contains_db_dtypes_json_dtype(dtype): return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype) +def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType: + """Recursively replace JSONArrowType with string type.""" + if isinstance(pa_type, db_dtypes.JSONArrowType): + return pa.string() + if isinstance(pa_type, pa.ListType): + return pa.list_(_replace_json_arrow_with_string(pa_type.value_type)) + if isinstance(pa_type, pa.StructType): + new_fields = [ + field.with_type(_replace_json_arrow_with_string(field.type)) + for field in pa_type + ] + return pa.struct(new_fields) + return pa_type + + def warn_on_db_dtypes_json_dtype(dtypes): """Warn that the JSON dtype is changing. diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 96d7881d67..400af791e8 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -376,6 +376,83 @@ def test_to_pandas_batches_w_empty_dataframe(session): pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes) +def test_to_pandas_batches_w_empty_dataframe_json_in_list(session): + """Tests to_pandas_batches() with an empty DataFrame containing a list of JSON. 
+ + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + """ + import db_dtypes + + json_list_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType())) + empty_df_with_json_list = bpd.DataFrame( + { + "idx": pd.Series([], dtype="Int64"), + "json_list_col": pd.Series([], dtype=json_list_dtype), + }, + session=session, + ).set_index("idx", drop=True) + + results = list(empty_df_with_json_list.to_pandas_batches()) + + assert len(results) == 1 + assert list(results[0].columns) == ["json_list_col"] + assert results[0].dtypes["json_list_col"] == json_list_dtype + assert len(results[0]) == 0 + + +# --- Behavior 2: JSON in Struct --- + + +def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session): + """Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + """ + import db_dtypes + + json_struct_dtype = pd.ArrowDtype( + pa.struct([("json_field", db_dtypes.JSONArrowType())]) + ) + empty_df_with_json_struct = bpd.DataFrame( + { + "idx": pd.Series([], dtype="Int64"), + "json_struct_col": pd.Series([], dtype=json_struct_dtype), + }, + session=session, + ).set_index("idx", drop=True) + + results = list(empty_df_with_json_struct.to_pandas_batches()) + + assert len(results) == 1 + assert list(results[0].columns) == ["json_struct_col"] + assert results[0].dtypes["json_struct_col"] == json_struct_dtype + assert len(results[0]) == 0 + + +# --- Behavior 3: Simple JSON --- + + +def test_to_pandas_batches_w_empty_dataframe_simple_json(session): + """Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column. + + Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + """ + empty_df_with_json = bpd.DataFrame( + { + "idx": pd.Series([], dtype="Int64"), + "json_col": pd.Series([], dtype=dtypes.JSON_DTYPE), + }, + session=session, + ).set_index("idx", drop=True) + + results = list(empty_df_with_json.to_pandas_batches()) + + assert len(results) == 1 + assert list(results[0].columns) == ["json_col"] + assert results[0].dtypes["json_col"] == dtypes.JSON_DTYPE + assert len(results[0]) == 0 + + @pytest.mark.parametrize("allow_large_results", (True, False)) def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results): """Verify to_pandas_batches() APIs returns the expected page size. From aa04bac44924009f5526067995c15c900c696dfa Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 00:08:53 +0000 Subject: [PATCH 03/15] Revert "Correctly display DataFrames with JSON columns in anywidget" This reverts commit 8c3451266c28ec0da6dd57c4f9929ae68a593574. 
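
The executor-level JSON-to-string conversion is no longer needed: the JSON
handling kept in bigframes/core/blocks.py (updated in the previous commit to
use bigframes.dtypes._replace_json_arrow_with_string) already covers empty
JSON columns in to_pandas_batches(). A minimal sketch of that retained
workaround, assuming a nested JSON dtype such as list<JSON> (this mirrors the
blocks.py code rather than quoting it):

    import db_dtypes
    import pandas as pd
    import pyarrow as pa

    # The JSON-bearing dtype pyarrow cannot build an empty array for directly.
    json_list = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType()))
    # Swap JSONArrowType for its string storage type, build empty, cast back.
    safe = pd.ArrowDtype(pa.list_(pa.string()))
    empty = pd.Series([], dtype=safe).astype(json_list)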
--- bigframes/core/blocks.py | 16 --- bigframes/dataframe.py | 2 + bigframes/session/executor.py | 34 ------- mypy.ini | 3 - notebooks/dataframes/anywidget_mode.ipynb | 119 ++-------------------- 5 files changed, 11 insertions(+), 163 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b21b122134..3c2b45d193 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -43,7 +43,6 @@ import warnings import bigframes_vendored.constants as constants -import db_dtypes import google.cloud.bigquery as bigquery import numpy import pandas as pd @@ -135,21 +134,6 @@ class MaterializationOptions: ordered: bool = True -def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType: - """Recursively replace JSONArrowType with string type.""" - if isinstance(pa_type, db_dtypes.JSONArrowType): - return pa.string() - if isinstance(pa_type, pa.ListType): - return pa.list_(_replace_json_arrow_with_string(pa_type.value_type)) - if isinstance(pa_type, pa.StructType): - new_fields = [ - field.with_type(_replace_json_arrow_with_string(field.type)) - for field in pa_type - ] - return pa.struct(new_fields) - return pa_type - - class Block: """A immutable 2D data structure.""" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c954c8eebc..f016fddd83 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -783,6 +783,8 @@ def __repr__(self) -> str: opts = bigframes.options.display max_results = opts.max_rows + # anywdiget mode uses the same display logic as the "deferred" mode + # for faster execution if opts.repr_mode in ("deferred", "anywidget"): return formatter.repr_query_job(self._compute_dry_run()) diff --git a/bigframes/session/executor.py b/bigframes/session/executor.py index 97ad7f5bb8..d0cfe5f4f7 100644 --- a/bigframes/session/executor.py +++ b/bigframes/session/executor.py @@ -52,8 +52,6 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: result_rows = 0 for batch in self._arrow_batches: - # Convert JSON columns to strings before casting - batch = self._convert_json_to_string(batch) batch = pyarrow_utils.cast_batch(batch, self.schema.to_pyarrow()) result_rows += batch.num_rows @@ -69,38 +67,6 @@ def arrow_batches(self) -> Iterator[pyarrow.RecordBatch]: yield batch - def _convert_json_to_string( - self, batch: pyarrow.RecordBatch - ) -> pyarrow.RecordBatch: - """Convert JSON arrow extension types to string to avoid PyArrow compatibility issues.""" - import logging - - new_arrays = [] - new_fields = [] - - for i, field in enumerate(batch.schema): - array = batch.column(i) - - # Check if this column should be JSON based on our schema - schema_item = next( - (item for item in self.schema.items if item.column == field.name), None - ) - - if schema_item and schema_item.dtype == bigframes.dtypes.JSON_DTYPE: - logging.info(f"Converting JSON column: {field.name}") - # Convert JSONArrowType to string - if array.type == bigframes.dtypes.JSON_ARROW_TYPE: - array = array.cast(pyarrow.string()) - new_fields.append(pyarrow.field(field.name, pyarrow.string())) - else: - new_fields.append(field) - - new_arrays.append(array) - - return pyarrow.RecordBatch.from_arrays( - new_arrays, schema=pyarrow.schema(new_fields) - ) - def to_arrow_table(self) -> pyarrow.Table: # Need to provide schema if no result rows, as arrow can't infer # If ther are rows, it is safest to infer schema from batches. 
diff --git a/mypy.ini b/mypy.ini index 1fbca2498a..7709eb200a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -44,6 +44,3 @@ ignore_missing_imports = True [mypy-anywidget] ignore_missing_imports = True - -[mypy-db_dtypes] -ignore_missing_imports = True diff --git a/notebooks/dataframes/anywidget_mode.ipynb b/notebooks/dataframes/anywidget_mode.ipynb index 347f57566a..c2af915721 100644 --- a/notebooks/dataframes/anywidget_mode.ipynb +++ b/notebooks/dataframes/anywidget_mode.ipynb @@ -35,16 +35,7 @@ "execution_count": 2, "id": "ca22f059", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/venv/lib/python3.10/site-packages/google/api_core/_python_version_support.py:266: FutureWarning: You are using a Python version (3.10.15) which Google will stop supporting in new releases of google.api_core once it reaches its end of life (2026-10-04). Please upgrade to the latest Python version, or at least Python 3.11, to continue receiving updates for google.api_core past that date.\n", - " warnings.warn(message, FutureWarning)\n" - ] - } - ], + "outputs": [], "source": [ "import bigframes.pandas as bpd" ] @@ -151,9 +142,9 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "473b016aa6b24c86aafc6372352e822d", + "model_id": "aafd4f912b5f42e0896aa5f0c2c62620", "version_major": 2, - "version_minor": 1 + "version_minor": 0 }, "text/plain": [ "TableWidget(page_size=10, row_count=5552452, table_html='
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:969: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "✅ Completed. " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a6d61e48cca642b7a57e6431359b4cc4", - "version_major": 2, - "version_minor": 1 - }, - "text/plain": [ - "TableWidget(page_size=10, row_count=5, table_html='
(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", - " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", - " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", - " *\n", - " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", - " LIMIT 5;\n", - "\"\"\")" - ] } ], "metadata": { From 592e43b128ffdf58c133e904afcde1172b69ef52 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 00:10:49 +0000 Subject: [PATCH 04/15] Remove unnecessary comment --- tests/system/small/test_dataframe_io.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 400af791e8..944fd27e6c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -400,9 +400,6 @@ def test_to_pandas_batches_w_empty_dataframe_json_in_list(session): assert len(results[0]) == 0 -# --- Behavior 2: JSON in Struct --- - - def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session): """Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON. @@ -429,9 +426,6 @@ def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session): assert len(results[0]) == 0 -# --- Behavior 3: Simple JSON --- - - def test_to_pandas_batches_w_empty_dataframe_simple_json(session): """Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column. From 5955bfe0a6a435894a9eaa08331f59932d5aef08 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 19:55:11 +0000 Subject: [PATCH 05/15] code refactor --- bigframes/core/blocks.py | 17 +++++------------ bigframes/session/loader.py | 23 +++-------------------- 2 files changed, 8 insertions(+), 32 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 40dff1c2a8..a5e5f270c1 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -720,17 +720,12 @@ def to_pandas_batches( series_map = {} for col in itertools.chain(self.value_columns, self.index_columns): dtype = self.expr.get_column_type(col) - if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype): - # Due to a limitation in Apache Arrow (#45262), JSON columns are not - # natively supported by the to_pandas_batches() method, which is - # used by the anywidget backend. - # Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 - # PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType, - # especially when nested. + try: + series_map[col] = pd.Series([], dtype=dtype) + except pa.ArrowNotImplementedError: + # PyArrow doesn't support creating an empty array with + # db_dtypes.JSONArrowType, especially when nested. # Create with string type and then cast. - - # MyPy doesn't automatically narrow the type of 'dtype' here, - # so we add an explicit check. if isinstance(dtype, pd.ArrowDtype): safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string( dtype.pyarrow_dtype @@ -742,8 +737,6 @@ def to_pandas_batches( # contains_db_dtypes_json_dtype is accurate, # but it's here for MyPy's sake. 
series_map[col] = pd.Series([], dtype=dtype) - else: - series_map[col] = pd.Series([], dtype=dtype) empty_val = pd.DataFrame(series_map) dfs = map( lambda a: a[0], diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index 6b16fe6bfd..62be2666ef 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -45,7 +45,6 @@ import google.cloud.bigquery.table from google.cloud.bigquery_storage_v1 import types as bq_storage_types import pandas -import pyarrow as pa import bigframes._tools import bigframes._tools.strings @@ -1307,22 +1306,6 @@ def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict: return configuration -def _has_json_arrow_type(arrow_type: pa.DataType) -> bool: - """ - Searches recursively for JSON array type within a PyArrow DataType. - """ - if arrow_type == bigframes.dtypes.JSON_ARROW_TYPE: - return True - if pa.types.is_list(arrow_type): - return _has_json_arrow_type(arrow_type.value_type) - if pa.types.is_struct(arrow_type): - for i in range(arrow_type.num_fields): - if _has_json_arrow_type(arrow_type.field(i).type): - return True - return False - return False - - def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): """ Determines whether a datatype is supported by bq load jobs. @@ -1339,9 +1322,9 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype): if column_type == bigframes.dtypes.JSON_DTYPE: return - if isinstance(column_type, pandas.ArrowDtype) and _has_json_arrow_type( - column_type.pyarrow_dtype - ): + if isinstance( + column_type, pandas.ArrowDtype + ) and bigframes.dtypes.contains_db_dtypes_json_dtype(column_type): raise NotImplementedError( f"Nested JSON types, found in column `{name}`: `{column_type}`', " f"are currently unsupported for upload. {constants.FEEDBACK_LINK}" From d07ba7e68f867ab13bd70451a5b59965530b8000 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 20:31:09 +0000 Subject: [PATCH 06/15] testcase update --- bigframes/core/blocks.py | 12 +--- tests/system/small/test_dataframe_io.py | 93 ++++++++++--------------- 2 files changed, 37 insertions(+), 68 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index a5e5f270c1..45daebf078 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -711,12 +711,6 @@ def to_pandas_batches( # To reduce the number of edge cases to consider when working with the # results of this, always return at least one DataFrame. See: # b/428918844. - empty_val = pd.DataFrame( - { - col: pd.Series([], dtype=self.expr.get_column_type(col)) - for col in itertools.chain(self.value_columns, self.index_columns) - } - ) series_map = {} for col in itertools.chain(self.value_columns, self.index_columns): dtype = self.expr.get_column_type(col) @@ -733,10 +727,8 @@ def to_pandas_batches( safe_dtype = pd.ArrowDtype(safe_pa_type) series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype) else: - # This branch should ideally not be reached if - # contains_db_dtypes_json_dtype is accurate, - # but it's here for MyPy's sake. 
- series_map[col] = pd.Series([], dtype=dtype) + # Fallback for other types that might error + series_map[col] = pd.Series([], dtype="object").astype(dtype) empty_val = pd.DataFrame(series_map) dfs = map( lambda a: a[0], diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 944fd27e6c..bb9a001606 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -376,75 +376,52 @@ def test_to_pandas_batches_w_empty_dataframe(session): pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes) -def test_to_pandas_batches_w_empty_dataframe_json_in_list(session): - """Tests to_pandas_batches() with an empty DataFrame containing a list of JSON. - - Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 +def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json(session): + """Verifies to_pandas_batches() preserves dtypes for nested JSON.""" + # This SQL query only tests the POPULATED case. + sql = """ + SELECT + 0 AS id, + [JSON '{"a":1}', JSON '{"b":2}'] AS json_array, + STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct """ - import db_dtypes + df = session.read_gbq(sql, index_col="id") - json_list_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType())) - empty_df_with_json_list = bpd.DataFrame( - { - "idx": pd.Series([], dtype="Int64"), - "json_list_col": pd.Series([], dtype=json_list_dtype), - }, - session=session, - ).set_index("idx", drop=True) + batches = list(df.to_pandas_batches()) - results = list(empty_df_with_json_list.to_pandas_batches()) + # Check that we processed the row + assert sum(len(b) for b in batches) == 1 - assert len(results) == 1 - assert list(results[0].columns) == ["json_list_col"] - assert results[0].dtypes["json_list_col"] == json_list_dtype - assert len(results[0]) == 0 + # Check dtypes on the resulting batch + assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType) + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType) -def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session): - """Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON. +def test_to_pandas_batches_should_not_error_on_empty_nested_json(session): + """Verify to_pandas_batches() works with empty nested JSON types. - Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + Regression test for PyArrow limitation with empty JSON arrays. """ - import db_dtypes - - json_struct_dtype = pd.ArrowDtype( - pa.struct([("json_field", db_dtypes.JSONArrowType())]) - ) - empty_df_with_json_struct = bpd.DataFrame( - { - "idx": pd.Series([], dtype="Int64"), - "json_struct_col": pd.Series([], dtype=json_struct_dtype), - }, - session=session, - ).set_index("idx", drop=True) - - results = list(empty_df_with_json_struct.to_pandas_batches()) - - assert len(results) == 1 - assert list(results[0].columns) == ["json_struct_col"] - assert results[0].dtypes["json_struct_col"] == json_struct_dtype - assert len(results[0]) == 0 - - -def test_to_pandas_batches_w_empty_dataframe_simple_json(session): - """Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column. 
- - Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273 + # This SQL query is MINIMAL and tests only the EMPTY regression case. + sql = """ + SELECT + 1 AS id, + [] AS json_array, + STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct """ - empty_df_with_json = bpd.DataFrame( - { - "idx": pd.Series([], dtype="Int64"), - "json_col": pd.Series([], dtype=dtypes.JSON_DTYPE), - }, - session=session, - ).set_index("idx", drop=True) + df = session.read_gbq(sql, index_col="id") - results = list(empty_df_with_json.to_pandas_batches()) + # The main point of this test is that this line does not raise an error. + batches = list(df.to_pandas_batches()) - assert len(results) == 1 - assert list(results[0].columns) == ["json_col"] - assert results[0].dtypes["json_col"] == dtypes.JSON_DTYPE - assert len(results[0]) == 0 + # Verify the row was actually processed and not just skipped + assert sum(len(b) for b in batches) == 1 + + # Verify dtypes are still correct, even with empty data + assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) @pytest.mark.parametrize("allow_large_results", (True, False)) From d7455a65ff016f507677c2a32df73c7941537890 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 21:27:20 +0000 Subject: [PATCH 07/15] Fix testcase --- bigframes/core/blocks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 45daebf078..ca6d7760c0 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -724,8 +724,12 @@ def to_pandas_batches( safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string( dtype.pyarrow_dtype ) - safe_dtype = pd.ArrowDtype(safe_pa_type) - series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype) + # Create empty array with safe type, but preserve original dtype metadata + empty_array = pa.array([], type=safe_pa_type) + series_map[col] = pd.Series( + empty_array, + dtype=dtype, # Use original dtype directly + ) else: # Fallback for other types that might error series_map[col] = pd.Series([], dtype="object").astype(dtype) From 12e2a6387e9e6f32225656c8eff468886c721c1d Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 23:26:07 +0000 Subject: [PATCH 08/15] function call updated in bigframes/core/blocks.py, unused function removed from bigframes/dtypes.py --- bigframes/core/blocks.py | 4 +--- bigframes/dtypes.py | 15 --------------- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index ca6d7760c0..817e60cce8 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -721,9 +721,7 @@ def to_pandas_batches( # db_dtypes.JSONArrowType, especially when nested. # Create with string type and then cast. 
                if isinstance(dtype, pd.ArrowDtype):
-                    safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string(
-                        dtype.pyarrow_dtype
-                    )
+                    safe_pa_type = bigframes.dtypes.to_storage_type(dtype.pyarrow_dtype)
                     # Create empty array with safe type, but preserve original dtype metadata
                     empty_array = pa.array([], type=safe_pa_type)
                     series_map[col] = pd.Series(
diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py
index 37a7c150ca..29e1be1ace 100644
--- a/bigframes/dtypes.py
+++ b/bigframes/dtypes.py
@@ -972,21 +972,6 @@ def contains_db_dtypes_json_dtype(dtype):
     return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)
 
 
-def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
-    """Recursively replace JSONArrowType with string type."""
-    if isinstance(pa_type, db_dtypes.JSONArrowType):
-        return pa.string()
-    if isinstance(pa_type, pa.ListType):
-        return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
-    if isinstance(pa_type, pa.StructType):
-        new_fields = [
-            field.with_type(_replace_json_arrow_with_string(field.type))
-            for field in pa_type
-        ]
-        return pa.struct(new_fields)
-    return pa_type
-
-
 def warn_on_db_dtypes_json_dtype(dtypes):
     """Warn that the JSON dtype is changing.
 

From 393a2f9b64173e578302ffca4841152a6b0f1a30 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Fri, 31 Oct 2025 23:31:02 +0000
Subject: [PATCH 09/15] Revert the code refactor in loader.py; I will use a
 separate PR for this refactor

---
 bigframes/session/loader.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py
index 62be2666ef..6b16fe6bfd 100644
--- a/bigframes/session/loader.py
+++ b/bigframes/session/loader.py
@@ -45,6 +45,7 @@
 import google.cloud.bigquery.table
 from google.cloud.bigquery_storage_v1 import types as bq_storage_types
 import pandas
+import pyarrow as pa
 
 import bigframes._tools
 import bigframes._tools.strings
@@ -1306,6 +1307,22 @@ def _transform_read_gbq_configuration(configuration: Optional[dict]) -> dict:
     return configuration
 
 
+def _has_json_arrow_type(arrow_type: pa.DataType) -> bool:
+    """
+    Searches recursively for JSON array type within a PyArrow DataType.
+    """
+    if arrow_type == bigframes.dtypes.JSON_ARROW_TYPE:
+        return True
+    if pa.types.is_list(arrow_type):
+        return _has_json_arrow_type(arrow_type.value_type)
+    if pa.types.is_struct(arrow_type):
+        for i in range(arrow_type.num_fields):
+            if _has_json_arrow_type(arrow_type.field(i).type):
+                return True
+        return False
+    return False
+
+
 def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):
     """
     Determines whether a datatype is supported by bq load jobs.
@@ -1322,9 +1339,9 @@ def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):
     if column_type == bigframes.dtypes.JSON_DTYPE:
         return
 
-    if isinstance(
-        column_type, pandas.ArrowDtype
-    ) and bigframes.dtypes.contains_db_dtypes_json_dtype(column_type):
+    if isinstance(column_type, pandas.ArrowDtype) and _has_json_arrow_type(
+        column_type.pyarrow_dtype
+    ):
         raise NotImplementedError(
             f"Nested JSON types, found in column `{name}`: `{column_type}`', "
             f"are currently unsupported for upload. 
{constants.FEEDBACK_LINK}" From 2ff0108197e0e07953d8a1e4d13cf1fbfaa4afb7 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Fri, 31 Oct 2025 23:37:57 +0000 Subject: [PATCH 10/15] replace the manual construction of the empty DataFrame with the more robust try...except block that leverages to_pyarrow and empty_table --- bigframes/core/blocks.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 817e60cce8..1eac176c32 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -711,27 +711,14 @@ def to_pandas_batches( # To reduce the number of edge cases to consider when working with the # results of this, always return at least one DataFrame. See: # b/428918844. - series_map = {} - for col in itertools.chain(self.value_columns, self.index_columns): - dtype = self.expr.get_column_type(col) - try: - series_map[col] = pd.Series([], dtype=dtype) - except pa.ArrowNotImplementedError: - # PyArrow doesn't support creating an empty array with - # db_dtypes.JSONArrowType, especially when nested. - # Create with string type and then cast. - if isinstance(dtype, pd.ArrowDtype): - safe_pa_type = bigframes.dtypes.to_storage_type(dtype.pyarrow_dtype) - # Create empty array with safe type, but preserve original dtype metadata - empty_array = pa.array([], type=safe_pa_type) - series_map[col] = pd.Series( - empty_array, - dtype=dtype, # Use original dtype directly - ) - else: - # Fallback for other types that might error - series_map[col] = pd.Series([], dtype="object").astype(dtype) - empty_val = pd.DataFrame(series_map) + try: + empty_arrow_table = self.expr.schema.to_pyarrow().empty_table() + except pa.ArrowNotImplementedError: + # Bug with some pyarrow versions, empty_table only supports base storage types, not extension types. 
+ empty_arrow_table = self.expr.schema.to_pyarrow( + use_storage_types=True + ).empty_table() + empty_val = empty_arrow_table.to_pandas() dfs = map( lambda a: a[0], itertools.zip_longest( From 512e3a186f2491bdb393d3e5dacee7fee603b3b6 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 1 Nov 2025 00:20:00 +0000 Subject: [PATCH 11/15] fix testcase --- tests/system/small/test_dataframe_io.py | 32 +++++++++++++------------ 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index bb9a001606..c519636b0c 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -386,25 +386,25 @@ def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json(session): STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct """ df = session.read_gbq(sql, index_col="id") - batches = list(df.to_pandas_batches()) - # Check that we processed the row assert sum(len(b) for b in batches) == 1 - # Check dtypes on the resulting batch - assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) - assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType) + # Check dtypes based on pandas version + if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType) + else: + # In pandas 1.x, list types become object dtype + assert batches[0].dtypes["json_array"] == "object" + + # Struct types work in both pandas versions assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType) def test_to_pandas_batches_should_not_error_on_empty_nested_json(session): - """Verify to_pandas_batches() works with empty nested JSON types. - - Regression test for PyArrow limitation with empty JSON arrays. - """ - # This SQL query is MINIMAL and tests only the EMPTY regression case. + """Verify to_pandas_batches() works with empty nested JSON types.""" sql = """ SELECT 1 AS id, @@ -413,14 +413,16 @@ def test_to_pandas_batches_should_not_error_on_empty_nested_json(session): """ df = session.read_gbq(sql, index_col="id") - # The main point of this test is that this line does not raise an error. 
+    # The main point: this should not raise an error
     batches = list(df.to_pandas_batches())
-
-    # Verify the row was actually processed and not just skipped
     assert sum(len(b) for b in batches) == 1
 
-    # Verify dtypes are still correct, even with empty data
-    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    # Check dtypes based on pandas version
+    if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
+        assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    else:
+        assert batches[0].dtypes["json_array"] == "object"
+
     assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
 

From 5f5881b167c70867c482e13b7e13834cd775c544 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Sat, 1 Nov 2025 00:51:04 +0000
Subject: [PATCH 12/15] Use existing arrow_to_pandas() helper that properly
 handles dtype conversion

---
 bigframes/core/blocks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 1eac176c32..a70ea63c4d 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -68,6 +68,7 @@
 import bigframes.operations.aggregations as agg_ops
 from bigframes.session import dry_runs, execution_spec
 from bigframes.session import executor as executors
+from bigframes.session._io import pandas as io_pandas
 
 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -718,7 +719,7 @@ def to_pandas_batches(
             empty_arrow_table = self.expr.schema.to_pyarrow(
                 use_storage_types=True
             ).empty_table()
-        empty_val = empty_arrow_table.to_pandas()
+        empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(

From 3119771622f7a946da6b067a3f5ab23ba8f2b143 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Sat, 1 Nov 2025 01:00:06 +0000
Subject: [PATCH 13/15] testcase update

---
 tests/system/small/test_dataframe_io.py | 40 +++++++++++--------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index c519636b0c..699fd2d056 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -378,7 +378,7 @@ def test_to_pandas_batches_w_empty_dataframe(session):
 
 def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json(session):
     """Verifies to_pandas_batches() preserves dtypes for nested JSON."""
-    # This SQL query only tests the POPULATED case.
+
     sql = """
         SELECT
             0 AS id,
             [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
             STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
     """
     df = session.read_gbq(sql, index_col="id")
-    batches = list(df.to_pandas_batches())
-
-    assert sum(len(b) for b in batches) == 1
 
-    # Check dtypes based on pandas version
-    if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable:
-        assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
-        assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType)
-    else:
-        # In pandas 1.x, list types become object dtype
-        assert batches[0].dtypes["json_array"] == "object"
+    batches = list(df.to_pandas_batches())
 
-    # Struct types work in both pandas versions
-    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
-    assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
+    # Focuses only on the "preserves dtypes" behavior. 
+ # This implicitly checks that at least one batch was produced. + pd.testing.assert_series_equal( + batches[0].dtypes, + df.dtypes, + check_dtype=bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + ) def test_to_pandas_batches_should_not_error_on_empty_nested_json(session): """Verify to_pandas_batches() works with empty nested JSON types.""" + sql = """ SELECT 1 AS id, @@ -413,17 +409,15 @@ def test_to_pandas_batches_should_not_error_on_empty_nested_json(session): """ df = session.read_gbq(sql, index_col="id") - # The main point: this should not raise an error + # Verify that this line does not raise an error. batches = list(df.to_pandas_batches()) - assert sum(len(b) for b in batches) == 1 - - # Check dtypes based on pandas version - if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: - assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) - else: - assert batches[0].dtypes["json_array"] == "object" - assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + # Verify the resulting dtypes are correct for the empty/null data + pd.testing.assert_series_equal( + batches[0].dtypes, + df.dtypes, + check_dtype=bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + ) @pytest.mark.parametrize("allow_large_results", (True, False)) From be1dea4ab8f414bdbe4b6ba8d6f0a8ba0f639db5 Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Sat, 1 Nov 2025 20:01:01 +0000 Subject: [PATCH 14/15] refactor testcase --- tests/system/small/test_dataframe_io.py | 78 +++++++++++++++++++------ 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 699fd2d056..4d4a144d0a 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -376,9 +376,31 @@ def test_to_pandas_batches_w_empty_dataframe(session): pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes) -def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json(session): - """Verifies to_pandas_batches() preserves dtypes for nested JSON.""" +@pytest.mark.skipif( + bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + reason="Test for pandas 1.x behavior only", +) +def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session): + """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x.""" + sql = """ + SELECT + 0 AS id, + [JSON '{"a":1}', JSON '{"b":2}'] AS json_array, + STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct + """ + df = session.read_gbq(sql, index_col="id") + batches = list(df.to_pandas_batches()) + + assert batches[0].dtypes["json_array"] == "object" + assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype) + +@pytest.mark.skipif( + not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable, + reason="Test for pandas 2.x behavior only", +) +def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session): + """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x.""" sql = """ SELECT 0 AS id, @@ -386,20 +408,42 @@ def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json(session): STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct """ df = session.read_gbq(sql, index_col="id") + batches = list(df.to_pandas_batches()) + + assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype) + assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType) + 
assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x."""
+
+    sql = """
+        SELECT
+            1 AS id,
+            [] AS json_array,
+            STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x."""
 
     sql = """
         SELECT
             1 AS id,
             [] AS json_array,
             STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
     """
     df = session.read_gbq(sql, index_col="id")
 
-    # Verify that this line does not raise an error.
+    # The main point: this should not raise an error
     batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
 
-    # Verify the resulting dtypes are correct for the empty/null data
-    pd.testing.assert_series_equal(
-        batches[0].dtypes,
-        df.dtypes,
-        check_dtype=bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
-    )
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
 
 
 @pytest.mark.parametrize("allow_large_results", (True, False))

From 5ed42933457e0368b24a3bfc2fd83588a4a7d123 Mon Sep 17 00:00:00 2001
From: Shuowei Li
Date: Mon, 3 Nov 2025 21:30:07 +0000
Subject: [PATCH 15/15] Add pyarrow issue link to comments

---
 bigframes/core/blocks.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index a70ea63c4d..61aaab1120 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -715,7 +715,8 @@ def to_pandas_batches(
         try:
             empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
         except pa.ArrowNotImplementedError:
-            # Bug with some pyarrow versions, empty_table only supports base storage types, not extension types.
+            # Bug with some pyarrow versions (https://github.com/apache/arrow/issues/45262):
+            # empty_table only supports base storage types, not extension types.
             empty_arrow_table = self.expr.schema.to_pyarrow(
                 use_storage_types=True
             ).empty_table()