diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py
index 41986ce5df..61aaab1120 100644
--- a/bigframes/core/blocks.py
+++ b/bigframes/core/blocks.py
@@ -68,6 +68,7 @@
 import bigframes.operations.aggregations as agg_ops
 from bigframes.session import dry_runs, execution_spec
 from bigframes.session import executor as executors
+from bigframes.session._io import pandas as io_pandas
 
 # Type constraint for wherever column labels are used
 Label = typing.Hashable
@@ -711,12 +712,15 @@
         # To reduce the number of edge cases to consider when working with the
         # results of this, always return at least one DataFrame. See:
         # b/428918844.
-        empty_val = pd.DataFrame(
-            {
-                col: pd.Series([], dtype=self.expr.get_column_type(col))
-                for col in itertools.chain(self.value_columns, self.index_columns)
-            }
-        )
+        try:
+            empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
+        except pa.ArrowNotImplementedError:
+            # Bug with some pyarrow versions(https://github.com/apache/arrow/issues/45262),
+            # empty_table only supports base storage types, not extension types.
+            empty_arrow_table = self.expr.schema.to_pyarrow(
+                use_storage_types=True
+            ).empty_table()
+        empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
         dfs = map(
             lambda a: a[0],
             itertools.zip_longest(
diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py
index 96d7881d67..4d4a144d0a 100644
--- a/tests/system/small/test_dataframe_io.py
+++ b/tests/system/small/test_dataframe_io.py
@@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session):
     pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)
 
 
+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session):
+    """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 1.x."""
+    sql = """
+    SELECT
+        0 AS id,
+        [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
+        STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+    batches = list(df.to_pandas_batches())
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session):
+    """Verifies to_pandas_batches() preserves dtypes for nested JSON in pandas 2.x."""
+    sql = """
+    SELECT
+        0 AS id,
+        [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
+        STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+    batches = list(df.to_pandas_batches())
+
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_array"].pyarrow_dtype, pa.ListType)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 1.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 1.x."""
+
+    sql = """
+    SELECT
+        1 AS id,
+        [] AS json_array,
+        STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert batches[0].dtypes["json_array"] == "object"
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+
+
+@pytest.mark.skipif(
+    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
+    reason="Test for pandas 2.x behavior only",
+)
+def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
+    """Verify to_pandas_batches() works with empty nested JSON types in pandas 2.x."""
+
+    sql = """
+    SELECT
+        1 AS id,
+        [] AS json_array,
+        STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
+    """
+    df = session.read_gbq(sql, index_col="id")
+
+    # The main point: this should not raise an error
+    batches = list(df.to_pandas_batches())
+    assert sum(len(b) for b in batches) == 1
+
+    assert isinstance(batches[0].dtypes["json_array"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"], pd.ArrowDtype)
+    assert isinstance(batches[0].dtypes["json_struct"].pyarrow_dtype, pa.StructType)
+
+
 @pytest.mark.parametrize("allow_large_results", (True, False))
 def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
     """Verify to_pandas_batches() APIs returns the expected page size.