Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import bigframes.operations.aggregations as agg_ops
from bigframes.session import dry_runs, execution_spec
from bigframes.session import executor as executors
from bigframes.session._io import pandas as io_pandas

# Type constraint for wherever column labels are used
Label = typing.Hashable
Expand Down Expand Up @@ -711,12 +712,15 @@ def to_pandas_batches(
# To reduce the number of edge cases to consider when working with the
# results of this, always return at least one DataFrame. See:
# b/428918844.
empty_val = pd.DataFrame(
{
col: pd.Series([], dtype=self.expr.get_column_type(col))
for col in itertools.chain(self.value_columns, self.index_columns)
}
)
try:
empty_arrow_table = self.expr.schema.to_pyarrow().empty_table()
except pa.ArrowNotImplementedError:
# Bug with some pyarrow versions(https://github.com/apache/arrow/issues/45262),
# empty_table only supports base storage types, not extension types.
empty_arrow_table = self.expr.schema.to_pyarrow(
use_storage_types=True
).empty_table()
empty_val = io_pandas.arrow_to_pandas(empty_arrow_table, self.expr.schema)
dfs = map(
lambda a: a[0],
itertools.zip_longest(
Expand Down
86 changes: 86 additions & 0 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,92 @@ def test_to_pandas_batches_w_empty_dataframe(session):
pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)


@pytest.mark.skipif(
    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
    reason="Test for pandas 1.x behavior only",
)
def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas1(session):
    """Check that to_pandas_batches() keeps expected dtypes for populated
    nested JSON columns when running under pandas 1.x.

    Under pandas 1.x arrow list dtypes are unusable, so an array of JSON
    values surfaces as plain ``object``, while a STRUCT containing JSON
    still surfaces as a ``pd.ArrowDtype``.
    """
    sql = """
    SELECT
    0 AS id,
    [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
    STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
    """
    nested_df = session.read_gbq(sql, index_col="id")
    pages = list(nested_df.to_pandas_batches())
    first_page = pages[0]

    # Arrays of JSON degrade to object dtype on pandas 1.x; structs do not.
    assert first_page.dtypes["json_array"] == "object"
    assert isinstance(first_page.dtypes["json_struct"], pd.ArrowDtype)


@pytest.mark.skipif(
    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
    reason="Test for pandas 2.x behavior only",
)
def test_to_pandas_batches_preserves_dtypes_for_populated_nested_json_pandas2(session):
    """Check that to_pandas_batches() keeps expected dtypes for populated
    nested JSON columns when running under pandas 2.x.

    With arrow list dtypes usable, both the JSON array and the STRUCT
    column should surface as ``pd.ArrowDtype``, and the array's
    underlying pyarrow type should be a list type.
    """
    sql = """
    SELECT
    0 AS id,
    [JSON '{"a":1}', JSON '{"b":2}'] AS json_array,
    STRUCT(JSON '{"x":1}' AS json_field, 'test' AS str_field) AS json_struct
    """
    nested_df = session.read_gbq(sql, index_col="id")
    first_page = list(nested_df.to_pandas_batches())[0]

    json_array_dtype = first_page.dtypes["json_array"]
    assert isinstance(json_array_dtype, pd.ArrowDtype)
    assert isinstance(json_array_dtype.pyarrow_dtype, pa.ListType)
    assert isinstance(first_page.dtypes["json_struct"], pd.ArrowDtype)


@pytest.mark.skipif(
    bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
    reason="Test for pandas 1.x behavior only",
)
def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas1(session):
    """Regression check: to_pandas_batches() must not raise for empty /
    NULL nested JSON values on pandas 1.x.

    Also verifies the resulting dtypes: JSON arrays fall back to
    ``object`` on pandas 1.x, structs remain ``pd.ArrowDtype``.
    """

    sql = """
    SELECT
    1 AS id,
    [] AS json_array,
    STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
    """
    nested_df = session.read_gbq(sql, index_col="id")

    # Primary assertion is implicit: materializing the batches must succeed.
    pages = list(nested_df.to_pandas_batches())
    total_rows = sum(map(len, pages))
    assert total_rows == 1

    first_page = pages[0]
    assert first_page.dtypes["json_array"] == "object"
    assert isinstance(first_page.dtypes["json_struct"], pd.ArrowDtype)


@pytest.mark.skipif(
    not bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable,
    reason="Test for pandas 2.x behavior only",
)
def test_to_pandas_batches_should_not_error_on_empty_nested_json_pandas2(session):
    """Regression check: to_pandas_batches() must not raise for empty /
    NULL nested JSON values on pandas 2.x.

    Also verifies the resulting dtypes: both nested columns surface as
    ``pd.ArrowDtype``, with the struct column backed by a pyarrow
    struct type.
    """

    sql = """
    SELECT
    1 AS id,
    [] AS json_array,
    STRUCT(NULL AS json_field, 'test2' AS str_field) AS json_struct
    """
    nested_df = session.read_gbq(sql, index_col="id")

    # Primary assertion is implicit: materializing the batches must succeed.
    pages = list(nested_df.to_pandas_batches())
    total_rows = sum(map(len, pages))
    assert total_rows == 1

    first_page = pages[0]
    assert isinstance(first_page.dtypes["json_array"], pd.ArrowDtype)
    json_struct_dtype = first_page.dtypes["json_struct"]
    assert isinstance(json_struct_dtype, pd.ArrowDtype)
    assert isinstance(json_struct_dtype.pyarrow_dtype, pa.StructType)


@pytest.mark.parametrize("allow_large_results", (True, False))
def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
"""Verify to_pandas_batches() APIs returns the expected page size.
Expand Down