Skip to content

Commit d443103

Browse files
fix: boolean round-trip test and CSV datetime loading errors (#1000)
This PR fixes two issues causing CI failures:

1. **Boolean Round Trip**: Adjusted the expected DataFrame in `test_dataframe_round_trip_with_table_schema` to correctly handle `pd.NA` returned by the connector for nullable boolean values, instead of `None`.
2. **CSV Extreme Datetimes**: Implemented `cast_dataframe_for_csv` to pre-format `DATETIME` and `TIMESTAMP` columns using `.isoformat()` before CSV serialization. This ensures years before 1000 are zero-padded (e.g., `0001-01-01`), avoiding invalid date string errors from BigQuery when loading data.

---

*PR created automatically by Jules for task [5793097527839411486](https://jules.google.com/task/5793097527839411486) started by @chalmerlowe*

---------

Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com>
Co-authored-by: chalmer lowe <chalmerlowe@google.com>
1 parent f1525d9 commit d443103

File tree

3 files changed

+65
-0
lines changed

3 files changed

+65
-0
lines changed

pandas_gbq/load/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# license that can be found in the LICENSE file.
44

55
from pandas_gbq.load.core import (
6+
cast_dataframe_for_csv,
67
cast_dataframe_for_parquet,
78
encode_chunk,
89
load_chunks,
@@ -13,6 +14,7 @@
1314
)
1415

1516
__all__ = [
17+
"cast_dataframe_for_csv",
1618
"cast_dataframe_for_parquet",
1719
"encode_chunk",
1820
"load_chunks",

pandas_gbq/load/core.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,45 @@ def convert(x):
124124
return dataframe
125125

126126

127+
def _format_datetime_for_csv(value):
    """Return ``value`` as an ISO 8601 string using a space separator.

    Missing values (anything ``pandas.isna`` reports as null) become ``None``.
    Values that cannot be formatted with ``isoformat(sep=" ")`` are returned
    unchanged.
    """
    if pandas.isna(value):
        return None
    try:
        # isoformat always emits a 4-digit, zero-padded year.
        # https://github.com/googleapis/python-bigquery-pandas/issues/365
        return value.isoformat(sep=" ")
    except (AttributeError, TypeError):
        # AttributeError: no isoformat at all (e.g. already a string).
        # TypeError: isoformat exists but takes no ``sep`` argument
        # (e.g. datetime.date, datetime.time).
        return value


def cast_dataframe_for_csv(
    dataframe: pandas.DataFrame,
    schema: Optional[Dict[str, Any]],
) -> pandas.DataFrame:
    """Cast columns to needed dtype when writing CSV files.

    Columns listed in ``schema["fields"]`` with type ``DATETIME`` or
    ``TIMESTAMP`` (case-insensitive) are mapped element-wise to
    space-separated ISO 8601 strings; null elements become ``None``.

    Args:
        dataframe: The frame about to be serialized. It is not modified.
        schema: A dictionary with a ``"fields"`` list of dictionaries, each
            read for its ``"name"`` and ``"type"`` keys. May be ``None``,
            empty, or have ``"fields"`` set to ``None``.

    Returns:
        The original ``dataframe`` object when no column needed conversion,
        otherwise a new frame with the converted columns replaced.
    """
    if not schema:
        return dataframe

    # Protect against a missing key or an explicit None in the dictionary.
    fields = schema.get("fields") or []

    new_columns = {}
    for field in fields:
        # Schema can be a superset of the columns in the dataframe, so ignore
        # columns that aren't present.
        column_name = field.get("name")
        if column_name not in dataframe.columns:
            continue

        # ``or ""`` guards against an explicit ``"type": None`` entry.
        column_type = (field.get("type") or "").upper()
        if column_type in {"DATETIME", "TIMESTAMP"}:
            new_columns[column_name] = dataframe[column_name].map(
                _format_datetime_for_csv
            )

    if new_columns:
        dataframe = dataframe.assign(**new_columns)
    return dataframe
164+
165+
127166
def load_parquet(
128167
client: bigquery.Client,
129168
dataframe: pandas.DataFrame,
@@ -195,6 +234,9 @@ def load_csv_from_dataframe(
195234
bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
196235

197236
def load_chunk(chunk, job_config):
237+
if schema is not None:
238+
chunk = cast_dataframe_for_csv(chunk, schema)
239+
198240
client.load_table_from_dataframe(
199241
chunk,
200242
destination_table_ref,

tests/system/test_to_gbq.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,27 @@ def test_series_round_trip(
160160
),
161161
}
162162
),
163+
expected_df=pandas.DataFrame(
164+
{
165+
"row_num": [0, 1, 2],
166+
"bool_col": pandas.Series(
167+
[True, False, True],
168+
dtype="bool",
169+
),
170+
"boolean_col": pandas.Series(
171+
[None, True, False],
172+
dtype="boolean",
173+
),
174+
"object_col": pandas.Series(
175+
[
176+
False,
177+
(pandas.NA if hasattr(pandas, "NA") else None),
178+
True,
179+
],
180+
dtype="object",
181+
),
182+
}
183+
),
163184
table_schema=[
164185
{"name": "bool_col", "type": "BOOLEAN"},
165186
{"name": "boolean_col", "type": "BOOLEAN"},

0 commit comments

Comments
 (0)