fix: boolean round-trip test and CSV datetime loading errors (#1000)

google-labs-jules[bot] · chalmerlowe · web-flow · commit d4431030e1b5 · 2025-12-15T15:40:25.000-05:00
This PR fixes two issues causing CI failures: 1. **Boolean Round Trip**: Adjusted the expected DataFrame in `test_dataframe_round_trip_with_table_schema` to correctly handle `pd.NA` returned by the connector for nullable boolean values, instead of `None`. 2. **CSV Extreme Datetimes**: Implemented `cast_dataframe_for_csv` to pre-format `DATETIME` and `TIMESTAMP` columns using `.isoformat()` before CSV serialization. This ensures years before 1000 are zero-padded (e.g., `0001-01-01`), avoiding invalid date string errors from BigQuery when loading data. --- *PR created automatically by Jules for task [5793097527839411486](https://jules.google.com/task/5793097527839411486) started by @chalmerlowe* --------- Co-authored-by: google-labs-jules[bot] <161369871+google-labs-jules[bot]@users.noreply.github.com> Co-authored-by: chalmer lowe <chalmerlowe@google.com>
diff --git a/pandas_gbq/load/__init__.py b/pandas_gbq/load/__init__.py
@@ -3,6 +3,7 @@
 # license that can be found in the LICENSE file.
 
 from pandas_gbq.load.core import (
+    cast_dataframe_for_csv,
     cast_dataframe_for_parquet,
     encode_chunk,
     load_chunks,
@@ -13,6 +14,7 @@
 )
 
 __all__ = [
+    "cast_dataframe_for_csv",
     "cast_dataframe_for_parquet",
     "encode_chunk",
     "load_chunks",
diff --git a/pandas_gbq/load/core.py b/pandas_gbq/load/core.py
@@ -124,6 +124,45 @@ def convert(x):
     return dataframe
 
 
+def cast_dataframe_for_csv(
+    dataframe: pandas.DataFrame,
+    schema: Optional[Dict[str, Any]],
+) -> pandas.DataFrame:
+    """Format DATETIME/TIMESTAMP columns as ISO strings before CSV export.
+
+    ``isoformat`` is used so that years are always written with 4 digits; see
+    https://github.com/googleapis/python-bigquery-pandas/issues/365
+
+    Args:
+        dataframe: Data about to be serialized; it is never mutated.
+        schema: Table schema with a ``"fields"`` list, or ``None``.
+
+    Returns:
+        A frame with the converted columns, or ``dataframe`` itself when
+        nothing needed converting.
+    """
+
+    def convert(x):
+        if pandas.isna(x):
+            return None
+        try:
+            return x.isoformat(sep=" ")
+        except AttributeError:
+            # Already a string, or some other type without isoformat.
+            return x
+
+    new_columns = {}
+    # ``schema``, its "fields" entry, or a field's "type" may each be None.
+    for column in (schema or {}).get("fields") or []:
+        column_name = column.get("name")
+        # The schema can be a superset of the dataframe's columns.
+        if column_name not in dataframe.columns:
+            continue
+        if (column.get("type") or "").upper() in {"DATETIME", "TIMESTAMP"}:
+            new_columns[column_name] = dataframe[column_name].map(convert)
+    return dataframe.assign(**new_columns) if new_columns else dataframe
+
+
 def load_parquet(
     client: bigquery.Client,
     dataframe: pandas.DataFrame,
@@ -195,6 +234,9 @@ def load_csv_from_dataframe(
         bq_schema = pandas_gbq.schema.to_google_cloud_bigquery(schema)
 
     def load_chunk(chunk, job_config):
+        if schema is not None:
+            chunk = cast_dataframe_for_csv(chunk, schema)
+
         client.load_table_from_dataframe(
             chunk,
             destination_table_ref,
diff --git a/tests/system/test_to_gbq.py b/tests/system/test_to_gbq.py
@@ -160,6 +160,27 @@ def test_series_round_trip(
                     ),
                 }
             ),
+            expected_df=pandas.DataFrame(
+                {
+                    "row_num": [0, 1, 2],
+                    "bool_col": pandas.Series(
+                        [True, False, True],
+                        dtype="bool",
+                    ),
+                    "boolean_col": pandas.Series(
+                        [None, True, False],
+                        dtype="boolean",
+                    ),
+                    "object_col": pandas.Series(
+                        [
+                            False,
+                            (pandas.NA if hasattr(pandas, "NA") else None),
+                            True,
+                        ],
+                        dtype="object",
+                    ),
+                }
+            ),
             table_schema=[
                 {"name": "bool_col", "type": "BOOLEAN"},
                 {"name": "boolean_col", "type": "BOOLEAN"},