Skip to content
Permalink
Browse files
fix: issue a warning if buggy pyarrow is detected (#787)
Some pyarrow versions can cause issues when loading data from a dataframe.
This commit detects whether such a pyarrow version is installed and warns
the user.
  • Loading branch information
plamut committed Jul 21, 2021
1 parent d1cbc38 commit e403721af1373eb1f1a1c7be5b2182e3819ed1f9
Showing with 52 additions and 0 deletions.
  1. +15 −0 google/cloud/bigquery/client.py
  2. +37 −0 tests/unit/test_client.py
@@ -27,13 +27,16 @@
import json
import math
import os
import packaging.version
import tempfile
from typing import Any, BinaryIO, Dict, Iterable, Optional, Sequence, Tuple, Union
import uuid
import warnings

# pyarrow is an optional dependency: import it if available and remember its
# parsed version so that it can be compared against known-problematic releases.
try:
    import pyarrow
except ImportError:  # pragma: NO COVER
    pyarrow = None
    # Keep the name bound even without pyarrow, so that code referencing
    # ``_PYARROW_VERSION`` never fails with a NameError.
    _PYARROW_VERSION = None
else:
    _PYARROW_VERSION = packaging.version.parse(pyarrow.__version__)

@@ -118,6 +121,9 @@
# https://github.com/googleapis/python-bigquery/issues/438
_MIN_GET_QUERY_RESULTS_TIMEOUT = 120

# pyarrow releases for which loading dataframe data in PARQUET format triggers
# a data corruption warning. Background:
# https://github.com/googleapis/python-bigquery/issues/781#issuecomment-883497414
_PYARROW_BAD_VERSIONS = frozenset({packaging.version.Version("2.0.0")})


class Project(object):
"""Wrapper for resource describing a BigQuery project.
@@ -2609,6 +2615,15 @@ def load_table_from_dataframe(
try:

if job_config.source_format == job.SourceFormat.PARQUET:
if _PYARROW_VERSION in _PYARROW_BAD_VERSIONS:
msg = (
"Loading dataframe data in PARQUET format with pyarrow "
f"{_PYARROW_VERSION} can result in data corruption. It is "
"therefore *strongly* advised to use a different pyarrow "
"version or a different source format. "
"See: https://github.com/googleapis/python-bigquery/issues/781"
)
warnings.warn(msg, category=RuntimeWarning)

if job_config.schema:
if parquet_compression == "snappy": # adjust the default value
@@ -27,6 +27,7 @@
import warnings

import mock
import packaging
import requests
import pytest
import pytz
@@ -7510,6 +7511,42 @@ def test_load_table_from_dataframe_wo_pyarrow_raises_error(self):
parquet_compression="gzip",
)

def test_load_table_from_dataframe_w_bad_pyarrow_issues_warning(self):
    """A known-bad pyarrow version must trigger exactly one RuntimeWarning.

    The module-level ``_PYARROW_VERSION`` is patched to ``2.0.0`` and the
    table lookup / file upload are mocked out, so only the version check in
    ``load_table_from_dataframe`` is exercised.
    """
    pytest.importorskip("pandas", reason="Requires `pandas`")
    pytest.importorskip("pyarrow", reason="Requires `pyarrow`")

    client = self._make_client()
    records = [{"id": 1, "age": 100}, {"id": 2, "age": 60}]
    dataframe = pandas.DataFrame(records)

    pyarrow_version_patch = mock.patch(
        "google.cloud.bigquery.client._PYARROW_VERSION",
        packaging.version.parse("2.0.0"),  # A known bad version of pyarrow.
    )
    get_table_patch = mock.patch(
        "google.cloud.bigquery.client.Client.get_table",
        autospec=True,
        side_effect=google.api_core.exceptions.NotFound("Table not found"),
    )
    load_patch = mock.patch(
        "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
    )

    with load_patch, get_table_patch, pyarrow_version_patch:
        with warnings.catch_warnings(record=True) as warned:
            # Record every warning regardless of the filters configured for
            # the test run; otherwise the result depends on ambient settings.
            warnings.simplefilter("always")
            client.load_table_from_dataframe(
                dataframe, self.TABLE_REF, location=self.LOCATION,
            )

    # Match on the category and the message text only. ``str(warning)`` also
    # embeds the originating file name, which may contain "pyarrow" for
    # warnings that have nothing to do with the version check.
    expected_warnings = [
        warning
        for warning in warned
        if issubclass(warning.category, RuntimeWarning)
        and "pyarrow" in str(warning.message).lower()
    ]
    assert len(expected_warnings) == 1
    msg = str(expected_warnings[0].message)
    assert "pyarrow 2.0.0" in msg
    assert "data corruption" in msg

@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_load_table_from_dataframe_w_nulls(self):

0 comments on commit e403721

Please sign in to comment.