Skip to content
Permalink
Browse files
fix: guard imports against unsupported pyarrow versions (#934)
* fix: guard imports against unsupported pyarrow versions

* add unit tests

* fix pytype

* second try at fixing pytype
  • Loading branch information
tswast committed Sep 1, 2021
1 parent 10fee52 commit b28907693bbe889becc1b9c8963f0a7e1ee6c35a
@@ -19,7 +19,7 @@
import decimal
import math
import re
from typing import Union
from typing import Any, Union

from google.cloud._helpers import UTC
from google.cloud._helpers import _date_from_iso8601_date
@@ -29,7 +29,10 @@
from google.cloud._helpers import _to_bytes
import packaging.version

from google.cloud.bigquery.exceptions import LegacyBigQueryStorageError
from google.cloud.bigquery.exceptions import (
LegacyBigQueryStorageError,
LegacyPyarrowError,
)


_RFC3339_MICROS_NO_ZULU = "%Y-%m-%dT%H:%M:%S.%f"
@@ -42,6 +45,7 @@
re.VERBOSE,
)

_MIN_PYARROW_VERSION = packaging.version.Version("3.0.0")
_MIN_BQ_STORAGE_VERSION = packaging.version.Version("2.0.0")
_BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")

@@ -95,12 +99,74 @@ def verify_version(self):
if self.installed_version < _MIN_BQ_STORAGE_VERSION:
msg = (
"Dependency google-cloud-bigquery-storage is outdated, please upgrade "
f"it to version >= 2.0.0 (version found: {self.installed_version})."
f"it to version >= {_MIN_BQ_STORAGE_VERSION} (version found: {self.installed_version})."
)
raise LegacyBigQueryStorageError(msg)


class PyarrowVersions:
    """Version comparisons for pyarrow package."""

    def __init__(self):
        # Resolved lazily on first access to ``installed_version`` and cached.
        self._installed_version = None

    @property
    def installed_version(self) -> packaging.version.Version:
        """Return the parsed version of pyarrow."""
        if self._installed_version is not None:
            return self._installed_version

        import pyarrow

        # Use 0.0.0, since it is earlier than any released version.
        # Legacy versions also have the same property, but
        # creating a LegacyVersion has been deprecated.
        # https://github.com/pypa/packaging/issues/321
        raw_version = getattr(pyarrow, "__version__", "0.0.0")
        self._installed_version = packaging.version.parse(raw_version)
        return self._installed_version

    def try_import(self, raise_if_error: bool = False) -> Any:
        """Verify that a recent enough version of pyarrow extra is
        installed.

        The function assumes that pyarrow extra is installed, and should thus
        be used in places where this assumption holds.

        Because `pip` can install an outdated version of this extra despite the
        constraints in `setup.py`, the calling code can use this helper to
        verify the version compatibility at runtime.

        Returns:
            The ``pyarrow`` module or ``None``.

        Raises:
            LegacyPyarrowError:
                If the pyarrow package is outdated and ``raise_if_error`` is ``True``.
        """
        try:
            import pyarrow
        except ImportError as exc:  # pragma: NO COVER
            if raise_if_error:
                raise LegacyPyarrowError(
                    f"pyarrow package not found. Install pyarrow version >= {_MIN_PYARROW_VERSION}."
                ) from exc
            return None

        # Recent enough: hand the module back to the caller.
        if self.installed_version >= _MIN_PYARROW_VERSION:
            return pyarrow

        if raise_if_error:
            raise LegacyPyarrowError(
                "Dependency pyarrow is outdated, please upgrade "
                f"it to version >= {_MIN_PYARROW_VERSION} (version found: {self.installed_version})."
            )
        return None


BQ_STORAGE_VERSIONS = BQStorageVersions()
PYARROW_VERSIONS = PyarrowVersions()


def _not_null(value, field):
@@ -55,12 +55,6 @@ def _to_wkb(v):

_to_wkb = _to_wkb()

try:
import pyarrow
import pyarrow.parquet
except ImportError: # pragma: NO COVER
pyarrow = None

try:
from google.cloud.bigquery_storage import ArrowSerializationOptions
except ImportError:
@@ -73,12 +67,10 @@ def _to_wkb(v):
from google.cloud.bigquery import schema


_LOGGER = logging.getLogger(__name__)
pyarrow = _helpers.PYARROW_VERSIONS.try_import()

_NO_BQSTORAGE_ERROR = (
"The google-cloud-bigquery-storage library is not installed, "
"please install google-cloud-bigquery-storage to use bqstorage features."
)

_LOGGER = logging.getLogger(__name__)

_PROGRESS_INTERVAL = 0.2 # Maximum time between download status checks, in seconds.

@@ -548,8 +540,9 @@ def dataframe_to_parquet(dataframe, bq_schema, filepath, parquet_compression="SN
serializing method. Defaults to "SNAPPY".
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html#pyarrow-parquet-write-table
"""
if pyarrow is None:
raise ValueError("pyarrow is required for BigQuery schema conversion.")
pyarrow = _helpers.PYARROW_VERSIONS.try_import(raise_if_error=True)

import pyarrow.parquet

bq_schema = schema._to_schema_fields(bq_schema)
arrow_table = dataframe_to_arrow(dataframe, bq_schema)
@@ -19,3 +19,7 @@ class BigQueryError(Exception):

class LegacyBigQueryStorageError(BigQueryError):
    """Raised when too old a version of BigQuery Storage extra is detected at runtime.

    Raised by the runtime version guard when the installed
    ``google-cloud-bigquery-storage`` package is older than the minimum
    supported version, so callers get an actionable upgrade message.
    """


class LegacyPyarrowError(BigQueryError):
    """Raised when too old a version of pyarrow package is detected at runtime.

    Raised by ``PyarrowVersions.try_import(raise_if_error=True)`` when
    ``pyarrow`` is missing or older than the minimum supported version.
    """
@@ -94,9 +94,16 @@ def unit(session):
default(session)


@nox.session(python=UNIT_TEST_PYTHON_VERSIONS[-1])
@nox.session(python=[UNIT_TEST_PYTHON_VERSIONS[0], UNIT_TEST_PYTHON_VERSIONS[-1]])
def unit_noextras(session):
    """Run the unit test suite."""

    # Install optional dependencies that are out-of-date to exercise the
    # legacy-version import guards.
    # https://github.com/googleapis/python-bigquery/issues/933
    # There is no pyarrow 1.0.0 package for Python 3.9.
    oldest_python = UNIT_TEST_PYTHON_VERSIONS[0]
    if session.python == oldest_python:
        session.install("pyarrow==1.0.0")

    default(session, install_extras=False)


@@ -19,6 +19,6 @@ proto-plus==1.10.0
protobuf==3.12.0
pyarrow==3.0.0
requests==2.18.0
shapely==1.6.0
Shapely==1.6.0
six==1.13.0
tqdm==4.7.4
@@ -31,10 +31,6 @@
import geopandas
except (ImportError, AttributeError): # pragma: NO COVER
geopandas = None
try:
import pyarrow
except (ImportError, AttributeError): # pragma: NO COVER
pyarrow = None
try:
from google.cloud import bigquery_storage
except (ImportError, AttributeError): # pragma: NO COVER
@@ -44,11 +40,15 @@
except (ImportError, AttributeError): # pragma: NO COVER
tqdm = None

from google.cloud.bigquery import _helpers
from .helpers import _make_client
from .helpers import _make_connection
from .helpers import _make_job_resource


pyarrow = _helpers.PYARROW_VERSIONS.try_import()


@pytest.fixture
def table_read_options_kwarg():
# Create a BigQuery Storage table read options object with pyarrow compression
@@ -24,9 +24,20 @@
except ImportError: # pragma: NO COVER
bigquery_storage = None

try:
import pyarrow
except ImportError: # pragma: NO COVER
pyarrow = None


@unittest.skipIf(bigquery_storage is None, "Requires `google-cloud-bigquery-storage`")
class TestBQStorageVersions(unittest.TestCase):
def tearDown(self):
from google.cloud.bigquery import _helpers

# Reset any cached versions since it may not match reality.
_helpers.BQ_STORAGE_VERSIONS._installed_version = None

def _object_under_test(self):
from google.cloud.bigquery import _helpers

@@ -89,6 +100,63 @@ def test_is_read_session_optional_false(self):
assert not versions.is_read_session_optional


@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
class TestPyarrowVersions(unittest.TestCase):
    """Tests for the ``_helpers.PyarrowVersions`` runtime version guard."""

    def tearDown(self):
        from google.cloud.bigquery import _helpers

        # Reset any cached versions since it may not match reality.
        _helpers.PYARROW_VERSIONS._installed_version = None

    def _object_under_test(self):
        from google.cloud.bigquery import _helpers

        return _helpers.PyarrowVersions()

    def _call_try_import(self, **kwargs):
        from google.cloud.bigquery import _helpers

        # Clear the cache so the patched pyarrow version is re-read.
        _helpers.PYARROW_VERSIONS._installed_version = None
        return _helpers.PYARROW_VERSIONS.try_import(**kwargs)

    def test_try_import_raises_no_error_w_recent_pyarrow(self):
        from google.cloud.bigquery.exceptions import LegacyPyarrowError

        with mock.patch("pyarrow.__version__", new="5.0.0"):
            try:
                module = self._call_try_import(raise_if_error=True)
            except LegacyPyarrowError:  # pragma: NO COVER
                self.fail("Legacy error raised with a non-legacy dependency version.")
            else:
                self.assertIsNotNone(module)

    def test_try_import_returns_none_w_legacy_pyarrow(self):
        with mock.patch("pyarrow.__version__", new="2.0.0"):
            self.assertIsNone(self._call_try_import())

    def test_try_import_raises_error_w_legacy_pyarrow(self):
        from google.cloud.bigquery.exceptions import LegacyPyarrowError

        with mock.patch("pyarrow.__version__", new="2.0.0"):
            with self.assertRaises(LegacyPyarrowError):
                self._call_try_import(raise_if_error=True)

    def test_installed_version_returns_cached(self):
        versions = self._object_under_test()
        sentinel = object()
        versions._installed_version = sentinel
        self.assertIs(versions.installed_version, sentinel)

    def test_installed_version_returns_parsed_version(self):
        versions = self._object_under_test()

        with mock.patch("pyarrow.__version__", new="1.2.3"):
            parsed = versions.installed_version

        self.assertEqual((parsed.major, parsed.minor, parsed.micro), (1, 2, 3))


class Test_not_null(unittest.TestCase):
def _call_fut(self, value, field):
from google.cloud.bigquery._helpers import _not_null
@@ -29,13 +29,6 @@
import pandas.testing
except ImportError: # pragma: NO COVER
pandas = None
try:
import pyarrow
import pyarrow.types
except ImportError: # pragma: NO COVER
# Mock out pyarrow when missing, because methods from pyarrow.types are
# used in test parameterization.
pyarrow = mock.Mock()
try:
import geopandas
except ImportError: # pragma: NO COVER
@@ -44,9 +37,19 @@
import pytest

from google import api_core
from google.cloud.bigquery import exceptions
from google.cloud.bigquery import _helpers
from google.cloud.bigquery import schema


pyarrow = _helpers.PYARROW_VERSIONS.try_import()
if pyarrow:
import pyarrow.types
else: # pragma: NO COVER
# Mock out pyarrow when missing, because methods from pyarrow.types are
# used in test parameterization.
pyarrow = mock.Mock()

try:
from google.cloud import bigquery_storage

@@ -1120,15 +1123,19 @@ def test_dataframe_to_arrow_dict_sequence_schema(module_under_test):

@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
def test_dataframe_to_parquet_without_pyarrow(module_under_test, monkeypatch):
    """``dataframe_to_parquet`` surfaces ``LegacyPyarrowError`` when the
    pyarrow import guard fails (pyarrow missing or too old).
    """
    # Simulate try_import() rejecting the installed pyarrow.
    mock_pyarrow_import = mock.Mock()
    mock_pyarrow_import.side_effect = exceptions.LegacyPyarrowError(
        "pyarrow not installed"
    )
    monkeypatch.setattr(_helpers.PYARROW_VERSIONS, "try_import", mock_pyarrow_import)

    with pytest.raises(exceptions.LegacyPyarrowError):
        module_under_test.dataframe_to_parquet(pandas.DataFrame(), (), None)


@pytest.mark.skipif(pandas is None, reason="Requires `pandas`")
@pytest.mark.skipif(isinstance(pyarrow, mock.Mock), reason="Requires `pyarrow`")
def test_dataframe_to_parquet_w_extra_fields(module_under_test, monkeypatch):
def test_dataframe_to_parquet_w_extra_fields(module_under_test):
with pytest.raises(ValueError) as exc_context:
module_under_test.dataframe_to_parquet(
pandas.DataFrame(), (schema.SchemaField("not_in_df", "STRING"),), None
@@ -45,18 +45,18 @@
except (ImportError, AttributeError): # pragma: NO COVER
geopandas = None

try:
import pyarrow
import pyarrow.types
except ImportError: # pragma: NO COVER
pyarrow = None

try:
from tqdm import tqdm
except (ImportError, AttributeError): # pragma: NO COVER
tqdm = None

from google.cloud.bigquery.dataset import DatasetReference
from google.cloud.bigquery import _helpers


pyarrow = _helpers.PYARROW_VERSIONS.try_import()
if pyarrow:
import pyarrow.types


def _mock_client():

0 comments on commit b289076

Please sign in to comment.