Skip to content

Commit

Permalink
fix: add pyarrow version check for range support (#1914)
Browse files Browse the repository at this point in the history
* fix: add pyarrow version check for range support

* add comment why we are making a separate constant

---------

Co-authored-by: Tim Sweña (Swast) <swast@google.com>
  • Loading branch information
Linchin and tswast committed May 9, 2024
1 parent 74e75e8 commit a86d7b9
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 33 deletions.
14 changes: 14 additions & 0 deletions google/cloud/bigquery/_versions_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
_BQ_STORAGE_OPTIONAL_READ_SESSION_VERSION = packaging.version.Version("2.6.0")
_MIN_PANDAS_VERSION = packaging.version.Version("1.1.0")

# Minimum pandas and pyarrow versions that SUPPORTS_RANGE_PYARROW (defined
# at the bottom of this module) requires before RANGE columns are
# represented with pandas.ArrowDtype.
_MIN_PANDAS_VERSION_RANGE = packaging.version.Version("1.5.0")
_MIN_PYARROW_VERSION_RANGE = packaging.version.Version("10.0.1")


class PyarrowVersions:
"""Version comparisons for pyarrow package."""
Expand Down Expand Up @@ -234,3 +237,14 @@ def try_import(self, raise_if_error: bool = False) -> Any:


PANDAS_VERSIONS = PandasVersions()

# RANGE support depends on minimum versions of *both* pandas and pyarrow,
# so the flag lives at module level rather than as a property of either
# PANDAS_VERSIONS or PYARROW_VERSIONS.
#
# The checks short-circuit in order: a package's installed version is only
# compared after its try_import() call has returned a non-None value.
SUPPORTS_RANGE_PYARROW = not (
    PANDAS_VERSIONS.try_import() is None
    or PANDAS_VERSIONS.installed_version < _MIN_PANDAS_VERSION_RANGE
    or PYARROW_VERSIONS.try_import() is None
    or PYARROW_VERSIONS.installed_version < _MIN_PYARROW_VERSION_RANGE
)
48 changes: 15 additions & 33 deletions google/cloud/bigquery/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@
"because the necessary `__from_arrow__` attribute is missing."
)

_RANGE_PYARROW_WARNING = (
"Unable to represent RANGE schema as struct using pandas ArrowDtype. Using "
"`object` instead. To use ArrowDtype, use pandas >= 1.5 and "
"pyarrow >= 10.0.1."
)

# Fraction of the total rows that needs to be downloaded already for us to
# skip calling the BQ Storage API.
ALMOST_COMPLETELY_CACHED_RATIO = 0.333
Expand Down Expand Up @@ -2279,26 +2285,18 @@ def to_dataframe(
time_dtype = db_dtypes.TimeDtype()

if range_date_dtype is DefaultPandasDTypes.RANGE_DATE_DTYPE:
try:
if _versions_helpers.SUPPORTS_RANGE_PYARROW:
range_date_dtype = pandas.ArrowDtype(
pyarrow.struct(
[("start", pyarrow.date32()), ("end", pyarrow.date32())]
)
)
except AttributeError:
# pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
# only supports up to pandas 1.3. If pandas.ArrowDtype is not
# present, we raise a warning and set range_date_dtype to None.
msg = (
"Unable to find class ArrowDtype in pandas, setting "
"range_date_dtype to be None. To use ArrowDtype, please "
"use pandas >= 1.5 and python >= 3.8."
)
warnings.warn(msg)
else:
warnings.warn(_RANGE_PYARROW_WARNING)
range_date_dtype = None

if range_datetime_dtype is DefaultPandasDTypes.RANGE_DATETIME_DTYPE:
try:
if _versions_helpers.SUPPORTS_RANGE_PYARROW:
range_datetime_dtype = pandas.ArrowDtype(
pyarrow.struct(
[
Expand All @@ -2307,20 +2305,12 @@ def to_dataframe(
]
)
)
except AttributeError:
# pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
# only supports up to pandas 1.3. If pandas.ArrowDtype is not
# present, we raise a warning and set range_datetime_dtype to None.
msg = (
"Unable to find class ArrowDtype in pandas, setting "
"range_datetime_dtype to be None. To use ArrowDtype, "
"please use pandas >= 1.5 and python >= 3.8."
)
warnings.warn(msg)
else:
warnings.warn(_RANGE_PYARROW_WARNING)
range_datetime_dtype = None

if range_timestamp_dtype is DefaultPandasDTypes.RANGE_TIMESTAMP_DTYPE:
try:
if _versions_helpers.SUPPORTS_RANGE_PYARROW:
range_timestamp_dtype = pandas.ArrowDtype(
pyarrow.struct(
[
Expand All @@ -2329,16 +2319,8 @@ def to_dataframe(
]
)
)
except AttributeError:
# pandas.ArrowDtype was introduced in pandas 1.5, but python 3.7
# only supports up to pandas 1.3. If pandas.ArrowDtype is not
# present, we raise a warning and set range_timestamp_dtype to None.
msg = (
"Unable to find class ArrowDtype in pandas, setting "
"range_timestamp_dtype to be None. To use ArrowDtype, "
"please use pandas >= 1.5 and python >= 3.8."
)
warnings.warn(msg)
else:
warnings.warn(_RANGE_PYARROW_WARNING)
range_timestamp_dtype = None

if bool_dtype is not None and not hasattr(bool_dtype, "__from_arrow__"):
Expand Down

0 comments on commit a86d7b9

Please sign in to comment.