Skip to content
Permalink
Browse files
fix: converting to dataframe with out-of-bounds timestamps (#209)
Fixes #168.

This PR fixes a problem with converting query results to Pandas via `pyarrow` when the data contains timestamps that fall outside the bounds of `pyarrow`'s nanosecond precision.

The fix requires `pyarrow>=1.0.0`, thus it only works on Python 3.

### PR checklist
- [x] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code!  That way we can discuss the change, evaluate designs, and agree on the general idea
- [x] Ensure the tests and linter pass
- [x] Code coverage does not decrease (if any source code was changed)
- [x] Appropriate docs were updated (if necessary)
  • Loading branch information
plamut committed Aug 15, 2020
1 parent 478597a commit 8209203e967f0624ad306166c0af6f6f1027c550
Showing with 96 additions and 2 deletions.
  1. +30 −1 google/cloud/bigquery/table.py
  2. +3 −1 setup.py
  3. +63 −0 tests/unit/test_table.py
@@ -21,6 +21,7 @@
import functools
import logging
import operator
import pytz
import warnings

import six
@@ -1726,7 +1727,35 @@ def to_dataframe(
bqstorage_client=bqstorage_client,
create_bqstorage_client=create_bqstorage_client,
)
df = record_batch.to_pandas(date_as_object=date_as_object)

# When converting timestamp values to nanosecond precision, the result
# can be out of pyarrow bounds. To avoid the error when converting to
# Pandas, we set the timestamp_as_object parameter to True, if necessary.
#
# NOTE: Python 3+ only, as timestamp_as_object parameter is only supported
# in pyarrow>=1.0, but the latter is not compatible with Python 2.
if six.PY2:
extra_kwargs = {}
else:
types_to_check = {
pyarrow.timestamp("us"),
pyarrow.timestamp("us", tz=pytz.UTC),
}

for column in record_batch:
if column.type in types_to_check:
try:
column.cast("timestamp[ns]")
except pyarrow.lib.ArrowInvalid:
timestamp_as_object = True
break
else:
timestamp_as_object = False

extra_kwargs = {"timestamp_as_object": timestamp_as_object}

df = record_batch.to_pandas(date_as_object=date_as_object, **extra_kwargs)

for column in dtypes:
df[column] = pandas.Series(df[column], dtype=dtypes[column])
return df
@@ -48,7 +48,9 @@
"pandas": ["pandas>=0.17.1"],
# Exclude PyArrow dependency from Windows Python 2.7.
'pyarrow: platform_system != "Windows" or python_version >= "3.5"': [
"pyarrow>=0.17.0"
"pyarrow>=1.0.0, <2.0dev; python_version>='3.4'",
# Pyarrow >= 0.17.0 is not compatible with Python 2 anymore.
"pyarrow < 0.17.0; python_version < '3.0'",
],
"tqdm": ["tqdm >= 4.0.0, <5.0.0dev"],
"fastparquet": [
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import datetime as dt
import itertools
import logging
import time
@@ -2271,6 +2272,68 @@ def test_to_dataframe(self):
self.assertEqual(df.name.dtype.name, "object")
self.assertEqual(df.age.dtype.name, "int64")

@pytest.mark.xfail(
    six.PY2,
    reason=(
        "Requires pyarrow>=1.0 to work, but the latter is not compatible "
        "with Python 2 anymore."
    ),
)
@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_to_dataframe_timestamp_out_of_pyarrow_bounds(self):
    """to_dataframe() must not raise when TIMESTAMP values fall outside
    pyarrow's nanosecond-precision range; the column is returned as
    Python ``datetime`` objects instead.
    """
    from google.cloud.bigquery.schema import SchemaField

    schema = [SchemaField("some_timestamp", "TIMESTAMP")]
    rows = [
        {"f": [{"v": "81953424000.0"}]},  # 4567-01-01 00:00:00 UTC
        {"f": [{"v": "253402214400.0"}]},  # 9999-12-31 00:00:00 UTC
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 2)  # verify the number of rows
    self.assertEqual(list(df.columns), ["some_timestamp"])
    self.assertEqual(
        list(df["some_timestamp"]),
        [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
    )

@pytest.mark.xfail(
    six.PY2,
    reason=(
        "Requires pyarrow>=1.0 to work, but the latter is not compatible "
        "with Python 2 anymore."
    ),
)
@unittest.skipIf(pandas is None, "Requires `pandas`")
@unittest.skipIf(pyarrow is None, "Requires `pyarrow`")
def test_to_dataframe_datetime_out_of_pyarrow_bounds(self):
    """to_dataframe() must not raise when DATETIME values fall outside
    pyarrow's nanosecond-precision range; the column is returned as
    Python ``datetime`` objects instead.
    """
    from google.cloud.bigquery.schema import SchemaField

    schema = [SchemaField("some_datetime", "DATETIME")]
    rows = [
        {"f": [{"v": "4567-01-01T00:00:00"}]},
        {"f": [{"v": "9999-12-31T00:00:00"}]},
    ]
    path = "/foo"
    api_request = mock.Mock(return_value={"rows": rows})
    row_iterator = self._make_one(_mock_client(), api_request, path, schema)

    df = row_iterator.to_dataframe(create_bqstorage_client=False)

    self.assertIsInstance(df, pandas.DataFrame)
    self.assertEqual(len(df), 2)  # verify the number of rows
    self.assertEqual(list(df.columns), ["some_datetime"])
    self.assertEqual(
        list(df["some_datetime"]),
        [dt.datetime(4567, 1, 1), dt.datetime(9999, 12, 31)],
    )

@unittest.skipIf(pandas is None, "Requires `pandas`")
def test_to_dataframe_warning_wo_pyarrow(self):
from google.cloud.bigquery.client import PyarrowMissingWarning

0 comments on commit 8209203

Please sign in to comment.