From 433391097bae57dd12a93db18fc2bab573d8f128 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Wed, 30 Mar 2022 14:20:16 -0500 Subject: [PATCH] fix(deps): raise exception when pandas is installed but db-dtypes is not (#1191) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `db-dtypes` is already present in the `pandas` "extras", but this PR ensures that if `pandas` is installed and `db-dtypes` is not, a more understandable error message (a `ValueError` asking the user to install `db-dtypes`) is raised. ``` google/cloud/bigquery/_pandas_helpers.py:991: ValueError ____________________________________ test_list_rows_nullable_scalars_extreme_dtypes[10] _____________________________________ # Copyright 2019 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
"""Shared helper functions for connecting BigQuery and pandas.""" import concurrent.futures from datetime import datetime import functools from itertools import islice import logging import queue import warnings try: import pandas # type: ignore pandas_import_exception = None except ImportError as exc: # pragma: NO COVER pandas = None pandas_import_exception = exc else: import numpy try: > import db_dtypes # type: ignore E ModuleNotFoundError: No module named 'db_dtypes' google/cloud/bigquery/_pandas_helpers.py:36: ModuleNotFoundError The above exception was the direct cause of the following exception: bigquery_client = scalars_extreme_table = 'swast-scratch.python_bigquery_tests_system_20220330160830_ffff89.scalars_extreme_jsonl0x3ffeb' max_results = 10 @pytest.mark.parametrize( ("max_results",), ( (None,), (10,), ), # Use BQ Storage API. # Use REST API. ) def test_list_rows_nullable_scalars_extreme_dtypes( bigquery_client, scalars_extreme_table, max_results ): # TODO(GH#836): Avoid INTERVAL columns until they are supported by the # BigQuery Storage API and pyarrow. 
schema = [ bigquery.SchemaField("bool_col", enums.SqlTypeNames.BOOLEAN), bigquery.SchemaField("bignumeric_col", enums.SqlTypeNames.BIGNUMERIC), bigquery.SchemaField("bytes_col", enums.SqlTypeNames.BYTES), bigquery.SchemaField("date_col", enums.SqlTypeNames.DATE), bigquery.SchemaField("datetime_col", enums.SqlTypeNames.DATETIME), bigquery.SchemaField("float64_col", enums.SqlTypeNames.FLOAT64), bigquery.SchemaField("geography_col", enums.SqlTypeNames.GEOGRAPHY), bigquery.SchemaField("int64_col", enums.SqlTypeNames.INT64), bigquery.SchemaField("numeric_col", enums.SqlTypeNames.NUMERIC), bigquery.SchemaField("string_col", enums.SqlTypeNames.STRING), bigquery.SchemaField("time_col", enums.SqlTypeNames.TIME), bigquery.SchemaField("timestamp_col", enums.SqlTypeNames.TIMESTAMP), ] df = bigquery_client.list_rows( scalars_extreme_table, max_results=max_results, selected_fields=schema, > ).to_dataframe() tests/system/test_pandas.py:1084: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ google/cloud/bigquery/table.py:1925: in to_dataframe _pandas_helpers.verify_pandas_imports() _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ def verify_pandas_imports(): if pandas is None: raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception if db_dtypes is None: > raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception E ValueError: Please install the 'db-dtypes' package to use this function. google/cloud/bigquery/_pandas_helpers.py:991: ValueError ``` Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #1188 🦕 --- google/cloud/bigquery/_pandas_helpers.py | 33 +++++++++++++++++++----- google/cloud/bigquery/table.py | 20 +++++--------- tests/unit/test__pandas_helpers.py | 13 ++++++++++ tests/unit/test_table.py | 8 +++--- 4 files changed, 49 insertions(+), 25 deletions(-) diff --git a/google/cloud/bigquery/_pandas_helpers.py b/google/cloud/bigquery/_pandas_helpers.py index 17de6830a..cc0ee75ff 100644 --- a/google/cloud/bigquery/_pandas_helpers.py +++ b/google/cloud/bigquery/_pandas_helpers.py @@ -24,16 +24,25 @@ try: import pandas # type: ignore -except ImportError: # pragma: NO COVER + + pandas_import_exception = None +except ImportError as exc: # pragma: NO COVER pandas = None - date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype + pandas_import_exception = exc else: import numpy - from db_dtypes import DateDtype, TimeDtype # type: ignore +try: + import db_dtypes # type: ignore + + date_dtype_name = db_dtypes.DateDtype.name + time_dtype_name = db_dtypes.TimeDtype.name + db_dtypes_import_exception = None +except ImportError as exc: # pragma: NO COVER + db_dtypes = None + db_dtypes_import_exception = exc + date_dtype_name = time_dtype_name = "" # Use '' rather than None because pytype - date_dtype_name = DateDtype.name - time_dtype_name = TimeDtype.name import pyarrow # type: ignore import pyarrow.parquet # type: ignore @@ -84,6 +93,9 @@ def _to_wkb(v): _MAX_QUEUE_SIZE_DEFAULT = object() # max queue size sentinel for BQ Storage downloads +_NO_PANDAS_ERROR = "Please install the 'pandas' package to use this function." +_NO_DB_TYPES_ERROR = "Please install the 'db-dtypes' package to use this function." 
+ _PANDAS_DTYPE_TO_BQ = { "bool": "BOOLEAN", "datetime64[ns, UTC]": "TIMESTAMP", @@ -290,13 +302,13 @@ def types_mapper(arrow_data_type): not date_as_object and pyarrow.types.is_date(arrow_data_type) ): - return DateDtype() + return db_dtypes.DateDtype() elif pyarrow.types.is_integer(arrow_data_type): return pandas.Int64Dtype() elif pyarrow.types.is_time(arrow_data_type): - return TimeDtype() + return db_dtypes.TimeDtype() return types_mapper @@ -970,3 +982,10 @@ def dataframe_to_json_generator(dataframe): output[column] = value yield output + + +def verify_pandas_imports(): + if pandas is None: + raise ValueError(_NO_PANDAS_ERROR) from pandas_import_exception + if db_dtypes is None: + raise ValueError(_NO_DB_TYPES_ERROR) from db_dtypes_import_exception diff --git a/google/cloud/bigquery/table.py b/google/cloud/bigquery/table.py index ed4f214ce..5a4de6a01 100644 --- a/google/cloud/bigquery/table.py +++ b/google/cloud/bigquery/table.py @@ -28,8 +28,6 @@ import pandas # type: ignore except ImportError: # pragma: NO COVER pandas = None -else: - import db_dtypes # type: ignore # noqa import pyarrow # type: ignore @@ -69,10 +67,6 @@ from google.cloud.bigquery.dataset import DatasetReference -_NO_PANDAS_ERROR = ( - "The pandas library is not installed, please install " - "pandas to use the to_dataframe() function." -) _NO_GEOPANDAS_ERROR = ( "The geopandas library is not installed, please install " "geopandas to use the to_geodataframe() function." @@ -1818,8 +1812,8 @@ def to_dataframe_iterable( ValueError: If the :mod:`pandas` library cannot be imported. """ - if pandas is None: - raise ValueError(_NO_PANDAS_ERROR) + _pandas_helpers.verify_pandas_imports() + if dtypes is None: dtypes = {} @@ -1928,8 +1922,8 @@ def to_dataframe( :mod:`shapely` library cannot be imported. 
""" - if pandas is None: - raise ValueError(_NO_PANDAS_ERROR) + _pandas_helpers.verify_pandas_imports() + if geography_as_object and shapely is None: raise ValueError(_NO_SHAPELY_ERROR) @@ -2181,8 +2175,7 @@ def to_dataframe( Returns: pandas.DataFrame: An empty :class:`~pandas.DataFrame`. """ - if pandas is None: - raise ValueError(_NO_PANDAS_ERROR) + _pandas_helpers.verify_pandas_imports() return pandas.DataFrame() def to_geodataframe( @@ -2238,8 +2231,7 @@ def to_dataframe_iterable( ValueError: If the :mod:`pandas` library cannot be imported. """ - if pandas is None: - raise ValueError(_NO_PANDAS_ERROR) + _pandas_helpers.verify_pandas_imports() return iter((pandas.DataFrame(),)) def to_arrow_iterable( diff --git a/tests/unit/test__pandas_helpers.py b/tests/unit/test__pandas_helpers.py index 5b2fadaf1..1a3f918eb 100644 --- a/tests/unit/test__pandas_helpers.py +++ b/tests/unit/test__pandas_helpers.py @@ -1751,3 +1751,16 @@ def test_bq_to_arrow_field_metadata(module_under_test, field_type, metadata): ).metadata == metadata ) + + +def test_verify_pandas_imports_no_pandas(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "pandas", None) + with pytest.raises(ValueError, match="Please install the 'pandas' package"): + module_under_test.verify_pandas_imports() + + +@pytest.mark.skipif(pandas is None, reason="Requires `pandas`") +def test_verify_pandas_imports_no_db_dtypes(module_under_test, monkeypatch): + monkeypatch.setattr(module_under_test, "db_dtypes", None) + with pytest.raises(ValueError, match="Please install the 'db-dtypes' package"): + module_under_test.verify_pandas_imports() diff --git a/tests/unit/test_table.py b/tests/unit/test_table.py index 5241230a4..66bc1d3db 100644 --- a/tests/unit/test_table.py +++ b/tests/unit/test_table.py @@ -1836,7 +1836,7 @@ def test_to_arrow_iterable(self): self.assertEqual(record_batch.num_rows, 0) self.assertEqual(record_batch.num_columns, 0) - @mock.patch("google.cloud.bigquery.table.pandas", new=None) 
+ @mock.patch("google.cloud.bigquery._pandas_helpers.pandas", new=None) def test_to_dataframe_error_if_pandas_is_none(self): row_iterator = self._make_one() with self.assertRaises(ValueError): @@ -1849,7 +1849,7 @@ def test_to_dataframe(self): self.assertIsInstance(df, pandas.DataFrame) self.assertEqual(len(df), 0) # verify the number of rows - @mock.patch("google.cloud.bigquery.table.pandas", new=None) + @mock.patch("google.cloud.bigquery._pandas_helpers.pandas", new=None) def test_to_dataframe_iterable_error_if_pandas_is_none(self): row_iterator = self._make_one() with self.assertRaises(ValueError): @@ -2967,7 +2967,7 @@ def test_to_dataframe_iterable_w_bqstorage_max_results_warning(self): assert isinstance(dataframes[0], pandas.DataFrame) assert isinstance(dataframes[1], pandas.DataFrame) - @mock.patch("google.cloud.bigquery.table.pandas", new=None) + @mock.patch("google.cloud.bigquery._pandas_helpers.pandas", new=None) def test_to_dataframe_iterable_error_if_pandas_is_none(self): from google.cloud.bigquery.schema import SchemaField @@ -3339,7 +3339,7 @@ def test_to_dataframe_datetime_objects(self): self.assertEqual(df["ts"][0].date(), datetime.date(1336, 3, 23)) self.assertEqual(df["date"][0], datetime.date(1111, 1, 1)) - @mock.patch("google.cloud.bigquery.table.pandas", new=None) + @mock.patch("google.cloud.bigquery._pandas_helpers.pandas", new=None) def test_to_dataframe_error_if_pandas_is_none(self): from google.cloud.bigquery.schema import SchemaField