From bd39414f79c770c3b661a99eed63f7712c895706 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 25 Jan 2019 15:15:39 -0800
Subject: [PATCH] CLN: Use `to_dataframe` to download query results.

This allows us to remove logic for parsing the schema and align with
google-cloud-bigquery.
---
 benchmark/README.md                 | 16 ++++++++++++
 benchmark/read_gbq_large_results.py |  8 ++++++
 benchmark/read_gbq_small_results.py |  7 +++++
 pandas_gbq/gbq.py                   | 40 +++++------------------------
 4 files changed, 37 insertions(+), 34 deletions(-)
 create mode 100644 benchmark/README.md
 create mode 100644 benchmark/read_gbq_large_results.py
 create mode 100644 benchmark/read_gbq_small_results.py

diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..5ede71d7
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
diff --git a/benchmark/read_gbq_large_results.py b/benchmark/read_gbq_large_results.py
new file mode 100644
index 00000000..5a8bf268
--- /dev/null
+++ b/benchmark/read_gbq_large_results.py
@@ -0,0 +1,8 @@
+
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard")
diff --git a/benchmark/read_gbq_small_results.py b/benchmark/read_gbq_small_results.py
new file mode 100644
index 00000000..cfff10b3
--- /dev/null
+++ b/benchmark/read_gbq_small_results.py
@@ -0,0 +1,7 @@
+
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard")
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index d35eba05..6b22ade9 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1,12 +1,9 @@
 import logging
-import os
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -69,7 +66,7 @@ def _check_google_client_version():
 def _test_google_api_imports():
     try:
-        import pydata_google_auth
+        import pydata_google_auth  # noqa
     except ImportError as ex:
         raise ImportError(
             "pandas-gbq requires pydata-google-auth: {0}".format(ex)
         )
@@ -483,15 +480,9 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
-
-        logger.debug("Got {} rows.\n".format(total_rows))
-
-        return schema, result_rows
+        df = rows_iter.to_dataframe()
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -662,25 +653,6 @@ def _parse_schema(schema_fields):
             yield name, dtype
 
 
-def _parse_data(schema, rows):
-
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
-
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
-        )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
-    return df
-
-
 def read_gbq(
     query,
     project_id=None,
@@ -833,8 +805,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
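
Note on the new download path: with this change, `run_query` returns the DataFrame
built by google-cloud-bigquery's `RowIterator.to_dataframe()`, so schema parsing and
dtype mapping are delegated to that library instead of the removed `_parse_data`
helper. The snippet below is a minimal sketch of that call sequence against the
google-cloud-bigquery client directly; it assumes google-cloud-bigquery >= 1.9.0 is
installed and default credentials are configured, and it reuses one of the public
tables from the benchmark scripts purely as an illustration.

    # Sketch of the download path this patch delegates to (not part of the diff).
    # Assumes google-cloud-bigquery>=1.9.0 and working default credentials.
    from google.cloud import bigquery

    client = bigquery.Client()
    query_job = client.query(
        "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`"
    )
    rows_iter = query_job.result()  # wait for the job; returns a RowIterator
    df = rows_iter.to_dataframe()   # download rows, map BigQuery types to dtypes
    print(rows_iter.total_rows, "rows downloaded")
    print(df.dtypes)

Delegating the conversion means pandas-gbq no longer maintains its own
schema-to-dtype mapping for query results and stays aligned with whatever
google-cloud-bigquery does for type handling.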