From bd39414f79c770c3b661a99eed63f7712c895706 Mon Sep 17 00:00:00 2001
From: Tim Swast
Date: Fri, 25 Jan 2019 15:15:39 -0800
Subject: [PATCH] CLN: Use `to_dataframe` to download query results.

This allows us to remove logic for parsing the schema and align with
google-cloud-bigquery.
---
 benchmark/README.md                 | 16 ++++++++++++
 benchmark/read_gbq_large_results.py |  8 ++++++
 benchmark/read_gbq_small_results.py |  7 +++++
 pandas_gbq/gbq.py                   | 40 +++++------------------------
 4 files changed, 37 insertions(+), 34 deletions(-)
 create mode 100644 benchmark/README.md
 create mode 100644 benchmark/read_gbq_large_results.py
 create mode 100644 benchmark/read_gbq_small_results.py

diff --git a/benchmark/README.md b/benchmark/README.md
new file mode 100644
index 00000000..5ede71d7
--- /dev/null
+++ b/benchmark/README.md
@@ -0,0 +1,16 @@
+# pandas-gbq benchmarks
+
+This directory contains a few scripts which are useful for performance
+testing the pandas-gbq library. Use cProfile to time the script and see
+details about where time is spent. To avoid timing how long BigQuery takes to
+execute a query, run the benchmark twice to ensure the results are cached.
+
+## `read_gbq`
+
+Read a small table (a few KB).
+
+    python -m cProfile --sort=cumtime read_gbq_small_results.py
+
+Read a large-ish table (100+ MB).
+
+    python -m cProfile --sort=cumtime read_gbq_large_results.py
diff --git a/benchmark/read_gbq_large_results.py b/benchmark/read_gbq_large_results.py
new file mode 100644
index 00000000..5a8bf268
--- /dev/null
+++ b/benchmark/read_gbq_large_results.py
@@ -0,0 +1,8 @@
+
+import pandas_gbq
+
+# Select 163 MB worth of data, to time how long it takes to download large
+# result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
+    dialect="standard")
diff --git a/benchmark/read_gbq_small_results.py b/benchmark/read_gbq_small_results.py
new file mode 100644
index 00000000..cfff10b3
--- /dev/null
+++ b/benchmark/read_gbq_small_results.py
@@ -0,0 +1,7 @@
+
+import pandas_gbq
+
+# Select a few KB worth of data, to time downloading small result sets.
+df = pandas_gbq.read_gbq(
+    "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
+    dialect="standard")
diff --git a/pandas_gbq/gbq.py b/pandas_gbq/gbq.py
index d35eba05..6b22ade9 100644
--- a/pandas_gbq/gbq.py
+++ b/pandas_gbq/gbq.py
@@ -1,12 +1,9 @@
 import logging
-import os
 import time
 import warnings
-from collections import OrderedDict
 from datetime import datetime
 
 import numpy as np
-from pandas import DataFrame
 
 from pandas_gbq.exceptions import AccessDenied
 
@@ -69,7 +66,7 @@ def _check_google_client_version():
 def _test_google_api_imports():
     try:
-        import pydata_google_auth
+        import pydata_google_auth  # noqa
     except ImportError as ex:
         raise ImportError(
             "pandas-gbq requires pydata-google-auth: {0}".format(ex)
         )
@@ -483,15 +480,9 @@ def run_query(self, query, **kwargs):
             rows_iter = query_reply.result()
         except self.http_error as ex:
             self.process_http_error(ex)
-        result_rows = list(rows_iter)
-        total_rows = rows_iter.total_rows
-        schema = {
-            "fields": [field.to_api_repr() for field in rows_iter.schema]
-        }
-
-        logger.debug("Got {} rows.\n".format(total_rows))
-
-        return schema, result_rows
+        df = rows_iter.to_dataframe()
+        logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
+        return df
 
     def load_data(
         self,
@@ -662,25 +653,6 @@ def _parse_schema(schema_fields):
             yield name, dtype
 
 
-def _parse_data(schema, rows):
-
-    column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
-    df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())
-
-    for column in df:
-        dtype = column_dtypes[column]
-        null_safe = (
-            df[column].notnull().all()
-            or dtype == float
-            or dtype == "datetime64[ns]"
-        )
-        if dtype and null_safe:
-            df[column] = df[column].astype(
-                column_dtypes[column], errors="ignore"
-            )
-    return df
-
-
 def read_gbq(
     query,
     project_id=None,
@@ -833,8 +805,8 @@ def read_gbq(
         credentials=credentials,
         private_key=private_key,
     )
-    schema, rows = connector.run_query(query, configuration=configuration)
-    final_df = _parse_data(schema, rows)
+
+    final_df = connector.run_query(query, configuration=configuration)
 
     # Reindex the DataFrame on the provided column
     if index_col is not None:
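
Note on the new download path: with this change, `run_query` returns the DataFrame
built by google-cloud-bigquery's `RowIterator.to_dataframe()`, so schema parsing and
dtype mapping are delegated to that library instead of the removed `_parse_data`
helper. The snippet below is a minimal sketch of that call sequence against the
google-cloud-bigquery client directly; it assumes google-cloud-bigquery >= 1.9.0 is
installed and default credentials are configured, and it reuses one of the public
tables from the benchmark scripts purely as an illustration.

    # Sketch of the download path this patch delegates to (not part of the diff).
    # Assumes google-cloud-bigquery>=1.9.0 and working default credentials.
    from google.cloud import bigquery

    client = bigquery.Client()
    query_job = client.query(
        "SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`"
    )
    rows_iter = query_job.result()  # wait for the job; returns a RowIterator
    df = rows_iter.to_dataframe()   # download rows, map BigQuery types to dtypes
    print(rows_iter.total_rows, "rows downloaded")
    print(df.dtypes)

Delegating the conversion means pandas-gbq no longer maintains its own
schema-to-dtype mapping for query results and stays aligned with whatever
google-cloud-bigquery does for type handling.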