CLN: Use to_dataframe to download query results.
This allows us to remove logic for parsing the schema and align with
google-cloud-bigquery.
tswast committed Jan 25, 2019
1 parent b0254c4 commit bd39414
Showing 4 changed files with 37 additions and 34 deletions.
16 changes: 16 additions & 0 deletions benchmark/README.md
@@ -0,0 +1,16 @@
# pandas-gbq benchmarks

This directory contains a few scripts that are useful for performance
testing the pandas-gbq library. Use cProfile to time a script and see
details about where the time is spent. To avoid also measuring how long
BigQuery takes to execute the query, run each benchmark twice: the first run
populates BigQuery's query cache, so the second run times only the download.

## `read_gbq`

Read a small table (a few KB).

    python -m cProfile --sort=cumtime read_gbq_small_results.py

Read a large-ish table (100+ MB).

    python -m cProfile --sort=cumtime read_gbq_large_results.py
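
To keep a profile around for later inspection, you can also write the stats
to a file and read them back with the standard-library `pstats` module (the
`.prof` file name here is just an example):

    python -m cProfile -o read_gbq.prof read_gbq_small_results.py
    python -c "import pstats; pstats.Stats('read_gbq.prof').sort_stats('cumtime').print_stats(20)"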
8 changes: 8 additions & 0 deletions benchmark/read_gbq_large_results.py
@@ -0,0 +1,8 @@

import pandas_gbq

# Select 163 MB worth of data, to time how long it takes to download large
# result sets.
df = pandas_gbq.read_gbq(
"SELECT * FROM `bigquery-public-data.usa_names.usa_1910_2013`",
dialect="standard")
7 changes: 7 additions & 0 deletions benchmark/read_gbq_small_results.py
@@ -0,0 +1,7 @@

import pandas_gbq

# Select a few KB worth of data, to time downloading small result sets.
df = pandas_gbq.read_gbq(
"SELECT * FROM `bigquery-public-data.utility_us.country_code_iso`",
dialect="standard")
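
The `pandas_gbq/gbq.py` changes below delegate DataFrame construction to
google-cloud-bigquery. As a minimal standalone sketch of that underlying
pattern (assuming default application credentials; "my-project" is a
placeholder project ID):

    from google.cloud import bigquery

    # "my-project" is a placeholder; credentials come from the environment.
    client = bigquery.Client(project="my-project")
    query_job = client.query(
        "SELECT name, number"
        " FROM `bigquery-public-data.usa_names.usa_1910_2013` LIMIT 10"
    )
    rows_iter = query_job.result()  # a RowIterator, as in run_query below
    df = rows_iter.to_dataframe()   # the same call the new run_query uses

pandas-gbq keeps its own credential handling and query configuration on top
of this, but the row download itself becomes a single `to_dataframe()` call.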
40 changes: 6 additions & 34 deletions pandas_gbq/gbq.py
@@ -1,12 +1,9 @@
import logging
import os
import time
import warnings
from collections import OrderedDict
from datetime import datetime

import numpy as np
from pandas import DataFrame

from pandas_gbq.exceptions import AccessDenied

@@ -69,7 +66,7 @@ def _check_google_client_version():
def _test_google_api_imports():

try:
import pydata_google_auth
import pydata_google_auth # noqa
except ImportError as ex:
raise ImportError(
"pandas-gbq requires pydata-google-auth: {0}".format(ex)
@@ -483,15 +480,9 @@ def run_query(self, query, **kwargs):
rows_iter = query_reply.result()
except self.http_error as ex:
self.process_http_error(ex)
result_rows = list(rows_iter)
total_rows = rows_iter.total_rows
schema = {
"fields": [field.to_api_repr() for field in rows_iter.schema]
}

logger.debug("Got {} rows.\n".format(total_rows))

return schema, result_rows
df = rows_iter.to_dataframe()
logger.debug("Got {} rows.\n".format(rows_iter.total_rows))
return df

def load_data(
self,
@@ -662,25 +653,6 @@ def _parse_schema(schema_fields):
yield name, dtype


def _parse_data(schema, rows):

column_dtypes = OrderedDict(_parse_schema(schema["fields"]))
df = DataFrame(data=(iter(r) for r in rows), columns=column_dtypes.keys())

for column in df:
dtype = column_dtypes[column]
null_safe = (
df[column].notnull().all()
or dtype == float
or dtype == "datetime64[ns]"
)
if dtype and null_safe:
df[column] = df[column].astype(
column_dtypes[column], errors="ignore"
)
return df


def read_gbq(
query,
project_id=None,
@@ -833,8 +805,8 @@ def read_gbq(
credentials=credentials,
private_key=private_key,
)
schema, rows = connector.run_query(query, configuration=configuration)
final_df = _parse_data(schema, rows)

final_df = connector.run_query(query, configuration=configuration)

# Reindex the DataFrame on the provided column
if index_col is not None:
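
For reference, the `index_col` reindexing above can be exercised directly
through the public API. A small usage sketch, reusing the table from the
benchmark scripts (the column names are assumptions about that public
dataset):

    import pandas_gbq

    df = pandas_gbq.read_gbq(
        "SELECT country_name, alpha_2_code"
        " FROM `bigquery-public-data.utility_us.country_code_iso`",
        index_col="alpha_2_code",
        dialect="standard",
    )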
