Skip to content

Commit ac771c1

Browse files
authored
feat: add pandas_gbq.sample (#983)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-pandas/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes #<issue_number_goes_here> 🦕
1 parent 0e94692 commit ac771c1

File tree

14 files changed

+1035
-187
lines changed

14 files changed

+1035
-187
lines changed

pandas_gbq/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
# Use of this source code is governed by a BSD-style
33
# license that can be found in the LICENSE file.
44

5+
import logging
56
import warnings
67

78
from pandas_gbq import version as pandas_gbq_version
89
from pandas_gbq.contexts import Context, context
10+
from pandas_gbq.core.sample import sample
911

1012
from . import _versions_helpers
1113
from .gbq import read_gbq, to_gbq # noqa
@@ -21,6 +23,8 @@
2123
FutureWarning,
2224
)
2325

26+
# BUGFIX: use logging.getLogger() rather than instantiating logging.Logger
# directly. getLogger registers the logger in the logging module's hierarchy,
# so levels/handlers configured by users (e.g. on the "pandas_gbq" root) take
# effect; a bare Logger(...) instance is invisible to that configuration.
logger = logging.getLogger(__name__)
27+
2428
__version__ = pandas_gbq_version.__version__
2529

2630
__all__ = [
@@ -29,4 +33,5 @@
2933
"read_gbq",
3034
"Context",
3135
"context",
36+
"sample",
3237
]

pandas_gbq/constants.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# Use of this source code is governed by a BSD-style
33
# license that can be found in the LICENSE file.
44

5+
import google.api_core.exceptions
6+
57
# BigQuery uses powers of 2 in calculating data sizes. See:
68
# https://cloud.google.com/bigquery/pricing#data The documentation uses
79
# GiB rather than GB to disambiguate from the alternative base 10 units.
@@ -10,3 +12,8 @@
1012
BYTES_IN_MIB = 1024 * BYTES_IN_KIB
1113
BYTES_IN_GIB = 1024 * BYTES_IN_MIB
1214
BYTES_TO_RECOMMEND_BIGFRAMES = BYTES_IN_GIB
15+
16+
# Exception classes raised by google-api-core for failed HTTP/API calls.
# Download/upload paths catch this tuple and translate the error into a
# pandas-gbq exception via pandas_gbq.exceptions.translate_exception.
# NOTE(review): ClientError appears to subclass GoogleAPIError in
# google-api-core, which would make the first entry redundant (but
# harmless) — confirm against the installed google-api-core version
# before pruning.
HTTP_ERRORS = (
    google.api_core.exceptions.ClientError,
    google.api_core.exceptions.GoogleAPIError,
)

pandas_gbq/core/read.py

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
# Copyright (c) 2025 pandas-gbq Authors All rights reserved.
2+
# Use of this source code is governed by a BSD-style
3+
# license that can be found in the LICENSE file.
4+
5+
from __future__ import annotations
6+
7+
import typing
8+
from typing import Any, Dict, Optional, Sequence
9+
import warnings
10+
11+
import google.cloud.bigquery
12+
import google.cloud.bigquery.table
13+
import numpy as np
14+
15+
import pandas_gbq
16+
import pandas_gbq.constants
17+
import pandas_gbq.exceptions
18+
import pandas_gbq.features
19+
import pandas_gbq.timestamp
20+
21+
# Only import at module-level at type checking time to avoid circular
22+
# dependencies in the pandas package, which has an optional dependency on
23+
# pandas-gbq.
24+
if typing.TYPE_CHECKING: # pragma: NO COVER
25+
import pandas
26+
27+
28+
def _bqschema_to_nullsafe_dtypes(schema_fields):
    """Map BigQuery schema fields to pandas dtypes that can hold nulls.

    A dtype is emitted only when it can represent missing data; columns
    without an entry fall back to pandas's default dtype inference.

    See: http://pandas.pydata.org/pandas-docs/dev/missing_data.html
    #missing-data-casting-rules-and-indexing
    """
    import db_dtypes

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    nullsafe_map = {
        "FLOAT": np.dtype(float),
        "INTEGER": "Int64",
        "TIME": db_dtypes.TimeDtype(),
        # Types such as 'datetime64[ns]' and db_dtypes.DateDtype() are
        # deliberately absent: the pandas range does not align with the
        # BigQuery range, so conversion is attempted later with a fallback
        # to 'object' for out-of-range values.
    }

    # Newer pandas versions provide a nullable boolean extension dtype.
    if pandas_gbq.features.FEATURES.pandas_has_boolean_dtype:
        nullsafe_map["BOOLEAN"] = "boolean"

    result = {}
    for field in schema_fields:
        column = str(field["name"])
        if field["mode"].upper() == "REPEATED":
            # ARRAY columns arrive as object columns containing lists.
            result[column] = "object"
        else:
            mapped = nullsafe_map.get(field["type"].upper())
            if mapped:
                result[column] = mapped

    return result
69+
70+
71+
def _finalize_dtypes(
    df: pandas.DataFrame, schema_fields: Sequence[Dict[str, Any]]
) -> pandas.DataFrame:
    """
    Cast the remaining columns whose BigQuery type has no exact pandas match.

    For example db_dtypes.DateDtype() and datetime64[ns] cannot represent
    0001-01-01, but they do cover dates within a couple hundred years of
    1970. See:
    https://github.com/googleapis/python-bigquery-pandas/issues/365
    """
    import db_dtypes
    import pandas.api.types

    # If you update this mapping, also update the table at
    # `docs/reading.rst`.
    target_dtypes = {
        "DATE": db_dtypes.DateDtype(),
        "DATETIME": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
    }

    for field in schema_fields:
        # ARRAY/REPEATED columns are left untouched.
        if field["mode"].upper() == "REPEATED":
            continue

        column = str(field["name"])
        wanted = target_dtypes.get(field["type"].upper())

        # Only cast object-dtype columns, which avoids a deprecated
        # conversion to a timezone-naive dtype.
        if wanted and pandas.api.types.is_object_dtype(df[column]):
            df[column] = df[column].astype(wanted, errors="ignore")

    # Make sure any TIMESTAMP columns end up timezone-aware.
    df = pandas_gbq.timestamp.localize_df(df, schema_fields)
    return df
110+
111+
112+
def download_results(
    results: google.cloud.bigquery.table.RowIterator,
    *,
    bqclient: google.cloud.bigquery.Client,
    progress_bar_type: Optional[str],
    warn_on_large_results: bool = True,
    max_results: Optional[int],
    user_dtypes: Optional[dict],
    use_bqstorage_api: bool,
) -> Optional[pandas.DataFrame]:
    """Download the rows behind ``results`` into a DataFrame.

    Args:
        results: Row iterator for the finished query or table read.
        bqclient: Client used to fetch table metadata for the size check.
        progress_bar_type: Passed through to ``RowIterator.to_dataframe``.
        warn_on_large_results: When True, emit a ``LargeResultsWarning`` if
            the backing table is larger than
            ``pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES``.
        max_results: ``0`` skips the download and returns ``None``; any
            other non-None value forces the REST download path (no BigQuery
            Storage API client is created).
        user_dtypes: Caller-supplied dtypes; these override the
            schema-derived null-safe dtypes.
        use_bqstorage_api: Whether to create a BigQuery Storage API client
            for the download (ignored when ``max_results`` is set).

    Returns:
        The downloaded DataFrame, or ``None`` when ``max_results == 0``.

    Raises:
        Exceptions produced by ``pandas_gbq.exceptions.translate_exception``
        when the download fails with one of
        ``pandas_gbq.constants.HTTP_ERRORS``.
    """
    # No results are desired, so don't bother downloading anything.
    if max_results == 0:
        return None

    if user_dtypes is None:
        user_dtypes = {}

    # An explicit max_results forces the REST download path.
    create_bqstorage_client = use_bqstorage_api
    if max_results is not None:
        create_bqstorage_client = False

    # If we're downloading a large table, BigQuery DataFrames might be a
    # better fit. Not all code paths will populate rows_iter._table, but
    # if it's not populated that means we are working with a small result
    # set.
    if (
        warn_on_large_results
        and (table_ref := getattr(results, "_table", None)) is not None
    ):
        table = bqclient.get_table(table_ref)
        if (
            isinstance((num_bytes := table.num_bytes), int)
            and num_bytes > pandas_gbq.constants.BYTES_TO_RECOMMEND_BIGFRAMES
        ):
            num_gib = num_bytes / pandas_gbq.constants.BYTES_IN_GIB
            warnings.warn(
                f"Recommendation: Your results are {num_gib:.1f} GiB. "
                # BUGFIX: a space was missing after "...bigframes-intro)",
                # rendering the warning as "intro)to process".
                "Consider using BigQuery DataFrames (https://bit.ly/bigframes-intro) "
                "to process large results with pandas compatible APIs with transparent SQL "
                "pushdown to BigQuery engine. This provides an opportunity to save on costs "
                "and improve performance. "
                "Please reach out to bigframes-feedback@google.com with any "
                "questions or concerns. To disable this message, run "
                "warnings.simplefilter('ignore', category=pandas_gbq.exceptions.LargeResultsWarning)",
                category=pandas_gbq.exceptions.LargeResultsWarning,
                # user's code
                # -> read_gbq
                # -> run_query
                # -> download_results
                stacklevel=4,
            )

    try:
        schema_fields = [field.to_api_repr() for field in results.schema]
        conversion_dtypes = _bqschema_to_nullsafe_dtypes(schema_fields)
        # User-requested dtypes take precedence over schema-derived ones.
        conversion_dtypes.update(user_dtypes)
        df = results.to_dataframe(
            dtypes=conversion_dtypes,
            progress_bar_type=progress_bar_type,
            create_bqstorage_client=create_bqstorage_client,
        )
    except pandas_gbq.constants.HTTP_ERRORS as ex:
        # Translate google-api-core errors into pandas-gbq exceptions.
        raise pandas_gbq.exceptions.translate_exception(ex) from ex

    # DATE/DATETIME/TIMESTAMP columns need post-download dtype fixups.
    df = _finalize_dtypes(df, schema_fields)

    pandas_gbq.logger.debug("Got {} rows.\n".format(results.total_rows))
    return df

0 commit comments

Comments
 (0)