docs(bigquery): document how to achieve higher write limit and add tests #9574

Merged · 4 commits · Nov 1, 2019
Changes from 3 commits
14 changes: 14 additions & 0 deletions bigquery/docs/usage/tables.rst
@@ -122,6 +122,20 @@ Insert rows into a table's data with the
   :start-after: [START bigquery_table_insert_rows]
   :end-before: [END bigquery_table_insert_rows]

Insert rows into a table's data with the
:func:`~google.cloud.bigquery.client.Client.insert_rows` method, achieving a
higher write limit:

.. literalinclude:: ../samples/table_insert_rows_no_explicit_row_ids.py
   :language: python
   :dedent: 4
   :start-after: [START bigquery_table_insert_rows_no_explicit_row_ids]
   :end-before: [END bigquery_table_insert_rows_no_explicit_row_ids]

Note that inserting data without row insert IDs disables best-effort
deduplication, so duplicate rows may be inserted. See also:
`Streaming inserts <https://cloud.google.com/bigquery/quotas#streaming_inserts>`_.

Add an empty column to the existing table with the
:func:`~google.cloud.bigquery.update_table` method:

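The pattern the new docs section describes boils down to passing ``row_ids=[None] * len(rows)`` so that no per-row insert IDs are sent. A minimal sketch of that usage, based on the sample added in this PR (the table ID is a placeholder, and the table is assumed to already exist with a matching schema):

```python
from google.cloud import bigquery

client = bigquery.Client()

# Placeholder table ID; assumed to reference an existing table.
table = client.get_table("your-project.your_dataset.your_table")

rows_to_insert = [(u"Phred Phlyntstone", 32), (u"Wylma Phlyntstone", 29)]

# Passing None for every row ID disables best-effort deduplication,
# which is what allows the higher streaming-insert write limit.
errors = client.insert_rows(
    table, rows_to_insert, row_ids=[None] * len(rows_to_insert)
)
if errors == []:
    print("New rows have been added.")
```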
35 changes: 19 additions & 16 deletions bigquery/google/cloud/bigquery/client.py
@@ -2264,29 +2264,32 @@ def insert_rows_json(
             table (Union[ \
                 google.cloud.bigquery.table.Table \
                 google.cloud.bigquery.table.TableReference, \
-                str, \
+                str \
             ]):
                 The destination table for the row data, or a reference to it.
             json_rows (Sequence[Dict]):
                 Row data to be inserted. Keys must match the table schema fields
                 and values must be JSON-compatible representations.
-            row_ids (Sequence[str]):
-                (Optional) Unique ids, one per row being inserted. If omitted,
-                unique IDs are created.
-            skip_invalid_rows (bool):
-                (Optional) Insert all valid rows of a request, even if invalid
-                rows exist. The default value is False, which causes the entire
-                request to fail if any invalid rows exist.
-            ignore_unknown_values (bool):
-                (Optional) Accept rows that contain values that do not match the
-                schema. The unknown values are ignored. Default is False, which
+            row_ids (Optional[Sequence[Optional[str]]]):
+                Unique IDs, one per row being inserted. An ID can also be
+                ``None``, indicating that an explicit insert ID should **not**
+                be used for that row. If the argument is omitted altogether,
+                unique IDs are created automatically.
+            skip_invalid_rows (Optional[bool]):
+                Insert all valid rows of a request, even if invalid rows exist.
+                The default value is ``False``, which causes the entire request
+                to fail if any invalid rows exist.
+            ignore_unknown_values (Optional[bool]):
+                Accept rows that contain values that do not match the schema.
+                The unknown values are ignored. Default is ``False``, which
                 treats unknown values as errors.
-            template_suffix (str):
-                (Optional) treat ``name`` as a template table and provide a suffix.
-                BigQuery will create the table ``<name> + <template_suffix>`` based
-                on the schema of the template table. See
+            template_suffix (Optional[str]):
+                Treat ``name`` as a template table and provide a suffix.
+                BigQuery will create the table ``<name> + <template_suffix>``
+                based on the schema of the template table. See
                 https://cloud.google.com/bigquery/streaming-data-into-bigquery#template-tables
-            retry (google.api_core.retry.Retry): (Optional) How to retry the RPC.
+            retry (Optional[google.api_core.retry.Retry]):
+                How to retry the RPC.

        Returns:
            Sequence[Mappings]:
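The updated ``row_ids`` docstring also allows per-row ``None`` entries, so explicit insert IDs and no-ID rows can be mixed in a single request. A small sketch of that case (the table ID and the explicit insert ID are made-up values):

```python
from google.cloud import bigquery

client = bigquery.Client()

rows = [
    {"full_name": "Phred Phlyntstone", "age": 32},
    {"full_name": "Wylma Phlyntstone", "age": 29},
]

# The first row is deduplicated against an explicit insert ID; the second
# row is sent with insertId set to None, i.e. without deduplication.
errors = client.insert_rows_json(
    "your-project.your_dataset.your_table",
    rows,
    row_ids=["made-up-id-1", None],
)
if not errors:
    print("Rows inserted.")
```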
36 changes: 36 additions & 0 deletions bigquery/samples/table_insert_rows_no_explicit_row_ids.py
@@ -0,0 +1,36 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


def table_insert_rows_no_explicit_row_ids(client, table_id):

    # [START bigquery_table_insert_rows_no_explicit_row_ids]
    # TODO(developer): Import the client library.
    # from google.cloud import bigquery

    # TODO(developer): Construct a BigQuery client object.
    # client = bigquery.Client()

    # TODO(developer): Set table_id to the ID of the table to append rows to.
    # table_id = "your-project.your_dataset.your_table"

    table = client.get_table(table_id)  # Make an API request.
    rows_to_insert = [(u"Phred Phlyntstone", 32), (u"Wylma Phlyntstone", 29)]

    errors = client.insert_rows(
        table, rows_to_insert, row_ids=[None] * len(rows_to_insert)
    )  # Make an API request.
    if errors == []:
        print("New rows have been added.")
    # [END bigquery_table_insert_rows_no_explicit_row_ids]
33 changes: 33 additions & 0 deletions bigquery/samples/tests/test_table_insert_rows_no_explicit_row_ids.py
@@ -0,0 +1,33 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from google.cloud import bigquery

from .. import table_insert_rows_no_explicit_row_ids as mut


def test_table_insert_rows_no_explicit_row_ids(capsys, client, random_table_id):

    schema = [
        bigquery.SchemaField("full_name", "STRING", mode="REQUIRED"),
        bigquery.SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]

    table = bigquery.Table(random_table_id, schema=schema)
    table = client.create_table(table)

    mut.table_insert_rows_no_explicit_row_ids(client, random_table_id)
    out, err = capsys.readouterr()
    assert "New rows have been added." in out
104 changes: 104 additions & 0 deletions bigquery/tests/unit/test_client.py
@@ -4572,6 +4572,40 @@ def test_insert_rows_w_record_schema(self):
method="POST", path="/%s" % PATH, data=SENT
)

def test_insert_rows_wo_explicit_insert_ids(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    PATH = "projects/{}/datasets/{}/tables/{}/insertAll".format(
        self.PROJECT, self.DS_ID, self.TABLE_ID,
    )
    creds = _make_credentials()
    http = object()
    client = self._make_one(project=self.PROJECT, credentials=creds, _http=http)
    conn = client._connection = make_connection({})
    schema = [
        SchemaField("full_name", "STRING", mode="REQUIRED"),
        SchemaField("age", "INTEGER", mode="REQUIRED"),
    ]
    table = Table(self.TABLE_REF, schema=schema)
    ROWS = [
        {"full_name": "Phred Phlyntstone", "age": 32},
        {"full_name": "Bharney Rhubble", "age": 33},
    ]

    def _row_data(row):
        row["age"] = str(row["age"])
        return row

    SENT = {"rows": [{"json": _row_data(row), "insertId": None} for row in ROWS]}

    errors = client.insert_rows(table, ROWS, row_ids=[None] * len(ROWS))

    self.assertEqual(len(errors), 0)
    conn.api_request.assert_called_once_with(
        method="POST", path="/{}".format(PATH), data=SENT
    )

def test_insert_rows_errors(self):
from google.cloud.bigquery.table import Table

@@ -4765,6 +4799,55 @@ def test_insert_rows_from_dataframe_many_columns(self):
assert len(actual_calls) == 1
assert actual_calls[0] == expected_call

@unittest.skipIf(pandas is None, "Requires `pandas`")
def test_insert_rows_from_dataframe_wo_explicit_insert_ids(self):
    from google.cloud.bigquery.schema import SchemaField
    from google.cloud.bigquery.table import Table

    API_PATH = "/projects/{}/datasets/{}/tables/{}/insertAll".format(
        self.PROJECT, self.DS_ID, self.TABLE_REF.table_id
    )

    dataframe = pandas.DataFrame(
        [
            {"name": u"Little One", "adult": False},
            {"name": u"Young Gun", "adult": True},
        ]
    )

    # create client
    creds = _make_credentials()
    http = object()
    client = self._make_one(project=self.PROJECT, credentials=creds, _http=http)
    conn = client._connection = make_connection({}, {})

    # create table
    schema = [
        SchemaField("name", "STRING", mode="REQUIRED"),
        SchemaField("adult", "BOOLEAN", mode="REQUIRED"),
    ]
    table = Table(self.TABLE_REF, schema=schema)

    error_info = client.insert_rows_from_dataframe(
        table, dataframe, row_ids=[None] * len(dataframe)
    )

    self.assertEqual(len(error_info), 1)
    assert error_info[0] == []  # no chunk errors

    EXPECTED_SENT_DATA = {
        "rows": [
            {"insertId": None, "json": {"name": "Little One", "adult": "false"}},
            {"insertId": None, "json": {"name": "Young Gun", "adult": "true"}},
        ]
    }

    actual_calls = conn.api_request.call_args_list
    assert len(actual_calls) == 1
    assert actual_calls[0] == mock.call(
        method="POST", path=API_PATH, data=EXPECTED_SENT_DATA
    )

def test_insert_rows_json(self):
from google.cloud.bigquery.table import Table, SchemaField
from google.cloud.bigquery.dataset import DatasetReference
@@ -4833,6 +4916,27 @@ def test_insert_rows_json_with_string_id(self):
data=expected,
)

def test_insert_rows_json_wo_explicit_insert_ids(self):
rows = [{"col1": "val1"}, {"col2": "val2"}]
creds = _make_credentials()
http = object()
client = self._make_one(
project="default-project", credentials=creds, _http=http
)
conn = client._connection = make_connection({})

errors = client.insert_rows_json(
"proj.dset.tbl", rows, row_ids=[None] * len(rows),
)

self.assertEqual(len(errors), 0)
expected = {"rows": [{"json": row, "insertId": None} for row in rows]}
conn.api_request.assert_called_once_with(
method="POST",
path="/projects/proj/datasets/dset/tables/tbl/insertAll",
data=expected,
)

def test_list_partitions(self):
from google.cloud.bigquery.table import Table
