Skip to content
Permalink
Browse files
perf: remove redundant array deepcopy (#26)
* perf(bigquery): remove redundant array deepcopy

Deepcopy can be a very costly operation for large arrays containing complex nested objects.
Refactor the helpers to allow recursive conversion without copying arrays.

* add check to ignore REPEATED mode

* Update google/cloud/bigquery/_helpers.py

Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>

Co-authored-by: Tres Seaver <tseaver@palladion.com>
Co-authored-by: Tim Swast <swast@google.com>
Co-authored-by: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com>
  • Loading branch information
4 people committed Oct 7, 2020
1 parent d1eb8b3 commit b54f86769c982ce5c8fcbf3889f82450428bb40c
Showing with 64 additions and 10 deletions.
  1. +29 −10 google/cloud/bigquery/_helpers.py
  2. +35 −0 tests/unit/test__helpers.py
@@ -15,7 +15,6 @@
"""Shared helper functions for BigQuery API classes."""

import base64
import copy
import datetime
import decimal
import re
@@ -397,13 +396,9 @@ def _repeated_field_to_json(field, row_value):
Returns:
List[Any]: A list of JSON-serializable objects.
"""
# Remove the REPEATED, but keep the other fields. This allows us to process
# each item as if it were a top-level field.
item_field = copy.deepcopy(field)
item_field._mode = "NULLABLE"
values = []
for item in row_value:
values.append(_field_to_json(item_field, item))
values.append(_single_field_to_json(field, item))
return values


@@ -462,6 +457,33 @@ def _record_field_to_json(fields, row_value):
return record


def _single_field_to_json(field, row_value):
"""Convert a single field into JSON-serializable values.
Ignores mode so that this can function for ARRAY / REPEATING fields
without requiring a deepcopy of the field. See:
https://github.com/googleapis/python-bigquery/issues/6
Args:
field (google.cloud.bigquery.schema.SchemaField):
The SchemaField to use for type conversion and field name.
row_value (Any):
Scalar or Struct to be inserted. The type
is inferred from the SchemaField's field_type.
Returns:
Any: A JSON-serializable object.
"""
if row_value is None:
return None

if field.field_type == "RECORD":
return _record_field_to_json(field.fields, row_value)

return _scalar_field_to_json(field, row_value)


def _field_to_json(field, row_value):
"""Convert a field into JSON-serializable values.
@@ -483,10 +505,7 @@ def _field_to_json(field, row_value):
if field.mode == "REPEATED":
return _repeated_field_to_json(field, row_value)

if field.field_type == "RECORD":
return _record_field_to_json(field.fields, row_value)

return _scalar_field_to_json(field, row_value)
return _single_field_to_json(field, row_value)


def _snake_to_camel_case(value):
@@ -806,6 +806,41 @@ def test_w_known_field_type(self):
self.assertEqual(converted, str(original))


class Test_single_field_to_json(unittest.TestCase):
    """Unit tests for ``_helpers._single_field_to_json``."""

    def _call_fut(self, field, value):
        from google.cloud.bigquery._helpers import _single_field_to_json

        return _single_field_to_json(field, value)

    def test_w_none(self):
        # A None value short-circuits to None regardless of the field type.
        self.assertIsNone(self._call_fut(_make_field("INT64"), None))

    def test_w_record(self):
        # RECORD values recurse through the record serializer.
        field = _make_field(
            "RECORD",
            fields=[
                _make_field("INT64", name="one"),
                _make_field("STRING", name="two"),
            ],
        )
        converted = self._call_fut(field, {"one": 42, "two": "two"})
        self.assertEqual(converted, {"one": "42", "two": "two"})

    def test_w_scalar(self):
        # Scalars go through the scalar serializer (INT64 -> decimal string).
        converted = self._call_fut(_make_field("INT64"), 42)
        self.assertEqual(converted, "42")

    def test_w_scalar_ignores_mode(self):
        # Mode is ignored: a REPEATED field still converts a bare scalar.
        converted = self._call_fut(_make_field("STRING", mode="REPEATED"), "hello world")
        self.assertEqual(converted, "hello world")


class Test_repeated_field_to_json(unittest.TestCase):
def _call_fut(self, field, value):
from google.cloud.bigquery._helpers import _repeated_field_to_json

0 comments on commit b54f867

Please sign in to comment.