Skip to content

Commit

Permalink
fix: change load_table_from_json autodetect logic (#1804)
Browse files Browse the repository at this point in the history
  • Loading branch information
Linchin committed Jan 30, 2024
1 parent 1298594 commit 6249032
Show file tree
Hide file tree
Showing 3 changed files with 255 additions and 5 deletions.
18 changes: 16 additions & 2 deletions google/cloud/bigquery/client.py
Expand Up @@ -2833,8 +2833,22 @@ def load_table_from_json(

new_job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON

if new_job_config.schema is None:
new_job_config.autodetect = True
# In specific conditions, we check if the table alread exists, and/or
# set the autodetect value for the user. For exact conditions, see table
# https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
if new_job_config.schema is None and new_job_config.autodetect is None:
if new_job_config.write_disposition in (
job.WriteDisposition.WRITE_TRUNCATE,
job.WriteDisposition.WRITE_EMPTY,
):
new_job_config.autodetect = True
else:
try:
self.get_table(destination)
except core_exceptions.NotFound:
new_job_config.autodetect = True
else:
new_job_config.autodetect = False

if project is None:
project = self.project
Expand Down
39 changes: 39 additions & 0 deletions tests/system/test_client.py
Expand Up @@ -994,6 +994,45 @@ def test_load_table_from_json_schema_autodetect(self):
self.assertEqual(tuple(table.schema), table_schema)
self.assertEqual(table.num_rows, 2)

# Autodetect makes best effort to infer the schema, but situations exist
# when the detected schema is wrong, and does not match existing schema.
# Thus the client sets autodetect = False when table exists and just uses
# the existing schema. This test case uses a special case where backend has
# no way to distinguish int from string.
def test_load_table_from_json_schema_autodetect_table_exists(self):
json_rows = [
{"name": "123", "age": 18, "birthday": "2001-10-15", "is_awesome": False},
{"name": "456", "age": 79, "birthday": "1940-03-10", "is_awesome": True},
]

dataset_id = _make_dataset_id("bq_system_test")
self.temp_dataset(dataset_id)
table_id = "{}.{}.load_table_from_json_basic_use".format(
Config.CLIENT.project, dataset_id
)

# Use schema with NULLABLE fields, because schema autodetection
# defaults to field mode NULLABLE.
table_schema = (
bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
bigquery.SchemaField("birthday", "DATE", mode="NULLABLE"),
bigquery.SchemaField("is_awesome", "BOOLEAN", mode="NULLABLE"),
)
# create the table before loading so that the column order is predictable
table = helpers.retry_403(Config.CLIENT.create_table)(
Table(table_id, schema=table_schema)
)
self.to_delete.insert(0, table)

# do not pass an explicit job config to trigger automatic schema detection
load_job = Config.CLIENT.load_table_from_json(json_rows, table_id)
load_job.result()

table = Config.CLIENT.get_table(table)
self.assertEqual(tuple(table.schema), table_schema)
self.assertEqual(table.num_rows, 2)

def test_load_avro_from_uri_then_dump_table(self):
from google.cloud.bigquery.job import CreateDisposition
from google.cloud.bigquery.job import SourceFormat
Expand Down
203 changes: 200 additions & 3 deletions tests/unit/test_client.py
Expand Up @@ -8951,6 +8951,8 @@ def test_load_table_from_dataframe_w_higher_scale_decimal128_datatype(self):
SchemaField("x", "BIGNUMERIC", "NULLABLE", None),
)

# With autodetect specified, we pass the value as is. For more info, see
# https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
def test_load_table_from_json_basic_use(self):
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
from google.cloud.bigquery import job
Expand All @@ -8962,12 +8964,28 @@ def test_load_table_from_json_basic_use(self):
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
]

job_config = job.LoadJobConfig(autodetect=True)

load_patch = mock.patch(
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
)

with load_patch as load_table_from_file:
client.load_table_from_json(json_rows, self.TABLE_REF)
# mock: remote table already exists
get_table_reference = {
"projectId": "project_id",
"datasetId": "test_dataset",
"tableId": "test_table",
}
get_table_patch = mock.patch(
"google.cloud.bigquery.client.Client.get_table",
autospec=True,
return_value=mock.Mock(table_reference=get_table_reference),
)

with load_patch as load_table_from_file, get_table_patch:
client.load_table_from_json(
json_rows, self.TABLE_REF, job_config=job_config
)

load_table_from_file.assert_called_once_with(
client,
Expand Down Expand Up @@ -9066,6 +9084,174 @@ def test_load_table_from_json_w_invalid_job_config(self):
err_msg = str(exc.value)
assert "Expected an instance of LoadJobConfig" in err_msg

# When all following are true:
# (1) no schema provided;
# (2) no autodetect value provided;
# (3) writeDisposition == WRITE_APPEND or None;
# (4) table already exists,
# client sets autodetect == False
# For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
def test_load_table_from_json_wo_schema_wo_autodetect_write_append_w_table(self):
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
from google.cloud.bigquery import job
from google.cloud.bigquery.job import WriteDisposition

client = self._make_client()

json_rows = [
{"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
]

job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND)

load_patch = mock.patch(
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
)

# mock: remote table already exists
get_table_reference = {
"projectId": "project_id",
"datasetId": "test_dataset",
"tableId": "test_table",
}
get_table_patch = mock.patch(
"google.cloud.bigquery.client.Client.get_table",
autospec=True,
return_value=mock.Mock(table_reference=get_table_reference),
)

with load_patch as load_table_from_file, get_table_patch:
client.load_table_from_json(
json_rows, self.TABLE_REF, job_config=job_config
)

load_table_from_file.assert_called_once_with(
client,
mock.ANY,
self.TABLE_REF,
size=mock.ANY,
num_retries=_DEFAULT_NUM_RETRIES,
job_id=mock.ANY,
job_id_prefix=None,
location=client.location,
project=client.project,
job_config=mock.ANY,
timeout=DEFAULT_TIMEOUT,
)

sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
assert sent_config.schema is None
assert not sent_config.autodetect

# When all following are true:
# (1) no schema provided;
# (2) no autodetect value provided;
# (3) writeDisposition == WRITE_APPEND or None;
# (4) table does NOT exist,
# client sets autodetect == True
# For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
def test_load_table_from_json_wo_schema_wo_autodetect_write_append_wo_table(self):
import google.api_core.exceptions as core_exceptions
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
from google.cloud.bigquery import job
from google.cloud.bigquery.job import WriteDisposition

client = self._make_client()

json_rows = [
{"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
]

job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND)

load_patch = mock.patch(
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
)

# mock: remote table doesn't exist
get_table_patch = mock.patch(
"google.cloud.bigquery.client.Client.get_table",
autospec=True,
side_effect=core_exceptions.NotFound(""),
)

with load_patch as load_table_from_file, get_table_patch:
client.load_table_from_json(
json_rows, self.TABLE_REF, job_config=job_config
)

load_table_from_file.assert_called_once_with(
client,
mock.ANY,
self.TABLE_REF,
size=mock.ANY,
num_retries=_DEFAULT_NUM_RETRIES,
job_id=mock.ANY,
job_id_prefix=None,
location=client.location,
project=client.project,
job_config=mock.ANY,
timeout=DEFAULT_TIMEOUT,
)

sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
assert sent_config.schema is None
assert sent_config.autodetect

# When all following are true:
# (1) no schema provided;
# (2) no autodetect value provided;
# (3) writeDisposition == WRITE_TRUNCATE or WRITE_EMPTY;
# client sets autodetect == True
# For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
def test_load_table_from_json_wo_schema_wo_autodetect_others(self):
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
from google.cloud.bigquery import job
from google.cloud.bigquery.job import WriteDisposition

client = self._make_client()

json_rows = [
{"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
]

job_config = job.LoadJobConfig(
write_disposition=WriteDisposition.WRITE_TRUNCATE
)

load_patch = mock.patch(
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
)

with load_patch as load_table_from_file:
client.load_table_from_json(
json_rows, self.TABLE_REF, job_config=job_config
)

load_table_from_file.assert_called_once_with(
client,
mock.ANY,
self.TABLE_REF,
size=mock.ANY,
num_retries=_DEFAULT_NUM_RETRIES,
job_id=mock.ANY,
job_id_prefix=None,
location=client.location,
project=client.project,
job_config=mock.ANY,
timeout=DEFAULT_TIMEOUT,
)

sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
assert sent_config.schema is None
assert sent_config.autodetect

def test_load_table_from_json_w_explicit_job_config_override(self):
from google.cloud.bigquery import job
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
Expand Down Expand Up @@ -9190,8 +9376,19 @@ def test_load_table_from_json_unicode_emoji_data_case(self):
load_patch = mock.patch(
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
)
# mock: remote table already exists
get_table_reference = {
"projectId": "project_id",
"datasetId": "test_dataset",
"tableId": "test_table",
}
get_table_patch = mock.patch(
"google.cloud.bigquery.client.Client.get_table",
autospec=True,
return_value=mock.Mock(table_reference=get_table_reference),
)

with load_patch as load_table_from_file:
with load_patch as load_table_from_file, get_table_patch:
client.load_table_from_json(json_rows, self.TABLE_REF)

load_table_from_file.assert_called_once_with(
Expand Down

0 comments on commit 6249032

Please sign in to comment.