fix: change load_table_from_json autodetect logic #1804

Merged
merged 2 commits into from Jan 30, 2024
18 changes: 16 additions & 2 deletions google/cloud/bigquery/client.py
@@ -2833,8 +2833,22 @@ def load_table_from_json(

        new_job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON

-        if new_job_config.schema is None:
-            new_job_config.autodetect = True
+        # In specific conditions, we check whether the table already exists, and/or
+        # set the autodetect value for the user. For the exact conditions, see the table in
+        # https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
+        if new_job_config.schema is None and new_job_config.autodetect is None:
+            if new_job_config.write_disposition in (
+                job.WriteDisposition.WRITE_TRUNCATE,
+                job.WriteDisposition.WRITE_EMPTY,
+            ):
+                new_job_config.autodetect = True
+            else:
+                try:
+                    self.get_table(destination)
+                except core_exceptions.NotFound:
+                    new_job_config.autodetect = True
+                else:
+                    new_job_config.autodetect = False

        if project is None:
            project = self.project
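
For callers, the net effect is that loading JSON into an existing table without an explicit schema or autodetect value now reuses the table's existing schema instead of autodetecting one. A minimal usage sketch of that behavior (not part of this PR; the project, dataset, and table IDs below are hypothetical, and default credentials are assumed):

from google.cloud import bigquery

client = bigquery.Client()  # assumes ambient default credentials and project

rows = [
    {"name": "One", "age": 11},
    {"name": "Two", "age": 22},
]

# With no schema and no autodetect in the job config, the client now decides:
# - WRITE_TRUNCATE / WRITE_EMPTY: autodetect is set to True.
# - WRITE_APPEND (or unset) and the table does not exist: autodetect is set to True.
# - WRITE_APPEND (or unset) and the table exists: autodetect is set to False,
#   so the existing table schema is used.
load_job = client.load_table_from_json(rows, "my-project.my_dataset.my_table")
load_job.result()
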
39 changes: 39 additions & 0 deletions tests/system/test_client.py
@@ -994,6 +994,45 @@ def test_load_table_from_json_schema_autodetect(self):
        self.assertEqual(tuple(table.schema), table_schema)
        self.assertEqual(table.num_rows, 2)

    # Autodetect makes a best effort to infer the schema, but there are cases
    # where the detected schema does not match the existing table's schema.
    # Thus the client sets autodetect = False when the table exists and simply
    # uses the existing schema. This test exercises a special case in which the
    # backend has no way to distinguish an int from a string.
    def test_load_table_from_json_schema_autodetect_table_exists(self):
        json_rows = [
            {"name": "123", "age": 18, "birthday": "2001-10-15", "is_awesome": False},
            {"name": "456", "age": 79, "birthday": "1940-03-10", "is_awesome": True},
        ]

        dataset_id = _make_dataset_id("bq_system_test")
        self.temp_dataset(dataset_id)
        table_id = "{}.{}.load_table_from_json_basic_use".format(
            Config.CLIENT.project, dataset_id
        )

        # Use a schema with NULLABLE fields, because schema autodetection
        # defaults to field mode NULLABLE.
        table_schema = (
            bigquery.SchemaField("name", "STRING", mode="NULLABLE"),
            bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"),
            bigquery.SchemaField("birthday", "DATE", mode="NULLABLE"),
            bigquery.SchemaField("is_awesome", "BOOLEAN", mode="NULLABLE"),
        )
        # Create the table before loading so that the column order is predictable.
        table = helpers.retry_403(Config.CLIENT.create_table)(
            Table(table_id, schema=table_schema)
        )
        self.to_delete.insert(0, table)

        # Do not pass an explicit job config, to trigger the automatic decision.
        load_job = Config.CLIENT.load_table_from_json(json_rows, table_id)
        load_job.result()

        table = Config.CLIENT.get_table(table)
        self.assertEqual(tuple(table.schema), table_schema)
        self.assertEqual(table.num_rows, 2)

    def test_load_avro_from_uri_then_dump_table(self):
        from google.cloud.bigquery.job import CreateDisposition
        from google.cloud.bigquery.job import SourceFormat
203 changes: 200 additions & 3 deletions tests/unit/test_client.py
@@ -8951,6 +8951,8 @@ def test_load_table_from_dataframe_w_higher_scale_decimal128_datatype(self):
SchemaField("x", "BIGNUMERIC", "NULLABLE", None),
)

# With autodetect specified, we pass the value as is. For more info, see
# https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
def test_load_table_from_json_basic_use(self):
from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
from google.cloud.bigquery import job
@@ -8962,12 +8964,28 @@ def test_load_table_from_json_basic_use(self):
{"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
]

job_config = job.LoadJobConfig(autodetect=True)

load_patch = mock.patch(
"google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
)

with load_patch as load_table_from_file:
client.load_table_from_json(json_rows, self.TABLE_REF)
# mock: remote table already exists
get_table_reference = {
"projectId": "project_id",
"datasetId": "test_dataset",
"tableId": "test_table",
}
get_table_patch = mock.patch(
"google.cloud.bigquery.client.Client.get_table",
autospec=True,
return_value=mock.Mock(table_reference=get_table_reference),
)

with load_patch as load_table_from_file, get_table_patch:
client.load_table_from_json(
json_rows, self.TABLE_REF, job_config=job_config
)

load_table_from_file.assert_called_once_with(
client,
@@ -9066,6 +9084,174 @@ def test_load_table_from_json_w_invalid_job_config(self):
        err_msg = str(exc.value)
        assert "Expected an instance of LoadJobConfig" in err_msg

    # When all of the following are true:
    # (1) no schema is provided;
    # (2) no autodetect value is provided;
    # (3) writeDisposition == WRITE_APPEND or None;
    # (4) the table already exists,
    # the client sets autodetect == False.
    # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
    def test_load_table_from_json_wo_schema_wo_autodetect_write_append_w_table(self):
        from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
        from google.cloud.bigquery import job
        from google.cloud.bigquery.job import WriteDisposition

        client = self._make_client()

        json_rows = [
            {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
            {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
        ]

        job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND)

        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
        )

        # mock: remote table already exists
        get_table_reference = {
            "projectId": "project_id",
            "datasetId": "test_dataset",
            "tableId": "test_table",
        }
        get_table_patch = mock.patch(
            "google.cloud.bigquery.client.Client.get_table",
            autospec=True,
            return_value=mock.Mock(table_reference=get_table_reference),
        )

        with load_patch as load_table_from_file, get_table_patch:
            client.load_table_from_json(
                json_rows, self.TABLE_REF, job_config=job_config
            )

        load_table_from_file.assert_called_once_with(
            client,
            mock.ANY,
            self.TABLE_REF,
            size=mock.ANY,
            num_retries=_DEFAULT_NUM_RETRIES,
            job_id=mock.ANY,
            job_id_prefix=None,
            location=client.location,
            project=client.project,
            job_config=mock.ANY,
            timeout=DEFAULT_TIMEOUT,
        )

        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
        assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
        assert sent_config.schema is None
        assert not sent_config.autodetect

    # When all of the following are true:
    # (1) no schema is provided;
    # (2) no autodetect value is provided;
    # (3) writeDisposition == WRITE_APPEND or None;
    # (4) the table does NOT exist,
    # the client sets autodetect == True.
    # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
    def test_load_table_from_json_wo_schema_wo_autodetect_write_append_wo_table(self):
        import google.api_core.exceptions as core_exceptions
        from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
        from google.cloud.bigquery import job
        from google.cloud.bigquery.job import WriteDisposition

        client = self._make_client()

        json_rows = [
            {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
            {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
        ]

        job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND)

        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
        )

        # mock: remote table doesn't exist
        get_table_patch = mock.patch(
            "google.cloud.bigquery.client.Client.get_table",
            autospec=True,
            side_effect=core_exceptions.NotFound(""),
        )

        with load_patch as load_table_from_file, get_table_patch:
            client.load_table_from_json(
                json_rows, self.TABLE_REF, job_config=job_config
            )

        load_table_from_file.assert_called_once_with(
            client,
            mock.ANY,
            self.TABLE_REF,
            size=mock.ANY,
            num_retries=_DEFAULT_NUM_RETRIES,
            job_id=mock.ANY,
            job_id_prefix=None,
            location=client.location,
            project=client.project,
            job_config=mock.ANY,
            timeout=DEFAULT_TIMEOUT,
        )

        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
        assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
        assert sent_config.schema is None
        assert sent_config.autodetect

    # When all of the following are true:
    # (1) no schema is provided;
    # (2) no autodetect value is provided;
    # (3) writeDisposition == WRITE_TRUNCATE or WRITE_EMPTY,
    # the client sets autodetect == True.
    # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297
    def test_load_table_from_json_wo_schema_wo_autodetect_others(self):
        from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
        from google.cloud.bigquery import job
        from google.cloud.bigquery.job import WriteDisposition

        client = self._make_client()

        json_rows = [
            {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False},
            {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True},
        ]

        job_config = job.LoadJobConfig(
            write_disposition=WriteDisposition.WRITE_TRUNCATE
        )

        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
        )

        with load_patch as load_table_from_file:
            client.load_table_from_json(
                json_rows, self.TABLE_REF, job_config=job_config
            )

        load_table_from_file.assert_called_once_with(
            client,
            mock.ANY,
            self.TABLE_REF,
            size=mock.ANY,
            num_retries=_DEFAULT_NUM_RETRIES,
            job_id=mock.ANY,
            job_id_prefix=None,
            location=client.location,
            project=client.project,
            job_config=mock.ANY,
            timeout=DEFAULT_TIMEOUT,
        )

        sent_config = load_table_from_file.mock_calls[0][2]["job_config"]
        assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON
        assert sent_config.schema is None
        assert sent_config.autodetect
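
    # For reference, a restatement of the decision table the three tests above pin
    # down (from https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297);
    # this summary is an editorial aid, not code from this PR:
    #
    #   schema     autodetect   write_disposition             table exists   resulting autodetect
    #   ---------  -----------  ----------------------------  -------------  --------------------
    #   provided   any          any                           any            passed through as-is
    #   None       provided     any                           any            passed through as-is
    #   None       None         WRITE_TRUNCATE / WRITE_EMPTY  (not checked)  True
    #   None       None         WRITE_APPEND or None          no             True
    #   None       None         WRITE_APPEND or None          yes            False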

    def test_load_table_from_json_w_explicit_job_config_override(self):
        from google.cloud.bigquery import job
        from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES
@@ -9190,8 +9376,19 @@ def test_load_table_from_json_unicode_emoji_data_case(self):
        load_patch = mock.patch(
            "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True
        )
+        # mock: remote table already exists
+        get_table_reference = {
+            "projectId": "project_id",
+            "datasetId": "test_dataset",
+            "tableId": "test_table",
+        }
+        get_table_patch = mock.patch(
+            "google.cloud.bigquery.client.Client.get_table",
+            autospec=True,
+            return_value=mock.Mock(table_reference=get_table_reference),
+        )

-        with load_patch as load_table_from_file:
+        with load_patch as load_table_from_file, get_table_patch:
            client.load_table_from_json(json_rows, self.TABLE_REF)

        load_table_from_file.assert_called_once_with(