From 62490325f64e5d66303d9218992e28ac5f21cb3f Mon Sep 17 00:00:00 2001 From: Lingqing Gan Date: Tue, 30 Jan 2024 10:02:42 -0800 Subject: [PATCH] fix: change load_table_from_json autodetect logic (#1804) --- google/cloud/bigquery/client.py | 18 ++- tests/system/test_client.py | 39 ++++++ tests/unit/test_client.py | 203 +++++++++++++++++++++++++++++++- 3 files changed, 255 insertions(+), 5 deletions(-) diff --git a/google/cloud/bigquery/client.py b/google/cloud/bigquery/client.py index b2ea130c4..4708e753b 100644 --- a/google/cloud/bigquery/client.py +++ b/google/cloud/bigquery/client.py @@ -2833,8 +2833,22 @@ def load_table_from_json( new_job_config.source_format = job.SourceFormat.NEWLINE_DELIMITED_JSON - if new_job_config.schema is None: - new_job_config.autodetect = True + # In specific conditions, we check if the table alread exists, and/or + # set the autodetect value for the user. For exact conditions, see table + # https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + if new_job_config.schema is None and new_job_config.autodetect is None: + if new_job_config.write_disposition in ( + job.WriteDisposition.WRITE_TRUNCATE, + job.WriteDisposition.WRITE_EMPTY, + ): + new_job_config.autodetect = True + else: + try: + self.get_table(destination) + except core_exceptions.NotFound: + new_job_config.autodetect = True + else: + new_job_config.autodetect = False if project is None: project = self.project diff --git a/tests/system/test_client.py b/tests/system/test_client.py index d7e56f7ff..74c152cf2 100644 --- a/tests/system/test_client.py +++ b/tests/system/test_client.py @@ -994,6 +994,45 @@ def test_load_table_from_json_schema_autodetect(self): self.assertEqual(tuple(table.schema), table_schema) self.assertEqual(table.num_rows, 2) + # Autodetect makes best effort to infer the schema, but situations exist + # when the detected schema is wrong, and does not match existing schema. + # Thus the client sets autodetect = False when table exists and just uses + # the existing schema. This test case uses a special case where backend has + # no way to distinguish int from string. + def test_load_table_from_json_schema_autodetect_table_exists(self): + json_rows = [ + {"name": "123", "age": 18, "birthday": "2001-10-15", "is_awesome": False}, + {"name": "456", "age": 79, "birthday": "1940-03-10", "is_awesome": True}, + ] + + dataset_id = _make_dataset_id("bq_system_test") + self.temp_dataset(dataset_id) + table_id = "{}.{}.load_table_from_json_basic_use".format( + Config.CLIENT.project, dataset_id + ) + + # Use schema with NULLABLE fields, because schema autodetection + # defaults to field mode NULLABLE. + table_schema = ( + bigquery.SchemaField("name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("age", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("birthday", "DATE", mode="NULLABLE"), + bigquery.SchemaField("is_awesome", "BOOLEAN", mode="NULLABLE"), + ) + # create the table before loading so that the column order is predictable + table = helpers.retry_403(Config.CLIENT.create_table)( + Table(table_id, schema=table_schema) + ) + self.to_delete.insert(0, table) + + # do not pass an explicit job config to trigger automatic schema detection + load_job = Config.CLIENT.load_table_from_json(json_rows, table_id) + load_job.result() + + table = Config.CLIENT.get_table(table) + self.assertEqual(tuple(table.schema), table_schema) + self.assertEqual(table.num_rows, 2) + def test_load_avro_from_uri_then_dump_table(self): from google.cloud.bigquery.job import CreateDisposition from google.cloud.bigquery.job import SourceFormat diff --git a/tests/unit/test_client.py b/tests/unit/test_client.py index 56bdbad5e..42581edc1 100644 --- a/tests/unit/test_client.py +++ b/tests/unit/test_client.py @@ -8951,6 +8951,8 @@ def test_load_table_from_dataframe_w_higher_scale_decimal128_datatype(self): SchemaField("x", "BIGNUMERIC", "NULLABLE", None), ) + # With autodetect specified, we pass the value as is. For more info, see + # https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 def test_load_table_from_json_basic_use(self): from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES from google.cloud.bigquery import job @@ -8962,12 +8964,28 @@ def test_load_table_from_json_basic_use(self): {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, ] + job_config = job.LoadJobConfig(autodetect=True) + load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) - with load_patch as load_table_from_file: - client.load_table_from_json(json_rows, self.TABLE_REF) + # mock: remote table already exists + get_table_reference = { + "projectId": "project_id", + "datasetId": "test_dataset", + "tableId": "test_table", + } + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock(table_reference=get_table_reference), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) load_table_from_file.assert_called_once_with( client, @@ -9066,6 +9084,174 @@ def test_load_table_from_json_w_invalid_job_config(self): err_msg = str(exc.value) assert "Expected an instance of LoadJobConfig" in err_msg + # When all following are true: + # (1) no schema provided; + # (2) no autodetect value provided; + # (3) writeDisposition == WRITE_APPEND or None; + # (4) table already exists, + # client sets autodetect == False + # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + def test_load_table_from_json_wo_schema_wo_autodetect_write_append_w_table(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.job import WriteDisposition + + client = self._make_client() + + json_rows = [ + {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False}, + {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, + ] + + job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + # mock: remote table already exists + get_table_reference = { + "projectId": "project_id", + "datasetId": "test_dataset", + "tableId": "test_table", + } + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock(table_reference=get_table_reference), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON + assert sent_config.schema is None + assert not sent_config.autodetect + + # When all following are true: + # (1) no schema provided; + # (2) no autodetect value provided; + # (3) writeDisposition == WRITE_APPEND or None; + # (4) table does NOT exist, + # client sets autodetect == True + # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + def test_load_table_from_json_wo_schema_wo_autodetect_write_append_wo_table(self): + import google.api_core.exceptions as core_exceptions + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.job import WriteDisposition + + client = self._make_client() + + json_rows = [ + {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False}, + {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, + ] + + job_config = job.LoadJobConfig(write_disposition=WriteDisposition.WRITE_APPEND) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + # mock: remote table doesn't exist + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + side_effect=core_exceptions.NotFound(""), + ) + + with load_patch as load_table_from_file, get_table_patch: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON + assert sent_config.schema is None + assert sent_config.autodetect + + # When all following are true: + # (1) no schema provided; + # (2) no autodetect value provided; + # (3) writeDisposition == WRITE_TRUNCATE or WRITE_EMPTY; + # client sets autodetect == True + # For more details, see https://github.com/googleapis/python-bigquery/issues/1228#issuecomment-1910946297 + def test_load_table_from_json_wo_schema_wo_autodetect_others(self): + from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES + from google.cloud.bigquery import job + from google.cloud.bigquery.job import WriteDisposition + + client = self._make_client() + + json_rows = [ + {"name": "One", "age": 11, "birthday": "2008-09-10", "adult": False}, + {"name": "Two", "age": 22, "birthday": "1997-08-09", "adult": True}, + ] + + job_config = job.LoadJobConfig( + write_disposition=WriteDisposition.WRITE_TRUNCATE + ) + + load_patch = mock.patch( + "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True + ) + + with load_patch as load_table_from_file: + client.load_table_from_json( + json_rows, self.TABLE_REF, job_config=job_config + ) + + load_table_from_file.assert_called_once_with( + client, + mock.ANY, + self.TABLE_REF, + size=mock.ANY, + num_retries=_DEFAULT_NUM_RETRIES, + job_id=mock.ANY, + job_id_prefix=None, + location=client.location, + project=client.project, + job_config=mock.ANY, + timeout=DEFAULT_TIMEOUT, + ) + + sent_config = load_table_from_file.mock_calls[0][2]["job_config"] + assert sent_config.source_format == job.SourceFormat.NEWLINE_DELIMITED_JSON + assert sent_config.schema is None + assert sent_config.autodetect + def test_load_table_from_json_w_explicit_job_config_override(self): from google.cloud.bigquery import job from google.cloud.bigquery.client import _DEFAULT_NUM_RETRIES @@ -9190,8 +9376,19 @@ def test_load_table_from_json_unicode_emoji_data_case(self): load_patch = mock.patch( "google.cloud.bigquery.client.Client.load_table_from_file", autospec=True ) + # mock: remote table already exists + get_table_reference = { + "projectId": "project_id", + "datasetId": "test_dataset", + "tableId": "test_table", + } + get_table_patch = mock.patch( + "google.cloud.bigquery.client.Client.get_table", + autospec=True, + return_value=mock.Mock(table_reference=get_table_reference), + ) - with load_patch as load_table_from_file: + with load_patch as load_table_from_file, get_table_patch: client.load_table_from_json(json_rows, self.TABLE_REF) load_table_from_file.assert_called_once_with(