diff --git a/CHANGELOG.md b/CHANGELOG.md index 785fbd3..ffe533e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## [0.6.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.5.0-alpha...v0.6.0-alpha) (2023-04-17) + + +### Features + +* Add blocks to PageWrapper ([#107](https://github.com/googleapis/python-documentai-toolbox/issues/107)) ([df7dfe7](https://github.com/googleapis/python-documentai-toolbox/commit/df7dfe7b79d39010d5addb3fa861a9c803caae45)) +* Added `form_fields_to_bigquery()` method ([#104](https://github.com/googleapis/python-documentai-toolbox/issues/104)) ([96abe22](https://github.com/googleapis/python-documentai-toolbox/commit/96abe220c9909bcc5642ea146c06fd082a2f8009)) + ## [0.5.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.4.1-alpha...v0.5.0-alpha) (2023-04-07) diff --git a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py index 054df39..280d770 100644 --- a/google/cloud/documentai_toolbox/utilities/gcs_utilities.py +++ b/google/cloud/documentai_toolbox/utilities/gcs_utilities.py @@ -26,33 +26,32 @@ from google.cloud.documentai_toolbox import constants -def _get_storage_client(module: str = None): - r"""Returns a Storage client with custom user agent header. +def _get_client_info(module: str = None) -> client_info.ClientInfo: + r"""Returns a custom user agent header. Returns: - storage.Client. + client_info.ClientInfo. """ + client_library_version = documentai_toolbox.__version__ if module: - user_agent = ( - f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}-{module}" - ) + client_library_version = f"{client_library_version}-{module}" - info = client_info.ClientInfo( - client_library_version=f"{documentai_toolbox.__version__}-{module}", - user_agent=user_agent, - ) - return storage.Client(client_info=info) + return client_info.ClientInfo( + client_library_version=client_library_version, + user_agent=f"{constants.USER_AGENT_PRODUCT}/{client_library_version}", + ) - user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}" - info = client_info.ClientInfo( - client_library_version=documentai_toolbox.__version__, - user_agent=user_agent, - ) +def _get_storage_client(module: str = None) -> storage.Client: + r"""Returns a Storage client with custom user agent header. - return storage.Client(client_info=info) + Returns: + storage.Client. + + """ + return storage.Client(client_info=_get_client_info(module)) def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]: diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py index eba8486..56d5b17 100644 --- a/google/cloud/documentai_toolbox/version.py +++ b/google/cloud/documentai_toolbox/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.5.0-alpha" +__version__ = "0.6.0-alpha" diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index be8eb11..5359b05 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -236,6 +236,106 @@ def _get_batch_process_metadata( return metadata +def _insert_into_dictionary_with_list(dic: Dict, key: str, value: str) -> Dict: + r"""Inserts value into a dictionary that can contain lists. + + Args: + dic (Dict): + Required. 
The dictionary to insert into. + key (str): + Required. The key to be created or inserted into. + value (str): + Required. The value to be inserted. + + Returns: + Dict: + The dictionary after adding the key value pair. + """ + existing_value = dic.get(key) + + if existing_value: + # For duplicate keys, + # Change Type to a List if not already + if not isinstance(existing_value, list): + existing_value = [existing_value] + + existing_value.append(value) + dic[key] = existing_value + else: + dic[key] = value + + return dic + + +def _bigquery_column_name(input_string: str) -> str: + r"""Converts a string into a BigQuery column name. + https://cloud.google.com/bigquery/docs/schemas#column_names + + Args: + input_string (str): + Required: The string to convert. + Returns: + str + The converted string. + + """ + char_map: Dict[str, str] = { + r":|;|\(|\)|\[|\]|,|\.|\?|\!|\'|\n": "", + r"/| ": "_", + r"#": "num", + r"@": "at", + } + + for key, value in char_map.items(): + input_string = re.sub(key, value, input_string) + + return input_string.lower() + + +def _dict_to_bigquery( + dic: Dict, + dataset_name: str, + table_name: str, + project_id: Optional[str], +) -> bigquery.job.LoadJob: + r"""Loads dictionary to a BigQuery table. + + Args: + dic (Dict): + Required: The dictionary to insert. + dataset_name (str): + Required. Name of the BigQuery dataset. + table_name (str): + Required. Name of the BigQuery table. + project_id (Optional[str]): + Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment. + Returns: + bigquery.job.LoadJob: + The BigQuery LoadJob for adding the dictionary. + + """ + bq_client = bigquery.Client( + project=project_id, client_info=gcs_utilities._get_client_info() + ) + table_ref = bigquery.DatasetReference( + project=project_id, dataset_id=dataset_name + ).table(table_name) + + job_config = bigquery.LoadJobConfig( + schema_update_options=[ + bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, + bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION, + ], + source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON, + ) + + return bq_client.load_table_from_json( + json_rows=[dic], + destination=table_ref, + job_config=job_config, + ) + + @dataclasses.dataclass class Document: r"""Represents a wrapped `Document`. @@ -478,6 +578,49 @@ def get_form_field_by_name(self, target_field: str) -> List[FormField]: return found_fields + def form_fields_to_dict(self) -> Dict: + r"""Returns Dictionary of form fields in document. + + Returns: + Dict: + The Dict of the form fields indexed by type. + + """ + form_fields_dict: Dict = {} + for p in self.pages: + for form_field in p.form_fields: + field_name = _bigquery_column_name(form_field.field_name) + form_fields_dict = _insert_into_dictionary_with_list( + form_fields_dict, field_name, form_field.field_value + ) + + return form_fields_dict + + def form_fields_to_bigquery( + self, dataset_name: str, table_name: str, project_id: Optional[str] = None + ) -> bigquery.job.LoadJob: + r"""Adds extracted form fields to a BigQuery table. + + Args: + dataset_name (str): + Required. Name of the BigQuery dataset. + table_name (str): + Required. Name of the BigQuery table. + project_id (Optional[str]): + Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment. + Returns: + bigquery.job.LoadJob: + The BigQuery LoadJob for adding the form fields. 
+ + """ + + return _dict_to_bigquery( + self.form_fields_to_dict(), + dataset_name, + table_name, + project_id, + ) + def get_entity_by_type(self, target_type: str) -> List[Entity]: r"""Returns the list of Entities of target_type. @@ -502,20 +645,10 @@ def entities_to_dict(self) -> Dict: """ entities_dict: Dict = {} for entity in self.entities: - entity_type = entity.type_.replace("/", "_") - - existing_entity = entities_dict.get(entity_type) - if not existing_entity: - entities_dict[entity_type] = entity.mention_text - continue - - # For entities that can have multiple (e.g. line_item) - # Change Entity Type to a List - if not isinstance(existing_entity, list): - existing_entity = [existing_entity] - - existing_entity.append(entity.mention_text) - entities_dict[entity_type] = existing_entity + entity_type = _bigquery_column_name(entity.type_) + entities_dict = _insert_into_dictionary_with_list( + entities_dict, entity_type, entity.mention_text + ) return entities_dict @@ -536,23 +669,12 @@ def entities_to_bigquery( The BigQuery LoadJob for adding the entities. """ - bq_client = bigquery.Client(project=project_id) - table_ref = bigquery.DatasetReference( - project=project_id, dataset_id=dataset_name - ).table(table_name) - - job_config = bigquery.LoadJobConfig( - schema_update_options=[ - bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION, - bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION, - ], - source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON, - ) - return bq_client.load_table_from_json( - json_rows=[self.entities_to_dict()], - destination=table_ref, - job_config=job_config, + return _dict_to_bigquery( + self.entities_to_dict(), + dataset_name, + table_name, + project_id, ) def split_pdf(self, pdf_path: str, output_path: str) -> List[str]: diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py index c311650..9c2d876 100644 --- a/google/cloud/documentai_toolbox/wrappers/page.py +++ b/google/cloud/documentai_toolbox/wrappers/page.py @@ -120,6 +120,21 @@ def _table_wrapper_from_documentai_table( ) +@dataclasses.dataclass +class Block: + """Represents a wrapped documentai.Document.Page.Block. + + Attributes: + documentai_block (google.cloud.documentai.Document.Page.Block): + Required. The original google.cloud.documentai.Document.Page.Block object. + text (str): + Required. UTF-8 encoded text. + """ + + documentai_block: documentai.Document.Page.Block + text: str + + @dataclasses.dataclass class Paragraph: """Represents a wrapped documentai.Document.Page.Paragraph. @@ -191,6 +206,32 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str return result_text +def _get_blocks(blocks: List[documentai.Document.Page.Block], text: str) -> List[Block]: + r"""Returns a list of Block. + + Args: + blocks (List[documentai.Document.Page.Block]): + Required. A list of documentai.Document.Page.Block objects. + text (str): + Required. UTF-8 encoded text in reading order + from the document. + Returns: + List[Block]: + A list of Blocks. + """ + result = [] + + for block in blocks: + result.append( + Block( + documentai_block=block, + text=_text_from_layout(layout=block.layout, text=text), + ) + ) + + return result + + def _get_paragraphs( paragraphs: List[documentai.Document.Page.Paragraph], text: str ) -> List[Paragraph]: @@ -339,6 +380,10 @@ class Page: Required. A list of visually detected text paragraphs on the page. A collection of lines that a human would perceive as a paragraph. 
+ blocks (List[Block]): + Required. A list of visually detected text blocks + on the page. A collection of lines that a human + would perceive as a block. tables (List[Table]): Required. A list of visually detected tables on the page. @@ -350,6 +395,7 @@ class Page: form_fields: List[FormField] = dataclasses.field(init=False, repr=False) lines: List[Line] = dataclasses.field(init=False, repr=False) paragraphs: List[Paragraph] = dataclasses.field(init=False, repr=False) + blocks: List[Block] = dataclasses.field(init=False, repr=False) tables: List[Table] = dataclasses.field(init=False, repr=False) def __post_init__(self): @@ -369,4 +415,5 @@ def __post_init__(self): self.paragraphs = _get_paragraphs( paragraphs=self.documentai_page.paragraphs, text=self.text ) + self.blocks = _get_blocks(blocks=self.documentai_page.blocks, text=self.text) self.tables = tables diff --git a/samples/snippets/entities_to_bigquery_sample.py b/samples/snippets/entities_to_bigquery_sample.py index 0709680..c2397c2 100644 --- a/samples/snippets/entities_to_bigquery_sample.py +++ b/samples/snippets/entities_to_bigquery_sample.py @@ -42,6 +42,11 @@ def entities_to_bigquery_sample( dataset_name=dataset_name, table_name=table_name, project_id=project_id ) + # Also supported: + # job = wrapped_document.form_fields_to_bigquery( + # dataset_name=dataset_name, table_name=table_name, project_id=project_id + # ) + print("Document entities loaded into BigQuery") print(f"Job ID: {job.job_id}") print(f"Table: {job.destination.path}") diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index 33ff8c0..ea59080 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -41,6 +41,8 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None: for idx, page in enumerate(wrapped_document.pages): print(f"Page {idx}") + for block in page.blocks: + print(block.text) for paragraph in page.paragraphs: print(paragraph.text) diff --git a/samples/snippets/requirements-test.txt b/samples/snippets/requirements-test.txt index f4d3da5..bc9a2a5 100644 --- a/samples/snippets/requirements-test.txt +++ b/samples/snippets/requirements-test.txt @@ -1,3 +1,3 @@ -pytest==7.3.0 -mock==5.0.1 -google-cloud-bigquery==3.9.0 +pytest==7.3.1 +mock==5.0.2 +google-cloud-bigquery==3.10.0 diff --git a/samples/snippets/requirements.txt b/samples/snippets/requirements.txt index 7baeb2d..d222d55 100644 --- a/samples/snippets/requirements.txt +++ b/samples/snippets/requirements.txt @@ -1,4 +1,4 @@ -google-cloud-bigquery==3.9.0 +google-cloud-bigquery==3.10.0 google-cloud-documentai==2.15.0 google-cloud-storage==2.8.0 google-cloud-documentai-toolbox==0.4.1a0 diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index 0b358ab..94fb78f 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -186,7 +186,7 @@ def test_get_batch_process_metadata_with_no_metadata(mock_docai): @mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai") -def test_document_from_batch_process_operation_with_invalid_metadata_type(mock_docai): +def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai): with pytest.raises( ValueError, match="Operation metadata type is not", @@ -206,6 +206,19 @@ def test_document_from_batch_process_operation_with_invalid_metadata_type(mock_d document._get_batch_process_metadata(location, operation_name) +def test_bigquery_column_name(): + string_map = { + "Phone #:": "phone_num", + "Emergency Contact:": 
"emergency_contact", + "Marital Status:": "marital_status", + "Are you currently taking any medication? (If yes, please describe):": "are_you_currently_taking_any_medication_if_yes_please_describe", + "Describe your medical concerns (symptoms, diagnoses, etc):": "describe_your_medical_concerns_symptoms_diagnoses_etc", + } + + for key, value in string_map.items(): + assert document._bigquery_column_name(key) == value + + def test_document_from_document_path_with_single_shard(): actual = document.Document.from_document_path( document_path="tests/unit/resources/0/toolbox_invoice_test-0.json" @@ -401,6 +414,43 @@ def test_get_form_field_by_name(get_bytes_form_parser_mock): assert actual[0].field_value == "(906) 917-3486" +def test_form_fields_to_dict(get_bytes_form_parser_mock): + doc = document.Document.from_gcs( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" + ) + actual = doc.form_fields_to_dict() + + get_bytes_form_parser_mock.assert_called_once() + + assert len(actual) == 17 + assert actual.get("address") == "24 Barney Lane" + assert actual.get("city") == "Towaco" + + +@mock.patch("google.cloud.documentai_toolbox.wrappers.document.bigquery") +def test_form_fields_to_bigquery(mock_bigquery, get_bytes_form_parser_mock): + client = mock_bigquery.Client.return_value + + mock_table = mock.Mock() + client.dataset.table.return_value = mock_table + + mock_load_job = mock.Mock() + client.load_table_from_json.return_value = mock_load_job + + doc = document.Document.from_gcs( + gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" + ) + + actual = doc.form_fields_to_bigquery( + dataset_name="test_dataset", table_name="test_table", project_id="test_project" + ) + + get_bytes_form_parser_mock.assert_called_once() + mock_bigquery.Client.assert_called_once() + + assert actual + + def test_entities_to_dict(get_bytes_single_file_mock): doc = document.Document.from_gcs( gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0" diff --git a/tests/unit/test_page.py b/tests/unit/test_page.py index 4299fc3..75915aa 100644 --- a/tests/unit/test_page.py +++ b/tests/unit/test_page.py @@ -172,6 +172,15 @@ def test_text_from_element_with_layout(docproto): assert text == "Invoice\n" +def test_get_blocks(docproto): + docproto_blocks = docproto.pages[0].blocks + + blocks = page._get_blocks(blocks=docproto_blocks, text=docproto.text) + + assert len(blocks) == 31 + assert blocks[0].text == "Invoice\n" + + def test_get_paragraphs(docproto): docproto_paragraphs = docproto.pages[0].paragraphs @@ -218,6 +227,13 @@ def test_FormField(): assert form_field.field_value == "Sally Walker" +def test_Block(): + docai_block = documentai.Document.Page.Block() + block = page.Block(documentai_block=docai_block, text="test_block") + + assert block.text == "test_block" + + def test_Paragraph(): docai_paragraph = documentai.Document.Page.Paragraph() paragraph = page.Paragraph( @@ -254,5 +270,7 @@ def test_Page(docproto): assert len(wrapped_page.lines) == 37 assert len(wrapped_page.paragraphs) == 31 + assert len(wrapped_page.blocks) == 31 assert wrapped_page.lines[0].text == "Invoice\n" assert wrapped_page.paragraphs[30].text == "Supplies used for Project Q.\n" + assert wrapped_page.blocks[30].text == "Supplies used for Project Q.\n"