Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Chore: Add DocumentWrapper, EntityWrapper, PageWrapper (#3)
* Added owlbot templated files * updated repo-metadata * Fixed Kokoro CI errors * Fixed failing tests * added test file to documentai_toolbox to test docs * changed docs files * Added DocumentWrapper, EntityWrapper, PageWrapper * Fixed code per comments * Refactored code * updated code * updated code * refactored imports * added storage dependency to setup.py * fixed lint issues * refactored code and added tests * refactored code and tests * removed samples contents
- Loading branch information
Showing
21 changed files
with
400 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
109 changes: 109 additions & 0 deletions
109
google/cloud/documentai_toolbox/wrappers/document_wrapper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2022 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Wrappers for Document AI Document type.""" | ||
|
||
import dataclasses | ||
import re | ||
from typing import List | ||
|
||
from google.cloud import documentai | ||
from google.cloud import storage | ||
|
||
from google.cloud.documentai_toolbox.wrappers import page_wrapper, entity_wrapper | ||
|
||
|
||
def _entities_from_shards(
    shards: List[documentai.Document],
) -> List[entity_wrapper.EntityWrapper]:
    """Wrap every entity found in the given Document shards.

    Args:
        shards: The Document shards to read entities from.

    Returns:
        One EntityWrapper per entity, ordered by shard and then by the
        entity's position inside its shard.
    """
    return [
        entity_wrapper.EntityWrapper.from_documentai_entity(entity)
        for shard in shards
        for entity in shard.entities
    ]
|
||
|
||
def _pages_from_shards(
    shards: List[documentai.Document],
) -> List[page_wrapper.PageWrapper]:
    """Wrap every page found in the given Document shards.

    Args:
        shards: The Document shards to read pages from.

    Returns:
        One PageWrapper per page, ordered by shard and then by the page's
        position inside its shard.
    """
    result = []
    for shard in shards:
        # Each page is resolved against the text of the shard that holds it.
        shard_text = shard.text
        result.extend(
            page_wrapper.PageWrapper.from_documentai_page(page, shard_text)
            for page in shard.pages
        )
    return result
|
||
|
||
def _get_bytes(output_bucket: str, output_prefix: str) -> List[bytes]:
    """Download the content of every ``.json`` blob under a bucket prefix.

    Args:
        output_bucket: Name of the bucket to list.
        output_prefix: Prefix passed to the blob listing.

    Returns:
        The downloaded bytes of each blob whose name ends with ``.json``,
        in the order the listing yields them.
    """
    client = storage.Client()
    return [
        blob.download_as_bytes()
        for blob in client.list_blobs(output_bucket, prefix=output_prefix)
        if blob.name.endswith(".json")
    ]
|
||
|
||
def _read_output(gcs_prefix: str) -> List[documentai.Document]:
    """Load the Document shards stored under a ``gs://`` prefix.

    Args:
        gcs_prefix: Location of the shards, as ``gs://<bucket>/<prefix>``.

    Returns:
        One Document per JSON payload returned by ``_get_bytes``.

    Raises:
        ValueError: If ``gcs_prefix`` is not of the form
            ``gs://<bucket>/<prefix>``, or if the prefix part contains a
            ``.`` (which is treated as naming a file).
    """
    uri_match = re.match(r"gs://(.*?)/(.*)", gcs_prefix)
    if uri_match is None:
        raise ValueError("gcs_prefix does not match accepted format")

    output_bucket, output_prefix = uri_match.groups()

    # Any dot in the prefix is taken to be a file extension.
    if re.match(r"(.*[.].*$)", output_prefix) is not None:
        raise ValueError("gcs_prefix cannot contain file types")

    return [
        documentai.Document.from_json(payload)
        for payload in _get_bytes(output_bucket, output_prefix)
    ]
|
||
|
||
@dataclasses.dataclass
class DocumentWrapper:
    """Represents a wrapped Document.

    A single Document protobuf message might be written as several JSON files on
    GCS by Document AI's BatchProcessDocuments method. This class hides away the
    shards from the users and implements convenient methods for searching and
    extracting information within the Document.
    """

    # The only constructor argument: the gs:// location of the shards.
    gcs_prefix: str

    # The remaining fields are excluded from __init__ and repr; they are
    # filled in by __post_init__ from the shards read at ``gcs_prefix``.
    pages: List[page_wrapper.PageWrapper] = dataclasses.field(init=False, repr=False)
    entities: List[entity_wrapper.EntityWrapper] = dataclasses.field(
        init=False, repr=False
    )
    _shards: List[documentai.Document] = dataclasses.field(init=False, repr=False)

    def __post_init__(self):
        """Read the shards and build the wrapped pages and entities."""
        loaded_shards = _read_output(self.gcs_prefix)
        self._shards = loaded_shards
        self.pages = _pages_from_shards(shards=loaded_shards)
        self.entities = _entities_from_shards(shards=loaded_shards)
40 changes: 40 additions & 0 deletions
40
google/cloud/documentai_toolbox/wrappers/entity_wrapper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2022 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Wrappers for Document AI Entity type.""" | ||
|
||
import dataclasses | ||
|
||
from google.cloud import documentai | ||
|
||
|
||
@dataclasses.dataclass
class EntityWrapper:
    """Represents a wrapped documentai.Document.Entity.

    This class hides away the complexity of documentai Entity message type.
    """

    # Copied from the wrapped entity's ``type`` attribute.
    type_: str
    # Copied from the wrapped entity's ``mention_text`` attribute.
    mention_text: str
    # The Document AI entity this wrapper was built from.
    _documentai_entity: documentai.Document.Entity

    @classmethod
    def from_documentai_entity(
        cls, documentai_entity: documentai.Document.Entity
    ) -> "EntityWrapper":
        """Build a wrapper from a Document AI entity.

        Args:
            documentai_entity: The entity to wrap.

        Returns:
            An instance of ``cls`` holding the entity's type, its mention
            text and the entity itself.
        """
        # Construct through ``cls`` so that subclasses get instances of
        # themselves rather than of the base class.
        return cls(
            type_=documentai_entity.type,
            mention_text=documentai_entity.mention_text,
            _documentai_entity=documentai_entity,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2022 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
"""Wrappers for Document AI Page type.""" | ||
|
||
import dataclasses | ||
from typing import List, Union | ||
|
||
from google.cloud import documentai | ||
|
||
# Page element types accepted by _text_from_element_with_layout, which reads
# each element's ``layout.text_anchor.text_segments``.
ElementWithLayout = Union[
    documentai.Document.Page.Paragraph,
    documentai.Document.Page.Line,
    documentai.Document.Page.Token,
]
|
||
|
||
def _text_from_element_with_layout(
    element_with_layout: List["ElementWithLayout"], text: str
) -> List[str]:
    """Return the text of each element, sliced out of the document text.

    Args:
        element_with_layout: Elements whose ``layout.text_anchor`` holds the
            text segments to extract.
        text: The text that the segments' start and end indices refer to.

    Returns:
        One string per element, in input order. An element without any text
        segment yields an empty string.
    """
    result = []
    for element in element_with_layout:
        # If a text segment spans several lines, it will be stored in
        # different text segments, so the element's text is the concatenation
        # of all of them, not just the last one.
        result.append(
            "".join(
                text[int(text_segment.start_index) : int(text_segment.end_index)]
                for text_segment in element.layout.text_anchor.text_segments
            )
        )
    return result
|
||
|
||
@dataclasses.dataclass
class PageWrapper:
    """Represents a wrapped documentai.Document.Page.

    This class hides away the complexity of documentai page message type and
    implements convenient methods for searching and extracting information within
    the Document.
    """

    # Text extracted for each entry of the wrapped page's ``lines``.
    lines: List[str]
    # Text extracted for each entry of the wrapped page's ``paragraphs``.
    paragraphs: List[str]
    # Text extracted for each entry of the wrapped page's ``tokens``.
    tokens: List[str]
    # The Document AI page this wrapper was built from.
    _documentai_page: documentai.Document.Page

    @classmethod
    def from_documentai_page(
        cls, documentai_page: documentai.Document.Page, text: str
    ) -> "PageWrapper":
        """Build a wrapper from a Document AI page.

        Args:
            documentai_page: The page to wrap.
            text: The text that the page's text anchors index into.

        Returns:
            An instance of ``cls`` holding the extracted line, paragraph and
            token texts together with the page itself.
        """
        # Construct through ``cls`` so that subclasses get instances of
        # themselves rather than of the base class.
        return cls(
            lines=_text_from_element_with_layout(documentai_page.lines, text),
            paragraphs=_text_from_element_with_layout(
                documentai_page.paragraphs, text
            ),
            tokens=_text_from_element_with_layout(documentai_page.tokens, text),
            _documentai_page=documentai_page,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# This constraints file is used to check that lower bounds | ||
# are correct in setup.py | ||
# List *all* library dependencies and extras in this file. | ||
# Pin the version to the lower bound. | ||
# | ||
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev", | ||
# Then this file should have foo==1.14.0 |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# -*- coding: utf-8 -*- | ||
# Copyright 2022 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
|
||
# try/except added for compatibility with python < 3.8 | ||
try: | ||
from unittest import mock | ||
except ImportError: # pragma: NO COVER | ||
import mock | ||
|
||
import pytest | ||
import glob | ||
|
||
from google.cloud.documentai_toolbox.wrappers import DocumentWrapper, document_wrapper | ||
|
||
from google.cloud import documentai | ||
|
||
|
||
def get_bytes(file_name):
    """Read every ``*.json`` file located directly inside a directory.

    Args:
        file_name: Path of the directory to scan (despite its name, this is
            used as a directory, not as a single file).

    Returns:
        The raw bytes of each matching file, ordered by file path.
    """
    # glob makes no promise about ordering; sort so that fixtures made of
    # several shards are always loaded in the same order.
    json_paths = sorted(glob.glob(os.path.join(file_name, "*.json")))

    result = []
    for json_path in json_paths:
        # Relative paths are resolved against the current working directory.
        with open(os.path.join(os.getcwd(), json_path), "rb") as f:
            result.append(f.read())

    return result
|
||
|
||
def test_read_output_with_gcs_uri_contains_file_type():
    """A prefix that names a file instead of a directory is rejected."""
    file_uri = "gs://test-directory/documentai/output/123456789/0.json"

    with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"):
        document_wrapper._read_output(file_uri)
|
||
|
||
def test_read_output_with_invalid_gcs_uri():
    """A prefix without the ``gs://`` scheme is rejected."""
    schemeless_uri = "test-directory/documentai/output/"

    with pytest.raises(ValueError, match="gcs_prefix does not match accepted format"):
        document_wrapper._read_output(schemeless_uri)
|
||
|
||
def test_read_output_with_valid_gcs_uri():
    """Shards are parsed from the bytes returned by the patched ``_get_bytes``."""
    fixture_bytes = get_bytes("tests/unit/resources/0")

    with mock.patch.object(
        document_wrapper, "_get_bytes", return_value=fixture_bytes
    ):
        shards = document_wrapper._read_output(
            "gs://test-directory/documentai/output/123456789/0"
        )

    # We are testing only one of the fields to make sure the file content
    # could be loaded.
    assert shards[0].pages[0].page_number == 1
|
||
|
||
def test_pages_from_shards():
    """Pages built from the fixture shards expose their paragraphs."""
    shards = [
        documentai.Document.from_json(payload)
        for payload in get_bytes("tests/unit/resources/0")
    ]

    wrapped_pages = document_wrapper._pages_from_shards(shards=shards)

    assert len(wrapped_pages[0].paragraphs) == 31
|
||
|
||
def test_entities_from_shard():
    """Entities built from the fixture shards keep their type and mention text."""
    shards = [
        documentai.Document.from_json(payload)
        for payload in get_bytes("tests/unit/resources/0")
    ]

    wrapped_entities = document_wrapper._entities_from_shards(shards=shards)
    first_entity = wrapped_entities[0]

    assert first_entity.mention_text == "$140.00"
    assert first_entity.type_ == "vat"
|
||
|
||
def test_document_wrapper_with_single_shard():
    """A DocumentWrapper built from the single-shard fixture has one page."""
    fixture_bytes = get_bytes("tests/unit/resources/0")

    with mock.patch.object(
        document_wrapper, "_get_bytes", return_value=fixture_bytes
    ):
        wrapped = DocumentWrapper("gs://test-directory/documentai/output/123456789/0")

    assert len(wrapped.pages) == 1
|
||
|
||
def test_document_wrapper_with_multiple_shards():
    """A DocumentWrapper built from the multi-shard fixture has 48 pages."""
    fixture_bytes = get_bytes("tests/unit/resources/1")

    with mock.patch.object(
        document_wrapper, "_get_bytes", return_value=fixture_bytes
    ):
        wrapped = DocumentWrapper("gs://test-directory/documentai/output/123456789/1")

    assert len(wrapped.pages) == 48
Oops, something went wrong.