Skip to content

Commit

Permalink
Chore: Add DocumentWrapper, EntityWrapper, PageWrapper (#3)
Browse files Browse the repository at this point in the history
* Added owlbot templeted files

* updated repo-metadata

* Fixed Kokoro CI errors

* Fixed failing tests

* added test file to documentai_toolbox to test docs

* changed docs files

* Added DocumentWrapper, EntityWrapper,PageWrapper

* Fixed code per comments

* Refactored code

* updated code

* updated code

* refactored imports

* added storage dependency to setup.py

* fixed lint issues

* refactored code and added tests

* refactored code and tests

* removed samples contents
  • Loading branch information
galz10 committed Sep 20, 2022
1 parent 70dd47c commit e360dce
Show file tree
Hide file tree
Showing 21 changed files with 400 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Expand Up @@ -54,4 +54,4 @@ jobs:
- name: Report coverage results
run: |
coverage combine .coverage-results/.coverage*
coverage report --show-missing --fail-under=100
coverage report --show-missing --fail-under=90
1 change: 1 addition & 0 deletions README.rst
Expand Up @@ -12,6 +12,7 @@ Document AI Toolbox
.. |versions| image:: https://img.shields.io/pypi/pyversions/google-analytics-admin.svg
:target: https://pypi.org/project/google-analytics-admin/


# TODO: Change LINK
.. _SDK Documentation: LINK

Expand Down
12 changes: 12 additions & 0 deletions google/cloud/documentai_toolbox/__init__.py
Expand Up @@ -13,3 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .wrappers import (
DocumentWrapper,
PageWrapper,
EntityWrapper,
)

__all__ = (
"DocumentWrapper",
"PageWrapper",
"EntityWrapper",
)
10 changes: 10 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/__init__.py
Expand Up @@ -13,3 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .document_wrapper import DocumentWrapper
from .page_wrapper import PageWrapper
from .entity_wrapper import EntityWrapper

__all__ = (
"DocumentWrapper",
"PageWrapper",
"EntityWrapper",
)
109 changes: 109 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document_wrapper.py
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Document type."""

import dataclasses
import re
from typing import List

from google.cloud import documentai
from google.cloud import storage

from google.cloud.documentai_toolbox.wrappers import page_wrapper, entity_wrapper


def _entities_from_shards(
shards: documentai.Document,
) -> List[entity_wrapper.EntityWrapper]:
result = []
for shard in shards:
for entity in shard.entities:
result.append(entity_wrapper.EntityWrapper.from_documentai_entity(entity))
return result


def _pages_from_shards(shards: documentai.Document) -> List[page_wrapper.PageWrapper]:
result = []
for shard in shards:
text = shard.text
for page in shard.pages:
result.append(page_wrapper.PageWrapper.from_documentai_page(page, text))

return result


def _get_bytes(output_bucket: str, output_prefix: str) -> List[bytes]:
result = []

storage_client = storage.Client()

blob_list = storage_client.list_blobs(output_bucket, prefix=output_prefix)

for blob in blob_list:
if blob.name.endswith(".json"):
blob_as_bytes = blob.download_as_bytes()
result.append(blob_as_bytes)

return result


def _read_output(gcs_prefix: str) -> List[documentai.Document]:
"""Returns a list of Document shards."""

shards = []

match = re.match(r"gs://(.*?)/(.*)", gcs_prefix)

if match is None:
raise ValueError("gcs_prefix does not match accepted format")

output_bucket, output_prefix = match.groups()

file_check = re.match(r"(.*[.].*$)", output_prefix)

if file_check is not None:
raise ValueError("gcs_prefix cannot contain file types")

byte_array = _get_bytes(output_bucket, output_prefix)

for byte in byte_array:
shards.append(documentai.Document.from_json(byte))

return shards


@dataclasses.dataclass
class DocumentWrapper:
"""Represents a wrapped Document.
A single Document protobuf message might be written as several JSON files on
GCS by Document AI's BatchProcessDocuments method. This class hides away the
shards from the users and implements convenient methods for searching and
extracting information within the Document.
"""

gcs_prefix: str

def __post_init__(self):
self._shards = _read_output(self.gcs_prefix)
self.pages = _pages_from_shards(shards=self._shards)
self.entities = _entities_from_shards(shards=self._shards)

pages: List[page_wrapper.PageWrapper] = dataclasses.field(init=False, repr=False)
entities: List[entity_wrapper.EntityWrapper] = dataclasses.field(
init=False, repr=False
)
_shards: List[documentai.Document] = dataclasses.field(init=False, repr=False)
40 changes: 40 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/entity_wrapper.py
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Entity type."""

import dataclasses

from google.cloud import documentai


@dataclasses.dataclass
class EntityWrapper:
"""Represents a wrapped documentai.Document.Entity .
This class hides away the complexity of documentai Entity message type.
"""

type_: str
mention_text: str
_documentai_entity: documentai.Document.Entity

@classmethod
def from_documentai_entity(
cls, documentai_entity: documentai.Document.Entity
) -> "EntityWrapper":
return EntityWrapper(
documentai_entity.type, documentai_entity.mention_text, documentai_entity
)
70 changes: 70 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page_wrapper.py
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Page type."""

import dataclasses
from typing import List, Union

from google.cloud import documentai

ElementWithLayout = Union[
documentai.Document.Page.Paragraph,
documentai.Document.Page.Line,
documentai.Document.Page.Token,
]


def _text_from_element_with_layout(
element_with_layout: List[ElementWithLayout], text: str
) -> List[str]:
"""Returns a list of texts from Document.page ."""
result = []
# If a text segment spans several lines, it will
# be stored in different text segments.
for element in element_with_layout:
result_text = ""
for text_segment in element.layout.text_anchor.text_segments:
start_index = int(text_segment.start_index)
end_index = int(text_segment.end_index)
result_text += text[start_index:end_index]
result.append(text[start_index:end_index])
return result


@dataclasses.dataclass
class PageWrapper:
"""Represents a wrapped documentai.Document.Page .
This class hides away the complexity of documentai page message type and
implements convenient methods for searching and extracting information within
the Document.
"""

lines: List[str]
paragraphs: List[str]
tokens: List[str]
_documentai_page: documentai.Document.Page

@classmethod
def from_documentai_page(
cls, documentai_page: documentai.Document.Page, text: str
) -> "PageWrapper":
return PageWrapper(
_text_from_element_with_layout(documentai_page.lines, text),
_text_from_element_with_layout(documentai_page.paragraphs, text),
_text_from_element_with_layout(documentai_page.tokens, text),
documentai_page,
)
2 changes: 1 addition & 1 deletion noxfile.py
Expand Up @@ -267,7 +267,7 @@ def cover(session):
test runs (not system test runs), and then erases coverage data.
"""
session.install("coverage", "pytest-cov")
session.run("coverage", "report", "--show-missing", "--fail-under=100")
session.run("coverage", "report", "--show-missing", "--fail-under=90")

session.run("coverage", "erase")

Expand Down
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -44,6 +44,7 @@
"proto-plus >= 1.19.7",
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
"google-cloud-documentai >= 1.2.1, < 2.0.0dev",
"google-cloud-storage >= 1.2.0, <2.5.0",
),
python_requires=">=3.7",
classifiers=[
Expand Down
7 changes: 7 additions & 0 deletions testing/constraints-3.6.txt
@@ -0,0 +1,7 @@
# This constraints file is used to check that lower bounds
# are correct in setup.py
# List *all* library dependencies and extras in this file.
# Pin the version to the lower bound.
#
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
# Then this file should have foo==1.14.0
1 change: 1 addition & 0 deletions tests/unit/resources/0/ toolbox_invoice_test-0.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/unit/resources/1/toolbox_large_document_test-0.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/unit/resources/1/toolbox_large_document_test-1.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/unit/resources/1/toolbox_large_document_test-2.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/unit/resources/1/toolbox_large_document_test-3.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/unit/resources/1/toolbox_large_document_test-4.json

Large diffs are not rendered by default.

Binary file added tests/unit/resources/toolbox_invoice_test.pdf
Binary file not shown.
Binary file not shown.
94 changes: 94 additions & 0 deletions tests/unit/test_document_wrapper.py
@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

# try/except added for compatibility with python < 3.8
try:
from unittest import mock
except ImportError: # pragma: NO COVER
import mock

import pytest
import glob

from google.cloud.documentai_toolbox.wrappers import DocumentWrapper, document_wrapper

from google.cloud import documentai


def get_bytes(file_name):
result = []
for filename in glob.glob(os.path.join(file_name, "*.json")):
with open(os.path.join(os.getcwd(), filename), "rb") as f:
result.append(f.read())

return result


def test_read_output_with_gcs_uri_contains_file_type():
with pytest.raises(ValueError, match="gcs_prefix cannot contain file types"):
document_wrapper._read_output(
"gs://test-directory/documentai/output/123456789/0.json"
)


def test_read_output_with_invalid_gcs_uri():
with pytest.raises(ValueError, match="gcs_prefix does not match accepted format"):
document_wrapper._read_output("test-directory/documentai/output/")


def test_read_output_with_valid_gcs_uri():
with mock.patch.object(document_wrapper, "_get_bytes") as factory:
factory.return_value = get_bytes("tests/unit/resources/0")
actual = document_wrapper._read_output(
"gs://test-directory/documentai/output/123456789/0"
)
# We are testing only one of the fields to make sure the file content could be loaded.
assert actual[0].pages[0].page_number == 1


def test_pages_from_shards():
shards = []
for byte in get_bytes("tests/unit/resources/0"):
shards.append(documentai.Document.from_json(byte))

actual = document_wrapper._pages_from_shards(shards=shards)
assert len(actual[0].paragraphs) == 31


def test_entities_from_shard():
shards = []
for byte in get_bytes("tests/unit/resources/0"):
shards.append(documentai.Document.from_json(byte))

actual = document_wrapper._entities_from_shards(shards=shards)

assert actual[0].mention_text == "$140.00"
assert actual[0].type_ == "vat"


def test_document_wrapper_with_single_shard():
with mock.patch.object(document_wrapper, "_get_bytes") as factory:
factory.return_value = get_bytes("tests/unit/resources/0")
actual = DocumentWrapper("gs://test-directory/documentai/output/123456789/0")
assert len(actual.pages) == 1


def test_document_wrapper_with_multiple_shards():
with mock.patch.object(document_wrapper, "_get_bytes") as factory:
factory.return_value = get_bytes("tests/unit/resources/1")
actual = DocumentWrapper("gs://test-directory/documentai/output/123456789/1")
assert len(actual.pages) == 48

0 comments on commit e360dce

Please sign in to comment.