Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chore: Add DocumentWrapper, EntityWrapper, PageWrapper #3

Merged
merged 19 commits into from
Sep 20, 2022
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ jobs:
- name: Report coverage results
run: |
coverage combine .coverage-results/.coverage*
coverage report --show-missing --fail-under=100
coverage report --show-missing --fail-under=90
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Document AI Toolbox
.. |versions| image:: https://img.shields.io/pypi/pyversions/google-analytics-admin.svg
:target: https://pypi.org/project/google-analytics-admin/


# TODO: Change LINK
.. _SDK Documentation: LINK

Expand Down
12 changes: 12 additions & 0 deletions google/cloud/documentai_toolbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .wrappers import (
DocumentWrapper,
PageWrapper,
EntityWrapper,
)

__all__ = (
"DocumentWrapper",
"PageWrapper",
"EntityWrapper",
)
10 changes: 10 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .document_wrapper import DocumentWrapper
from .page_wrapper import PageWrapper
from .entity_wrapper import EntityWrapper

__all__ = (
"DocumentWrapper",
"PageWrapper",
"EntityWrapper",
)
109 changes: 109 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Document type."""

import dataclasses
import re
from typing import List

from google.cloud import documentai
from google.cloud import storage

from google.cloud.documentai_toolbox.wrappers import page_wrapper, entity_wrapper


def _entities_from_shards(
shards: documentai.Document,
) -> List[entity_wrapper.EntityWrapper]:
result = []
for shard in shards:
for entity in shard.entities:
result.append(entity_wrapper.EntityWrapper.from_documentai_entity(entity))
return result


def _pages_from_shards(shards: documentai.Document) -> List[page_wrapper.PageWrapper]:
result = []
for shard in shards:
text = shard.text
for page in shard.pages:
result.append(page_wrapper.PageWrapper.from_documentai_page(page, text))

return result


def _get_bytes(output_bucket: str, output_prefix: str) -> List[bytes]:
result = []

storage_client = storage.Client()

blob_list = storage_client.list_blobs(output_bucket, prefix=output_prefix)

for blob in blob_list:
if blob.name.endswith(".json"):
blob_as_bytes = blob.download_as_bytes()
result.append(blob_as_bytes)

return result


def _read_output(gcs_prefix: str) -> List[documentai.Document]:
galz10 marked this conversation as resolved.
Show resolved Hide resolved
"""Returns a list of Document shards."""

shards = []

match = re.match(r"gs://(.*?)/(.*)", gcs_prefix)

if match is None:
raise ValueError("gcs_prefix does not match accepted format")

output_bucket, output_prefix = match.groups()

file_check = re.match(r"(.*[.].*$)", output_prefix)
galz10 marked this conversation as resolved.
Show resolved Hide resolved

if file_check is not None:
raise ValueError("gcs_prefix cannot contain file types")

byte_array = _get_bytes(output_bucket, output_prefix)

for byte in byte_array:
shards.append(documentai.Document.from_json(byte))

return shards


@dataclasses.dataclass
class DocumentWrapper:
"""Represents a wrapped Document.

A single Document protobuf message might be written as several JSON files on
GCS by Document AI's BatchProcessDocuments method. This class hides away the
shards from the users and implements convenient methods for searching and
extracting information within the Document.
"""

gcs_prefix: str
galz10 marked this conversation as resolved.
Show resolved Hide resolved

def __post_init__(self):
self._shards = _read_output(self.gcs_prefix)
self.pages = _pages_from_shards(shards=self._shards)
self.entities = _entities_from_shards(shards=self._shards)

pages: List[page_wrapper.PageWrapper] = dataclasses.field(init=False, repr=False)
entities: List[entity_wrapper.EntityWrapper] = dataclasses.field(
init=False, repr=False
)
_shards: List[documentai.Document] = dataclasses.field(init=False, repr=False)
40 changes: 40 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/entity_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Entity type."""

import dataclasses

from google.cloud import documentai


@dataclasses.dataclass
class EntityWrapper:
"""Represents a wrapped documentai.Document.Entity .

This class hides away the complexity of documentai Entity message type.
"""

type_: str
mention_text: str
galz10 marked this conversation as resolved.
Show resolved Hide resolved
_documentai_entity: documentai.Document.Entity

@classmethod
def from_documentai_entity(
cls, documentai_entity: documentai.Document.Entity
) -> "EntityWrapper":
return EntityWrapper(
documentai_entity.type, documentai_entity.mention_text, documentai_entity
)
70 changes: 70 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Page type."""

import dataclasses
from typing import List, Union

from google.cloud import documentai

ElementWithLayout = Union[
documentai.Document.Page.Paragraph,
documentai.Document.Page.Line,
documentai.Document.Page.Token,
]


def _text_from_element_with_layout(
element_with_layout: List[ElementWithLayout], text: str
) -> List[str]:
"""Returns a list of texts from Document.page ."""
result = []
# If a text segment spans several lines, it will
# be stored in different text segments.
for element in element_with_layout:
result_text = ""
for text_segment in element.layout.text_anchor.text_segments:
start_index = int(text_segment.start_index)
end_index = int(text_segment.end_index)
result_text += text[start_index:end_index]
result.append(text[start_index:end_index])
return result


@dataclasses.dataclass
class PageWrapper:
"""Represents a wrapped documentai.Document.Page .

This class hides away the complexity of documentai page message type and
implements convenient methods for searching and extracting information within
the Document.
"""

lines: List[str]
galz10 marked this conversation as resolved.
Show resolved Hide resolved
paragraphs: List[str]
tokens: List[str]
_documentai_page: documentai.Document.Page

@classmethod
def from_documentai_page(
cls, documentai_page: documentai.Document.Page, text: str
) -> "PageWrapper":
return PageWrapper(
_text_from_element_with_layout(documentai_page.lines, text),
_text_from_element_with_layout(documentai_page.paragraphs, text),
_text_from_element_with_layout(documentai_page.tokens, text),
documentai_page,
)
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def cover(session):
test runs (not system test runs), and then erases coverage data.
"""
session.install("coverage", "pytest-cov")
session.run("coverage", "report", "--show-missing", "--fail-under=100")
session.run("coverage", "report", "--show-missing", "--fail-under=90")

session.run("coverage", "erase")

Expand Down
75 changes: 75 additions & 0 deletions samples/samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Python wrappers for Document AI message types."""

from dataclasses import dataclass, field
import re
import time
from typing import List

from google.cloud import documentai
from google.cloud.documentai_toolbox import DocumentWrapper


PROJECT_ID = "valiant-marker-319718"
LOCATION = "us" # Format is 'us' or 'eu'
PROCESSOR_ID = "86bd7a6996805c20" # Create processor in Cloud Console

# Format 'gs://input_bucket/directory'
GCS_INPUT_PREFIX = "gs://gal-cloud-samples/documentai/input"

# Format 'gs://output_bucket/directory'
GCS_OUTPUT_URI = "gs://gal-cloud-samples/documentai/output"

def batch_process():
opts = {}
if LOCATION == "eu":
opts = {"api_endpoint": "eu-documentai.googleapis.com"}

# Instantiates a client
docai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

RESOURCE_NAME = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)

# Cloud Storage URI for the Input Directory
gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=GCS_INPUT_PREFIX)

# Load GCS Input URI into Batch Input Config
input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)

# Cloud Storage URI for Output directory
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
gcs_uri=GCS_OUTPUT_URI
)

# Load GCS Output URI into OutputConfig object
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

# Configure Process Request
request = documentai.BatchProcessRequest(
name=RESOURCE_NAME,
input_documents=input_config,
document_output_config=output_config,
)

# Batch Process returns a Long Running Operation (LRO)
operation = docai_client.batch_process_documents(request)

print(f"Waiting for operation {operation.operation.name} to complete...")
operation.result()

print("Document processing complete.")



def main() -> None:
batch_process()

# merged_document = DocumentWrapper(
# "gs://gal-cloud-samples/documentai/output/417983068761916085/0")

# print(merged_document.pages[0].paragraphs)



if __name__ == "__main__":
main()

1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"proto-plus >= 1.19.7",
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
"google-cloud-documentai >= 1.2.1, < 2.0.0dev",
"google-cloud-storage >= 1.2.0, <2.5.0",
),
python_requires=">=3.7",
classifiers=[
Expand Down
7 changes: 7 additions & 0 deletions testing/constraints-3.6.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# This constraints file is used to check that lower bounds
# are correct in setup.py
# List *all* library dependencies and extras in this file.
# Pin the version to the lower bound.
#
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
# Then this file should have foo==1.14.0
1 change: 1 addition & 0 deletions tests/unit/resources/0/ toolbox_invoice_test-0.json

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Binary file added tests/unit/resources/toolbox_invoice_test.pdf
Binary file not shown.
Binary file not shown.
Loading