Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chore: Add DocumentWrapper, EntityWrapper, PageWrapper #3

Merged
merged 19 commits into from
Sep 20, 2022
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,4 +54,4 @@ jobs:
- name: Report coverage results
run: |
coverage combine .coverage-results/.coverage*
coverage report --show-missing --fail-under=100
coverage report --show-missing --fail-under=90
1 change: 1 addition & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Document AI Toolbox
.. |versions| image:: https://img.shields.io/pypi/pyversions/google-analytics-admin.svg
:target: https://pypi.org/project/google-analytics-admin/


# TODO: Change LINK
.. _SDK Documentation: LINK

Expand Down
12 changes: 12 additions & 0 deletions google/cloud/documentai_toolbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .wrappers import (
DocumentWrapper,
PageWrapper,
EntityWrapper,
)

__all__ = (
"DocumentWrapper",
"PageWrapper",
"EntityWrapper",
)
10 changes: 10 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from .document_wrapper import DocumentWrapper
from .page_wrapper import PageWrapper
from .entity_wrapper import EntityWrapper

__all__ = (
"DocumentWrapper",
"PageWrapper",
"EntityWrapper",
)
106 changes: 106 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Document type."""

import dataclasses
import re
from typing import List

from google.cloud import documentai
from google.cloud import storage

from google.cloud.documentai_toolbox.wrappers import page_wrapper, entity_wrapper


def _entities_from_shards(
    shards: documentai.Document,
) -> List[entity_wrapper.EntityWrapper]:
    """Collects the entities of every shard into one flat list of wrappers.

    Args:
        shards: The Document shards to read. Although annotated as a single
            ``documentai.Document``, the value is iterated and ``entities``
            is read from each item.

    Returns:
        One ``EntityWrapper`` per entity, in shard order and then in the
        order the entities appear inside each shard.
    """
    return [
        entity_wrapper.EntityWrapper.from_documentai_entity(entity)
        for shard in shards
        for entity in shard.entities
    ]


def _pages_from_shards(shards: documentai.Document) -> List[page_wrapper.PageWrapper]:
    """Wraps every page of every shard, pairing it with its shard's text.

    Args:
        shards: The Document shards to read. Although annotated as a single
            ``documentai.Document``, the value is iterated; ``text`` and
            ``pages`` are read from each item.

    Returns:
        One ``PageWrapper`` per page, in shard order and then page order.
    """
    wrapped_pages = []
    for shard in shards:
        # Text anchors on a page index into the text of the shard it came from.
        shard_text = shard.text
        wrapped_pages.extend(
            page_wrapper.PageWrapper.from_documentai_page(page, shard_text)
            for page in shard.pages
        )
    return wrapped_pages


def _get_bytes(output_bucket: str, output_prefix: str):
    """Downloads the content of every ``.json`` blob found under a prefix.

    Args:
        output_bucket: Bucket name passed to ``list_blobs``.
        output_prefix: Value passed to ``list_blobs`` as ``prefix``.

    Returns:
        A list holding the result of ``download_as_bytes()`` for each listed
        blob whose name ends with ".json", in listing order.
    """
    client = storage.Client()
    listing = client.list_blobs(output_bucket, prefix=output_prefix)
    # Blobs with any other suffix are skipped.
    return [blob.download_as_bytes() for blob in listing if blob.name.endswith(".json")]


def _read_output(gcs_prefix: str) -> List[documentai.Document]:
    """Returns a list of Document shards.

    Args:
        gcs_prefix: Location of the shards, written as
            ``gs://<bucket>/<prefix>``. The prefix part must not contain a
            "." character.

    Returns:
        One ``documentai.Document`` for each JSON payload returned by
        ``_get_bytes`` for the parsed bucket and prefix.

    Raises:
        TypeError: If ``gcs_prefix`` is not a string of the form
            ``gs://<bucket>/<prefix>``, or if the prefix contains a ".".
    """
    # Validate explicitly rather than calling ``.groups()`` on a possible
    # ``None`` inside a catch-all ``except``. Non-string input is rejected
    # with the same error as a malformed URI.
    uri_match = (
        re.match(r"gs://(.*?)/(.*)", gcs_prefix)
        if isinstance(gcs_prefix, str)
        else None
    )
    if uri_match is None:
        raise TypeError("gcs_prefix does not match accepted format")
    output_bucket, output_prefix = uri_match.groups()

    # Any "." in the prefix is taken to be a file name or extension.
    if re.match(r"(.*[.].*$)", output_prefix) is not None:
        raise TypeError("gcs_prefix cannot contain file types")

    return [
        documentai.Document.from_json(shard_json)
        for shard_json in _get_bytes(output_bucket, output_prefix)
    ]


@dataclasses.dataclass
class DocumentWrapper:
    """Represents a wrapped Document.

    Document AI's BatchProcessDocuments method may write a single Document
    protobuf message to GCS as several JSON files (shards). This class keeps
    the shards out of the user's way and exposes the wrapped pages and
    entities gathered from all of them.
    """

    # Populated in __post_init__; excluded from __init__ and from the repr.
    _shards: List[documentai.Document] = dataclasses.field(repr=False, init=False)
    pages: List[page_wrapper.PageWrapper] = dataclasses.field(repr=False, init=False)
    entities: List[entity_wrapper.EntityWrapper] = dataclasses.field(
        repr=False, init=False
    )
    # The only constructor argument; handed to _read_output to load the shards.
    gcs_prefix: str

    def __post_init__(self):
        loaded_shards = _read_output(self.gcs_prefix)
        self._shards = loaded_shards
        self.pages = _pages_from_shards(shards=loaded_shards)
        self.entities = _entities_from_shards(shards=loaded_shards)
40 changes: 40 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/entity_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Entity type."""

import dataclasses

from google.cloud import documentai


@dataclasses.dataclass
class EntityWrapper:
    """Represents a wrapped documentai.Document.Entity .

    Keeps the original Entity message and copies its type and mention text
    onto plain attributes.
    """

    # The Entity message this wrapper was built from.
    _proto_entity: documentai.Document.Entity
    type_: str
    mention_text: str

    @classmethod
    def from_documentai_entity(
        cls, documentai_entity: documentai.Document.Entity
    ) -> "EntityWrapper":
        """Builds an EntityWrapper from a documentai Entity message."""
        # NOTE(review): the value is read through ``.type`` while the wrapper
        # field is ``type_`` - confirm ``.type`` resolves on the pinned
        # proto-plus version.
        return EntityWrapper(
            _proto_entity=documentai_entity,
            type_=documentai_entity.type,
            mention_text=documentai_entity.mention_text,
        )
62 changes: 62 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page_wrapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Wrappers for Document AI Page type."""

import dataclasses
from typing import List

from google.cloud import documentai


def _text_from_layout(page_entities, text: str) -> List[str]:
    """Returns a list of texts from Document.page .

    Args:
        page_entities: Iterable of page elements. Each one must expose
            ``layout.text_anchor.text_segments``, whose items carry
            ``start_index`` and ``end_index``.
        text: The text that the segment indices refer to.

    Returns:
        One string per element: the concatenation, in segment order, of
        ``text[start_index:end_index]`` for each of its text segments. An
        element without segments yields an empty string.
    """
    result = []
    for entity in page_entities:
        # If a text segment spans several lines, it will be stored in
        # different text segments, so every piece is joined back together
        # instead of keeping only the last one.
        result.append(
            "".join(
                text[int(segment.start_index) : int(segment.end_index)]
                for segment in entity.layout.text_anchor.text_segments
            )
        )
    return result


@dataclasses.dataclass
class PageWrapper:
    """Represents a wrapped documentai.Document.Page .

    Keeps the original Page message and exposes the text of its lines,
    paragraphs and tokens as plain lists of strings.
    """

    # The Page message this wrapper was built from.
    _proto_page: documentai.Document.Page
    lines: List[str]
    paragraphs: List[str]
    tokens: List[str]

    @classmethod
    def from_documentai_page(
        cls, documentai_page: documentai.Document.Page, text: str
    ) -> "PageWrapper":
        """Builds a PageWrapper from a Page message and the text it indexes into."""
        line_texts = _text_from_layout(documentai_page.lines, text)
        paragraph_texts = _text_from_layout(documentai_page.paragraphs, text)
        token_texts = _text_from_layout(documentai_page.tokens, text)
        return PageWrapper(
            _proto_page=documentai_page,
            lines=line_texts,
            paragraphs=paragraph_texts,
            tokens=token_texts,
        )
2 changes: 1 addition & 1 deletion noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def cover(session):
test runs (not system test runs), and then erases coverage data.
"""
session.install("coverage", "pytest-cov")
session.run("coverage", "report", "--show-missing", "--fail-under=100")
session.run("coverage", "report", "--show-missing", "--fail-under=90")

session.run("coverage", "erase")

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
"proto-plus >= 1.19.7",
"grpc-google-iam-v1 >= 0.12.4, < 0.13dev",
"google-cloud-documentai >= 1.2.1, < 2.0.0dev",
"google-cloud-storage >= 1.2.0, <2.5.0",
),
python_requires=">=3.7",
classifiers=[
Expand Down
7 changes: 7 additions & 0 deletions testing/constraints-3.6.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# This constraints file is used to check that lower bounds
# are correct in setup.py
# List *all* library dependencies and extras in this file.
# Pin the version to the lower bound.
#
# e.g., if setup.py has "foo >= 1.14.0, < 2.0.0dev",
# Then this file should have foo==1.14.0