Skip to content

Commit

Permalink
feat: added hOCR export functionality (#123)
Browse files Browse the repository at this point in the history
* chore: edit get_storage_client to add module name

* added module name to get_bytes

* fixed failing test

* chore: added hocr

* removed test files

* revised code per comments

* feat: added hOCR export functionality

* changed line_text to use line.text

* added tests

* fix lint failure

* revised code

* revise code

* refactored code

* refactored code

* expanded test_Page

* refactored code

* refactored code

* refactored code

* fix failing tests

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* refactored code

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

* templated hocr file format

* refactored code

* fixed failing test

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
  • Loading branch information
3 people committed Jun 28, 2023
1 parent 646ab69 commit 87d2fc1
Show file tree
Hide file tree
Showing 13 changed files with 14,775 additions and 169 deletions.
11 changes: 11 additions & 0 deletions google/cloud/documentai_toolbox/constants.py
Expand Up @@ -14,6 +14,9 @@
# limitations under the License.
#

from typing import Union
from google.cloud.documentai import Document

USER_AGENT_PRODUCT = "documentai-toolbox"

JSON_EXTENSION = ".json"
Expand All @@ -39,3 +42,11 @@
}

IMAGE_ENTITIES = {"Portrait"}

# Type alias covering a Document AI page and the page-level element types
# (paragraph, token, block, symbol) that helper functions accept
# interchangeably.
# NOTE(review): the name implies each member exposes a `layout` attribute —
# confirm against the Document proto before relying on it for new members.
ElementWithLayout = Union[
Document.Page.Paragraph,
Document.Page,
Document.Page.Token,
Document.Page.Block,
Document.Page.Symbol,
]
27 changes: 9 additions & 18 deletions google/cloud/documentai_toolbox/converters/vision_helpers.py
Expand Up @@ -16,12 +16,11 @@
"""Helper functions for docproto to vision conversion."""

import dataclasses
from typing import List, Union
from typing import List

import immutabledict

from google.cloud.documentai import Document
from google.cloud.vision_v1.types import geometry
from google.cloud.vision import (
EntityAnnotation,
TextAnnotation,
Expand All @@ -35,6 +34,8 @@
)
from google.cloud import vision

from google.cloud.documentai_toolbox.constants import ElementWithLayout


_BREAK_TYPE_MAP = immutabledict.immutabledict(
{
Expand All @@ -51,15 +52,6 @@
)


ElementWithLayout = Union[
Document.Page.Paragraph,
Document.Page,
Document.Page.Token,
Document.Page.Block,
Document.Page.Symbol,
]


@dataclasses.dataclass
class PageInfo:
page: Document.Page
Expand Down Expand Up @@ -254,7 +246,6 @@ def _generate_entity_annotations(
entity_annotations: List[EntityAnnotation] = []
for token in page_info.page.tokens:
v: vision.Vertex = []
bounding_box = geometry.BoundingPoly()
if token.layout.bounding_poly.vertices:
for vertex in token.layout.bounding_poly.vertices:
v.append({"x": int(vertex.x), "y": int(vertex.y)})
Expand All @@ -266,7 +257,6 @@ def _generate_entity_annotations(
"y": int(normalized_vertex.y * page_info.page.dimension.height),
}
)
bounding_box = geometry.BoundingPoly(vertices=v)

text_start_index = token.layout.text_anchor.text_segments[0].start_index
text_end_index = token.layout.text_anchor.text_segments[0].end_index
Expand All @@ -277,12 +267,13 @@ def _generate_entity_annotations(
):
text_end_index -= 1

entity_annotations.append(
EntityAnnotation(
description=page_info.text[text_start_index:text_end_index],
bounding_poly=bounding_box,
)
e = EntityAnnotation(
description=page_info.text[text_start_index:text_end_index]
)

e.bounding_poly.vertices = v

entity_annotations.append(e)
return entity_annotations


Expand Down
30 changes: 28 additions & 2 deletions google/cloud/documentai_toolbox/wrappers/document.py
Expand Up @@ -43,6 +43,8 @@

from pikepdf import Pdf

from jinja2 import Environment, FileSystemLoader


def _entities_from_shards(
shards: List[documentai.Document],
Expand Down Expand Up @@ -86,7 +88,7 @@ def _pages_from_shards(shards: List[documentai.Document]) -> List[Page]:
result = []
for shard in shards:
for shard_page in shard.pages:
result.append(Page(documentai_page=shard_page, document_text=shard.text))
result.append(Page(documentai_object=shard_page, document_text=shard.text))

if len(result) > 1 and result[0].page_number:
result.sort(key=lambda x: x.page_number)
Expand Down Expand Up @@ -755,7 +757,7 @@ def export_images(
index = 0
for entity in self.entities:
image = entity.crop_image(
documentai_page=self.pages[entity.start_page].documentai_page
documentai_page=self.pages[entity.start_page].documentai_object
)
if not image:
continue
Expand All @@ -767,3 +769,27 @@ def export_images(
index += 1

return output_filenames

def export_hocr_str(self, title: str) -> str:
    """Exports a string hOCR version of the Document.

    Each page in ``self.pages`` is converted with ``to_hocr()`` and the
    concatenated result is rendered into the ``hocr_xml_template.txt``
    Jinja2 template together with the page count and the title.

    Args:
        title (str):
            Required. The title for hocr_page and head.
    Returns:
        str:
            A string hOCR version of the Document
    """
    # NOTE(review): "templates/" is resolved relative to the current working
    # directory, so the lookup depends on where the process was started —
    # confirm the template location and consider a package-relative loader.
    # NOTE(review): the Environment is created without autoescaping, so
    # `title` is passed to the template unescaped — verify the template (or
    # the caller) escapes it if titles can contain markup characters.
    environment = Environment(loader=FileSystemLoader("templates/"))
    template = environment.get_template("hocr_xml_template.txt")

    # Build the page markup in a single pass rather than with repeated `+=`,
    # which can degrade to quadratic time on documents with many pages.
    hocr_pages = "".join(page.to_hocr() for page in self.pages)

    return template.render(
        hocr_pages=hocr_pages,
        number_of_pages=len(self.pages),
        title=title,
    )

0 comments on commit 87d2fc1

Please sign in to comment.