feat: added hOCR export functionality (#123)

* chore: edit get_storage_client to add module name * added module name to get_bytes * fixed failing test * chore: added hocr * removed test files * revised code per comments * feat: added hOCR export functionality * changed line_text to use line.text * added tests * fix lint failure * revised code * revise code * refactored code * refactored code * expanded test_Page * refactored code * refactored code * refactored code * fix failing tests * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * refactored code * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * templated hocr file format * refactored code * fixed failing test * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md --------- Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com> Co-authored-by: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
googleapis · Jun 28, 2023 · 87d2fc1 · 87d2fc1
1 parent 646ab69
commit 87d2fc1
Show file tree

Hide file tree

Showing 13 changed files with 14,775 additions and 169 deletions.
diff --git a/google/cloud/documentai_toolbox/constants.py b/google/cloud/documentai_toolbox/constants.py
@@ -14,6 +14,9 @@
 # limitations under the License.
 #
 
+from typing import Union
+from google.cloud.documentai import Document
+
 USER_AGENT_PRODUCT = "documentai-toolbox"
 
 JSON_EXTENSION = ".json"
@@ -39,3 +42,11 @@
 }
 
 IMAGE_ENTITIES = {"Portrait"}
+
+ElementWithLayout = Union[
+    Document.Page.Paragraph,
+    Document.Page,
+    Document.Page.Token,
+    Document.Page.Block,
+    Document.Page.Symbol,
+]
diff --git a/google/cloud/documentai_toolbox/converters/vision_helpers.py b/google/cloud/documentai_toolbox/converters/vision_helpers.py
@@ -16,12 +16,11 @@
 """Helper functions for docproto to vision conversion."""
 
 import dataclasses
-from typing import List, Union
+from typing import List
 
 import immutabledict
 
 from google.cloud.documentai import Document
-from google.cloud.vision_v1.types import geometry
 from google.cloud.vision import (
     EntityAnnotation,
     TextAnnotation,
@@ -35,6 +34,8 @@
 )
 from google.cloud import vision
 
+from google.cloud.documentai_toolbox.constants import ElementWithLayout
+
 
 _BREAK_TYPE_MAP = immutabledict.immutabledict(
     {
@@ -51,15 +52,6 @@
 )
 
 
-ElementWithLayout = Union[
-    Document.Page.Paragraph,
-    Document.Page,
-    Document.Page.Token,
-    Document.Page.Block,
-    Document.Page.Symbol,
-]
-
-
 @dataclasses.dataclass
 class PageInfo:
     page: Document.Page
@@ -254,7 +246,6 @@ def _generate_entity_annotations(
     entity_annotations: List[EntityAnnotation] = []
     for token in page_info.page.tokens:
         v: vision.Vertex = []
-        bounding_box = geometry.BoundingPoly()
         if token.layout.bounding_poly.vertices:
             for vertex in token.layout.bounding_poly.vertices:
                 v.append({"x": int(vertex.x), "y": int(vertex.y)})
@@ -266,7 +257,6 @@ def _generate_entity_annotations(
                         "y": int(normalized_vertex.y * page_info.page.dimension.height),
                     }
                 )
-        bounding_box = geometry.BoundingPoly(vertices=v)
 
         text_start_index = token.layout.text_anchor.text_segments[0].start_index
         text_end_index = token.layout.text_anchor.text_segments[0].end_index
@@ -277,12 +267,13 @@ def _generate_entity_annotations(
         ):
             text_end_index -= 1
 
-        entity_annotations.append(
-            EntityAnnotation(
-                description=page_info.text[text_start_index:text_end_index],
-                bounding_poly=bounding_box,
-            )
+        e = EntityAnnotation(
+            description=page_info.text[text_start_index:text_end_index]
         )
+
+        e.bounding_poly.vertices = v
+
+        entity_annotations.append(e)
     return entity_annotations
 
 

diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -43,6 +43,8 @@
 
 from pikepdf import Pdf
 
+from jinja2 import Environment, FileSystemLoader
+
 
 def _entities_from_shards(
     shards: List[documentai.Document],
@@ -86,7 +88,7 @@ def _pages_from_shards(shards: List[documentai.Document]) -> List[Page]:
     result = []
     for shard in shards:
         for shard_page in shard.pages:
-            result.append(Page(documentai_page=shard_page, document_text=shard.text))
+            result.append(Page(documentai_object=shard_page, document_text=shard.text))
 
     if len(result) > 1 and result[0].page_number:
         result.sort(key=lambda x: x.page_number)
@@ -755,7 +757,7 @@ def export_images(
         index = 0
         for entity in self.entities:
             image = entity.crop_image(
-                documentai_page=self.pages[entity.start_page].documentai_page
+                documentai_page=self.pages[entity.start_page].documentai_object
             )
             if not image:
                 continue
@@ -767,3 +769,27 @@ def export_images(
             index += 1
 
         return output_filenames
+
+    def export_hocr_str(self, title: str) -> str:
+        r"""Exports a string hOCR version of the Document.
+
+        Args:
+            title (str):
+                Required. The title for hocr_page and head.
+
+        Returns:
+            str:
+                A string hOCR version of the Document
+        """
+        environment = Environment(loader=FileSystemLoader("templates/"))
+        template = environment.get_template("hocr_xml_template.txt")
+        hocr_pages = ""
+        number_of_pages = len(self.pages)
+        for page_to_export in self.pages:
+            hocr_pages += page_to_export.to_hocr()
+
+        content = template.render(
+            hocr_pages=hocr_pages, number_of_pages=number_of_pages, title=title
+        )
+
+        return content