From df7dfe7b79d39010d5addb3fa861a9c803caae45 Mon Sep 17 00:00:00 2001
From: Anirudh Sharma
Date: Tue, 18 Apr 2023 00:39:49 +0530
Subject: [PATCH] feat: Add blocks to PageWrapper (#107)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: Add blocks to PageWrapper

Add feature to get the blocks present in the Document AI JSON response as a Python list of documentai.Document.Page.Block objects, similar to the way to get the paragraphs and lines.

* feat: Add tests for blocks in PageWrapper

Add Unit Tests for #d9e6ada

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot
---
 .../cloud/documentai_toolbox/wrappers/page.py | 47 +++++++++++++++++++
 samples/snippets/quickstart_sample.py         |  2 +
 tests/unit/test_page.py                       | 18 +++++++
 3 files changed, 67 insertions(+)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index c3116500..9c2d876d 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -120,6 +120,21 @@ def _table_wrapper_from_documentai_table(
     )
 
 
+@dataclasses.dataclass
+class Block:
+    """Represents a wrapped documentai.Document.Page.Block.
+
+    Attributes:
+        documentai_block (google.cloud.documentai.Document.Page.Block):
+            Required. The original google.cloud.documentai.Document.Page.Block object.
+        text (str):
+            Required. UTF-8 encoded text.
+    """
+
+    documentai_block: documentai.Document.Page.Block
+    text: str
+
+
 @dataclasses.dataclass
 class Paragraph:
     """Represents a wrapped documentai.Document.Page.Paragraph.
@@ -191,6 +206,32 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
     return result_text
 
 
+def _get_blocks(blocks: List[documentai.Document.Page.Block], text: str) -> List[Block]:
+    r"""Returns a list of Block.
+
+    Args:
+        blocks (List[documentai.Document.Page.Block]):
+            Required. A list of documentai.Document.Page.Block objects.
+        text (str):
+            Required. UTF-8 encoded text in reading order
+            from the document.
+    Returns:
+        List[Block]:
+            A list of Blocks.
+    """
+    result = []
+
+    for block in blocks:
+        result.append(
+            Block(
+                documentai_block=block,
+                text=_text_from_layout(layout=block.layout, text=text),
+            )
+        )
+
+    return result
+
+
 def _get_paragraphs(
     paragraphs: List[documentai.Document.Page.Paragraph], text: str
 ) -> List[Paragraph]:
@@ -339,6 +380,10 @@ class Page:
             Required. A list of visually detected text paragraphs
             on the page. A collection of lines that a human
             would perceive as a paragraph.
+        blocks (List[Block]):
+            Required. A list of visually detected text blocks
+            on the page. A collection of lines that a human
+            would perceive as a block.
         tables (List[Table]):
             Required. A list of visually detected tables on the
             page.
@@ -350,6 +395,7 @@ class Page:
     form_fields: List[FormField] = dataclasses.field(init=False, repr=False)
     lines: List[Line] = dataclasses.field(init=False, repr=False)
     paragraphs: List[Paragraph] = dataclasses.field(init=False, repr=False)
+    blocks: List[Block] = dataclasses.field(init=False, repr=False)
     tables: List[Table] = dataclasses.field(init=False, repr=False)
 
     def __post_init__(self):
@@ -369,4 +415,5 @@ def __post_init__(self):
         self.paragraphs = _get_paragraphs(
             paragraphs=self.documentai_page.paragraphs, text=self.text
         )
+        self.blocks = _get_blocks(blocks=self.documentai_page.blocks, text=self.text)
         self.tables = tables
diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py
index 33ff8c0f..ea59080f 100644
--- a/samples/snippets/quickstart_sample.py
+++ b/samples/snippets/quickstart_sample.py
@@ -41,6 +41,8 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
 
     for idx, page in enumerate(wrapped_document.pages):
         print(f"Page {idx}")
+        for block in page.blocks:
+            print(block.text)
         for paragraph in page.paragraphs:
             print(paragraph.text)
 
diff --git a/tests/unit/test_page.py b/tests/unit/test_page.py
index 4299fc3f..75915aae 100644
--- a/tests/unit/test_page.py
+++ b/tests/unit/test_page.py
@@ -172,6 +172,15 @@ def test_text_from_element_with_layout(docproto):
     assert text == "Invoice\n"
 
 
+def test_get_blocks(docproto):
+    docproto_blocks = docproto.pages[0].blocks
+
+    blocks = page._get_blocks(blocks=docproto_blocks, text=docproto.text)
+
+    assert len(blocks) == 31
+    assert blocks[0].text == "Invoice\n"
+
+
 def test_get_paragraphs(docproto):
     docproto_paragraphs = docproto.pages[0].paragraphs
 
@@ -218,6 +227,13 @@ def test_FormField():
     assert form_field.field_value == "Sally Walker"
 
 
+def test_Block():
+    docai_block = documentai.Document.Page.Block()
+    block = page.Block(documentai_block=docai_block, text="test_block")
+
+    assert block.text == "test_block"
+
+
 def test_Paragraph():
     docai_paragraph = documentai.Document.Page.Paragraph()
     paragraph = page.Paragraph(
@@ -254,5 +270,7 @@ def test_Page(docproto):
 
     assert len(wrapped_page.lines) == 37
     assert len(wrapped_page.paragraphs) == 31
+    assert len(wrapped_page.blocks) == 31
     assert wrapped_page.lines[0].text == "Invoice\n"
     assert wrapped_page.paragraphs[30].text == "Supplies used for Project Q.\n"
+    assert wrapped_page.blocks[30].text == "Supplies used for Project Q.\n"