Skip to content

Commit

Permalink
feat: Add blocks to PageWrapper (#107)
Browse files Browse the repository at this point in the history
* feat: Add blocks to PageWrapper

Add a feature that exposes the blocks present in the Document AI JSON
response as a Python list of documentai.Document.Page.Block objects,
mirroring how paragraphs and lines are already exposed.

* feat: Add tests for blocks in PageWrapper

Add Unit Tests for #d9e6ada

* 🦉 Updates from OwlBot post-processor

See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md

---------

Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
  • Loading branch information
sharmanirudh and gcf-owl-bot[bot] committed Apr 17, 2023
1 parent 60e1999 commit df7dfe7
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 0 deletions.
47 changes: 47 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page.py
Expand Up @@ -120,6 +120,21 @@ def _table_wrapper_from_documentai_table(
)


@dataclasses.dataclass
class Block:
    """Wrapper around a single documentai.Document.Page.Block.

    Attributes:
        documentai_block (google.cloud.documentai.Document.Page.Block):
            Required. The original, unmodified
            google.cloud.documentai.Document.Page.Block object.
        text (str):
            Required. UTF-8 encoded text.
    """

    documentai_block: documentai.Document.Page.Block
    text: str


@dataclasses.dataclass
class Paragraph:
"""Represents a wrapped documentai.Document.Page.Paragraph.
Expand Down Expand Up @@ -191,6 +206,32 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
return result_text


def _get_blocks(blocks: List[documentai.Document.Page.Block], text: str) -> List[Block]:
    r"""Wraps every Document AI block in a Block.

    Args:
        blocks (List[documentai.Document.Page.Block]):
            Required. A list of documentai.Document.Page.Block objects.
        text (str):
            Required. UTF-8 encoded text in reading order
            from the document.

    Returns:
        List[Block]:
            One Block per entry of ``blocks``, in the same order.
    """
    # Each wrapper's text is resolved from the block's own layout against
    # the supplied document text.
    return [
        Block(
            documentai_block=docai_block,
            text=_text_from_layout(layout=docai_block.layout, text=text),
        )
        for docai_block in blocks
    ]


def _get_paragraphs(
paragraphs: List[documentai.Document.Page.Paragraph], text: str
) -> List[Paragraph]:
Expand Down Expand Up @@ -339,6 +380,10 @@ class Page:
Required. A list of visually detected text paragraphs
on the page. A collection of lines that a human
would perceive as a paragraph.
blocks (List[Block]):
Required. A list of visually detected text blocks
on the page. A collection of lines that a human
would perceive as a block.
tables (List[Table]):
Required. A list of visually detected tables on the
page.
Expand All @@ -350,6 +395,7 @@ class Page:
form_fields: List[FormField] = dataclasses.field(init=False, repr=False)
lines: List[Line] = dataclasses.field(init=False, repr=False)
paragraphs: List[Paragraph] = dataclasses.field(init=False, repr=False)
blocks: List[Block] = dataclasses.field(init=False, repr=False)
tables: List[Table] = dataclasses.field(init=False, repr=False)

def __post_init__(self):
Expand All @@ -369,4 +415,5 @@ def __post_init__(self):
self.paragraphs = _get_paragraphs(
paragraphs=self.documentai_page.paragraphs, text=self.text
)
self.blocks = _get_blocks(blocks=self.documentai_page.blocks, text=self.text)
self.tables = tables
2 changes: 2 additions & 0 deletions samples/snippets/quickstart_sample.py
Expand Up @@ -41,6 +41,8 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:

for idx, page in enumerate(wrapped_document.pages):
print(f"Page {idx}")
for block in page.blocks:
print(block.text)
for paragraph in page.paragraphs:
print(paragraph.text)

Expand Down
18 changes: 18 additions & 0 deletions tests/unit/test_page.py
Expand Up @@ -172,6 +172,15 @@ def test_text_from_element_with_layout(docproto):
assert text == "Invoice\n"


def test_get_blocks(docproto):
    """_get_blocks wraps every block of the first page of the fixture."""
    first_page = docproto.pages[0]

    wrapped_blocks = page._get_blocks(blocks=first_page.blocks, text=docproto.text)

    assert len(wrapped_blocks) == 31
    assert wrapped_blocks[0].text == "Invoice\n"


def test_get_paragraphs(docproto):
docproto_paragraphs = docproto.pages[0].paragraphs

Expand Down Expand Up @@ -218,6 +227,13 @@ def test_FormField():
assert form_field.field_value == "Sally Walker"


def test_Block():
    """A Block built directly keeps the text it was given."""
    wrapped = page.Block(
        documentai_block=documentai.Document.Page.Block(), text="test_block"
    )

    assert wrapped.text == "test_block"


def test_Paragraph():
docai_paragraph = documentai.Document.Page.Paragraph()
paragraph = page.Paragraph(
Expand Down Expand Up @@ -254,5 +270,7 @@ def test_Page(docproto):

assert len(wrapped_page.lines) == 37
assert len(wrapped_page.paragraphs) == 31
assert len(wrapped_page.blocks) == 31
assert wrapped_page.lines[0].text == "Invoice\n"
assert wrapped_page.paragraphs[30].text == "Supplies used for Project Q.\n"
assert wrapped_page.blocks[30].text == "Supplies used for Project Q.\n"

0 comments on commit df7dfe7

Please sign in to comment.