Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add PDF Splitter #51

Merged
merged 13 commits into from Feb 15, 2023
43 changes: 43 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document.py
Expand Up @@ -29,6 +29,8 @@
from google.cloud.documentai_toolbox.wrappers.page import Page
from google.cloud.documentai_toolbox.wrappers.entity import Entity

from pikepdf import Pdf


def _entities_from_shards(
shards: List[documentai.Document],
Expand Down Expand Up @@ -344,3 +346,44 @@ def get_entity_by_type(self, target_type: str) -> List[Entity]:

"""
return [entity for entity in self.entities if entity.type_ == target_type]

def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
    r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.

    One PDF is written per entity in ``self.entities``, containing the pages
    from the entity's ``start_page`` through its ``end_page`` (inclusive).
    Each file is named
    ``<input name>_pg<page range>_<entity type><input extension>``, where the
    page range is 1-based and the entity type falls back to ``subdoc`` when
    it is empty.

    Entities that carry no page information are skipped, because there is no
    page range to extract for them.

    Args:
        pdf_path (str):
            Required. The path to the PDF file.
        output_path (str):
            Required. The path to the output directory. The directory is
            created if it does not exist yet.
    Returns:
        List[str]:
            A list of output pdf files (file names inside ``output_path``,
            not full paths).
    """
    output_files: List[str] = []
    input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))

    with Pdf.open(pdf_path) as f:
        # Create the target directory only after the input opened successfully.
        # An empty output_path means "current directory"; os.makedirs("")
        # would raise, so it is left alone.
        if output_path:
            os.makedirs(output_path, exist_ok=True)

        for entity in self.entities:
            # start_page/end_page are only assigned when the underlying
            # entity has page refs; reading them directly would raise
            # AttributeError for any other entity.
            start_page = getattr(entity, "start_page", None)
            end_page = getattr(entity, "end_page", None)
            if start_page is None or end_page is None:
                continue

            subdoc_type = entity.type_ or "subdoc"

            # Page indices are 0-based; file names use 1-based page numbers.
            if start_page == end_page:
                page_range = f"pg{start_page + 1}"
            else:
                page_range = f"pg{start_page + 1}-{end_page + 1}"

            output_filename = (
                f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
            )

            subdoc = Pdf.new()
            for page_num in range(start_page, end_page + 1):
                subdoc.pages.append(f.pages[page_num])

            subdoc.save(
                os.path.join(
                    output_path,
                    output_filename,
                ),
                # Do not write a lower PDF version than the source document.
                min_version=f.pdf_version,
            )
            output_files.append(output_filename)
    return output_files
6 changes: 6 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/entity.py
Expand Up @@ -37,7 +37,13 @@ class Entity:
# The wrapped Document AI entity; left out of the generated repr (repr=False).
documentai_entity: documentai.Document.Entity = dataclasses.field(repr=False)
# Not constructor arguments (init=False): both are copied from
# documentai_entity in __post_init__.
type_: str = dataclasses.field(init=False)
mention_text: str = dataclasses.field(init=False, default="")
# Only Populated for Splitter/Classifier Output
# Taken from the first and last page ref of the entity's page anchor in
# __post_init__. They have no default, so when the entity has no page refs
# the attributes are never assigned and reading them raises AttributeError.
start_page: int = dataclasses.field(init=False)
end_page: int = dataclasses.field(init=False)

def __post_init__(self):
    """Copy the commonly used values out of the wrapped Document AI entity.

    Always sets ``type_`` and ``mention_text``. ``start_page`` and
    ``end_page`` are set from the first and last page ref of the entity's
    page anchor; when the entity has no page refs they are left unassigned.
    """
    source = self.documentai_entity
    self.type_ = source.type_
    self.mention_text = source.mention_text

    # Nothing to derive a page range from.
    if not source.page_anchor.page_refs:
        return

    self.start_page = int(source.page_anchor.page_refs[0].page)
    self.end_page = int(source.page_anchor.page_refs[-1].page)
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -52,6 +52,7 @@
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
"numpy >= 1.18.1",
"pikepdf >= 6.2.9, < 8.0.0",
holtskinner marked this conversation as resolved.
Show resolved Hide resolved
holtskinner marked this conversation as resolved.
Show resolved Hide resolved
),
python_requires=">=3.7",
classifiers=[
Expand Down
Binary file not shown.
@@ -0,0 +1,256 @@
{
"text": "Google\nINVOICE\n# 23413561D\nJohn Smith\nDate:\nSep 24, 2019\nBill To:\nJane Smith,\n1600 Amphitheatre Pkway\nMountain View, CA 94043\nDue Date:\nSep 30, 2019\nBalance Due:\n$4,647.68\nItem\nQuantity\nRate\nAmount\n$9",
"pages": [
{
"pageNumber": 1,
"dimension": {
"width": 1758,
"height": 2275,
"unit": "pixels"
},
"layout": {
"textAnchor": {
"textSegments": [
{
"endIndex": "665"
}
]
},
"boundingPoly": {
"vertices": [
{},
{
"x": 1758
},
{
"x": 1758,
"y": 2275
},
{
"y": 2275
}
],
"normalizedVertices": [
{},
{
"x": 1
},
{
"x": 1,
"y": 1
},
{
"y": 1
}
]
},
"orientation": "PAGE_UP"
},
"detectedLanguages": [
{
"languageCode": "en"
},
{
"languageCode": "und"
}
],
"blocks": [
{}
],
"paragraphs": [
{}
],
"lines": [
{}
],
"tokens": [
{}
],
"image": {
"content": "iVBORw0KGgoAAAANSUhE...",
"mimeType": "image/png",
"width": 1758,
"height": 2275
}
},
{
"pageNumber": 2,
"dimension": {
"width": 2275,
"height": 1758,
"unit": "pixels"
},
"layout": {
"textAnchor": {
"textSegments": [
{
"startIndex": "665",
"endIndex": "1236"
}
]
},
"boundingPoly": {
"vertices": [
{},
{
"x": 2275
},
{
"x": 2275,
"y": 1758
},
{
"y": 1758
}
],
"normalizedVertices": [
{},
{
"x": 1
},
{
"x": 1,
"y": 1
},
{
"y": 1
}
]
},
"orientation": "PAGE_UP"
},
"detectedLanguages": [
{
"languageCode": "en"
},
{
"languageCode": "und"
},
{
"languageCode": "da"
},
{
"languageCode": "es"
},
{
"languageCode": "fi"
},
{
"languageCode": "sv"
}
],
"blocks": [
{}
],
"paragraphs": [
{}
],
"lines": [
{}
],
"tokens": [
{}
],
"image": {
"content": "/9j/4AAQSkZJRgABAQAA...",
"mimeType": "image/jpeg",
"width": 2275,
"height": 1758
}
}
],
"entities": [
{
"textAnchor": {
"textSegments": [
{
"endIndex": "665"
}
]
},
"type": "invoice_statement",
"confidence": 0.995982,
"pageAnchor": {
"pageRefs": [
{}
]
}
},
{
"textAnchor": {
"textSegments": [
{
"startIndex": "665",
"endIndex": "1236"
}
]
},
"type": "receipt_statement",
"confidence": 0.9840074,
"pageAnchor": {
"pageRefs": [
{
"page": "1"
}
]
}
},
{
"textAnchor": {
"textSegments": [
{
"startIndex": "1236",
"endIndex": "1954"
}
]
},
"type": "other",
"confidence": 0.8117405,
"pageAnchor": {
"pageRefs": [
{
"page": "2"
}
]
}
},
{
"textAnchor": {
"textSegments": [
{
"startIndex": "1954",
"endIndex": "3924"
}
]
},
"type": "utility_statement",
"confidence": 0.9991843,
"pageAnchor": {
"pageRefs": [
{
"page": "3"
}
]
}
},
{
"textAnchor": {
"textSegments": [
{
"startIndex": "3924",
"endIndex": "4201"
}
]
},
"type": "restaurant_statement",
"confidence": 0.9970099,
"pageAnchor": {
"pageRefs": [
{
"page": "4"
}
]
}
}
]
}