Skip to content

Commit

Permalink
feat: Add PDF Splitter (#51)
Browse files Browse the repository at this point in the history
* feat: Add PDF Splitter

* fix: Updated setup.py syntax

* fix: Fixed initializer error

* Updated Test to include a multi-page split

* formatting fix

* Add Pdf Split Example

* Adjusted mkdir in tests

* Added pikepdf to test dependencies

---------

Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
  • Loading branch information
holtskinner and galz10 committed Feb 15, 2023
1 parent 622959f commit 8359911
Show file tree
Hide file tree
Showing 14 changed files with 609 additions and 5 deletions.
43 changes: 43 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document.py
Expand Up @@ -30,6 +30,8 @@
from google.cloud.documentai_toolbox.wrappers.page import FormField
from google.cloud.documentai_toolbox.wrappers.entity import Entity

from pikepdf import Pdf


def _entities_from_shards(
shards: List[documentai.Document],
Expand Down Expand Up @@ -365,3 +367,44 @@ def get_entity_by_type(self, target_type: str) -> List[Entity]:
"""
return [entity for entity in self.entities if entity.type_ == target_type]

def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
r"""Splits a local PDF file into one PDF file per entity of this document.

The entities are expected to come from a Splitter/Classifier processor, so
that each one carries ``start_page`` and ``end_page``. Both are used as
zero-based, inclusive indexes into the pages of the input PDF.

Each output file is named
``{input name}_pg{first page}[-{last page}]_{entity type}{input extension}``,
where the page numbers are one-based and the entity type falls back to
``subdoc`` when it is empty.

Args:
    pdf_path (str):
        Required. The path to the PDF file.
    output_path (str):
        Required. The path to the output directory.
        NOTE(review): this method does not create the directory;
        presumably it has to exist already - confirm.
Returns:
    List[str]:
        The file names (not the full paths) of the output PDF files,
        in the order of ``self.entities``.
"""
output_files: List[str] = []
input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
with Pdf.open(pdf_path) as f:
for entity in self.entities:
# Use a generic label when the entity has no type.
# NOTE(review): a type containing a path separator (for example
# "line_item/quantity") ends up inside the file name and would be read
# as a sub-directory by os.path.join below - confirm and sanitize.
subdoc_type = entity.type_ or "subdoc"

# File names show one-based page numbers; a single page gets no range.
if entity.start_page == entity.end_page:
page_range = f"pg{entity.start_page + 1}"
else:
page_range = f"pg{entity.start_page + 1}-{entity.end_page + 1}"

output_filename = (
f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
)

# Copy the entity's pages (end_page included) into a new, empty PDF.
subdoc = Pdf.new()
for page_num in range(entity.start_page, entity.end_page + 1):
subdoc.pages.append(f.pages[page_num])

subdoc.save(

This comment has been minimized.

Copy link
@piergiorgiorayme

piergiorgiorayme May 24, 2023

Hi! Is there is a way to handle the case in which, in output_filename:

subdoc_type is a string like line_item/quantity

In that case, I think you might get a problem with the directory because of the / character.

os.path.join(
output_path,
output_filename,
),
min_version=f.pdf_version,
)
# Only the file name is recorded, not the path joined with output_path.
output_files.append(output_filename)
return output_files
6 changes: 6 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/entity.py
Expand Up @@ -37,7 +37,13 @@ class Entity:
documentai_entity: documentai.Document.Entity = dataclasses.field(repr=False)
type_: str = dataclasses.field(init=False)
mention_text: str = dataclasses.field(init=False, default="")
# Only Populated for Splitter/Classifier Output
start_page: int = dataclasses.field(init=False)
end_page: int = dataclasses.field(init=False)

def __post_init__(self):
    """Copy the frequently used values out of the wrapped Document AI entity.

    ``type_`` and ``mention_text`` are always copied. ``start_page`` and
    ``end_page`` are assigned only when the entity's page anchor holds at
    least one page reference: they are taken from the first and the last
    reference respectively and converted to ``int``. Without page
    references both attributes are left unset.
    """
    source = self.documentai_entity
    self.type_ = source.type_
    self.mention_text = source.mention_text

    refs = source.page_anchor.page_refs
    if not refs:
        # No page anchor information: start_page / end_page stay unset.
        return

    first_ref, last_ref = refs[0], refs[-1]
    self.start_page = int(first_ref.page)
    self.end_page = int(last_ref.page)
40 changes: 40 additions & 0 deletions samples/snippets/split_pdf_sample.py
@@ -0,0 +1,40 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


# [START documentai_toolbox_split_pdf]

from google.cloud.documentai_toolbox import document

# TODO(developer): Uncomment these variables before running the sample.
# Given a local document.proto or sharded document.proto from a splitter/classifier in path
# document_path = "path/to/local/document.json"
# pdf_path = "path/to/local/document.pdf"
# output_path = "resources/output/"


def split_pdf_sample(document_path: str, pdf_path: str, output_path: str) -> None:
    """Split a local PDF using Splitter/Classifier output and print the result.

    Args:
        document_path: Path passed to ``document.Document.from_document_path``.
        pdf_path: Path to the local PDF file to split.
        output_path: Directory that receives the split PDF files.
    """
    splitter_output = document.Document.from_document_path(
        document_path=document_path
    )
    split_files = splitter_output.split_pdf(
        pdf_path=pdf_path,
        output_path=output_path,
    )

    print("Document Successfully Split")
    # One output file per line.
    for split_file in split_files:
        print(split_file)


# [END documentai_toolbox_split_pdf]
42 changes: 42 additions & 0 deletions samples/snippets/test_split_pdf_sample.py
@@ -0,0 +1,42 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
import shutil

import pytest
from samples.snippets import split_pdf_sample

document_path = "../../tests/unit/resources/splitter/procurement_splitter_output.json"
pdf_path = "../../tests/unit/resources/procurement_multi_document.pdf"
output_path = "resources/output/"


def test_split_pdf_sample(capsys: pytest.CaptureFixture) -> None:
    """Run the split sample and check what it prints.

    The output directory is created before the sample runs and is removed
    afterwards, whether or not the assertions pass.
    """
    # exist_ok: an earlier run that was interrupted before its cleanup may
    # have left the directory behind; that must not fail this run.
    os.makedirs(output_path, exist_ok=True)
    try:
        current_directory = os.path.dirname(__file__)
        rel_document_path = os.path.relpath(document_path, current_directory)
        rel_pdf_path = os.path.relpath(pdf_path, current_directory)

        split_pdf_sample.split_pdf_sample(
            document_path=rel_document_path,
            pdf_path=rel_pdf_path,
            output_path=output_path,
        )
        out, _ = capsys.readouterr()

        assert "Document Successfully Split" in out
        assert "procurement_multi_document_pg1_invoice_statement.pdf" in out

        assert os.path.exists(output_path)
    finally:
        # Clean up even when an assertion fails, so no stale output remains.
        shutil.rmtree(output_path)
1 change: 1 addition & 0 deletions setup.py
Expand Up @@ -52,6 +52,7 @@
"google-cloud-documentai >= 1.2.1, < 3.0.0dev",
"google-cloud-storage >= 1.31.0, < 3.0.0dev",
"numpy >= 1.18.1",
"pikepdf >= 6.2.9, < 8.0.0",
),
python_requires=">=3.7",
classifiers=[
Expand Down
3 changes: 2 additions & 1 deletion testing/constraints-3.10.txt
Expand Up @@ -7,4 +7,5 @@ pandas
proto-plus
grpc-google-iam-v1
google-cloud-documentai
google-cloud-storage
google-cloud-storage
pikepdf
3 changes: 2 additions & 1 deletion testing/constraints-3.11.txt
Expand Up @@ -7,4 +7,5 @@ pandas
proto-plus
grpc-google-iam-v1
google-cloud-documentai
google-cloud-storage
google-cloud-storage
pikepdf
3 changes: 2 additions & 1 deletion testing/constraints-3.7.txt
Expand Up @@ -11,4 +11,5 @@ proto-plus== 1.22.0
grpc-google-iam-v1==0.12.4
google-cloud-documentai==1.2.1
google-cloud-storage== 1.31.0
numpy==1.18.1
numpy==1.18.1
pikepdf==6.2.9
3 changes: 2 additions & 1 deletion testing/constraints-3.8.txt
Expand Up @@ -7,4 +7,5 @@ pandas
proto-plus
grpc-google-iam-v1
google-cloud-documentai
google-cloud-storage
google-cloud-storage
pikepdf
3 changes: 2 additions & 1 deletion testing/constraints-3.9.txt
Expand Up @@ -7,4 +7,5 @@ pandas
proto-plus
grpc-google-iam-v1
google-cloud-documentai
google-cloud-storage
google-cloud-storage
pikepdf
Binary file not shown.

0 comments on commit 8359911

Please sign in to comment.