feat: Add PDF Splitter (#51)

* feat: Add PDF Splitter
* fix: Updated setup.py syntax
* fix: Fixed initializer error
* Updated Test to include a multi-page split
* formatting fix
* Add Pdf Split Example
* Adjusted mkdir in tests
* Added pikepdf to test dependencies

---------

Co-authored-by: Gal Zahavi <38544478+galz10@users.noreply.github.com>
googleapis · Feb 15, 2023 · 8359911 · piergiorgiorayme · May 24, 2023 · 8359911
1 parent 622959f
commit 8359911
Show file tree

Hide file tree

Showing 14 changed files with 609 additions and 5 deletions.
diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -30,6 +30,8 @@
 from google.cloud.documentai_toolbox.wrappers.page import FormField
 from google.cloud.documentai_toolbox.wrappers.entity import Entity
 
+from pikepdf import Pdf
+
 
 def _entities_from_shards(
     shards: List[documentai.Document],
@@ -365,3 +367,44 @@ def get_entity_by_type(self, target_type: str) -> List[Entity]:
 
         """
         return [entity for entity in self.entities if entity.type_ == target_type]
+
+    def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
+        r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
+
+        One PDF is written per entity that carries a page range. Each file is
+        named ``<input name>_pg<first>[-<last>]_<entity type><input extension>``
+        using 1-based page numbers; ``subdoc`` is used when the entity has no
+        type.
+
+        Args:
+            pdf_path (str):
+                Required. The path to the PDF file.
+            output_path (str):
+                Required. The path to the output directory. This method
+                does not create it.
+        Returns:
+            List[str]:
+                The file names (not full paths) of the PDF files written
+                into ``output_path``, in entity order.
+        """
+        output_files: List[str] = []
+        input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
+        with Pdf.open(pdf_path) as source_pdf:
+            for entity in self.entities:
+                # start_page/end_page are only populated for entities that have
+                # page references. An entity without them cannot be mapped to
+                # pages, so skip it instead of aborting the whole split.
+                start_page = getattr(entity, "start_page", None)
+                end_page = getattr(entity, "end_page", None)
+                if start_page is None or end_page is None:
+                    continue
+
+                subdoc_type = entity.type_ or "subdoc"
+
+                # Page values are used directly as 0-based indexes into the
+                # PDF; the file name shows them 1-based.
+                if start_page == end_page:
+                    page_range = f"pg{start_page + 1}"
+                else:
+                    page_range = f"pg{start_page + 1}-{end_page + 1}"
+
+                output_filename = (
+                    f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
+                )
+
+                subdoc = Pdf.new()
+                for page_num in range(start_page, end_page + 1):
+                    subdoc.pages.append(source_pdf.pages[page_num])
+
+                subdoc.save(
+                    os.path.join(output_path, output_filename),
+                    # Never write a PDF version lower than the source file's.
+                    min_version=source_pdf.pdf_version,
+                )
+                output_files.append(output_filename)
+        return output_files
diff --git a/google/cloud/documentai_toolbox/wrappers/entity.py b/google/cloud/documentai_toolbox/wrappers/entity.py
@@ -37,7 +37,13 @@ class Entity:
     documentai_entity: documentai.Document.Entity = dataclasses.field(repr=False)
     type_: str = dataclasses.field(init=False)
     mention_text: str = dataclasses.field(init=False, default="")
+    # Only Populated for Splitter/Classifier Output
+    start_page: int = dataclasses.field(init=False)
+    end_page: int = dataclasses.field(init=False)
 
     def __post_init__(self):
+        """Copy type, mention text and page range from the wrapped entity."""
         self.type_ = self.documentai_entity.type_
         self.mention_text = self.documentai_entity.mention_text
+        page_refs = self.documentai_entity.page_anchor.page_refs
+        # Always assign both page fields. They are declared without a default,
+        # so leaving them unset makes the dataclass-generated repr() (and any
+        # generated comparison) raise AttributeError for entities that have no
+        # page references. None means "no page range".
+        # NOTE(review): the field annotations are still ``int``; widen them to
+        # Optional[int] when the declarations are next touched.
+        self.start_page = int(page_refs[0].page) if page_refs else None
+        self.end_page = int(page_refs[-1].page) if page_refs else None
diff --git a/samples/snippets/split_pdf_sample.py b/samples/snippets/split_pdf_sample.py
@@ -0,0 +1,40 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+# [START documentai_toolbox_split_pdf]
+
+from google.cloud.documentai_toolbox import document
+
+# TODO(developer): Uncomment these variables before running the sample.
+# Given a local document.proto or sharded document.proto from a splitter/classifier in path
+# document_path = "path/to/local/document.json"
+# pdf_path = "path/to/local/document.pdf"
+# output_path = "resources/output/"
+
+
+def split_pdf_sample(document_path: str, pdf_path: str, output_path: str) -> None:
+    """Split a PDF per splitter/classifier output and print the file names."""
+    # Wrap the processor output, then split the PDF it describes.
+    toolbox_document = document.Document.from_document_path(
+        document_path=document_path
+    )
+    split_files = toolbox_document.split_pdf(
+        pdf_path=pdf_path,
+        output_path=output_path,
+    )
+
+    print("Document Successfully Split")
+    for split_file in split_files:
+        print(split_file)
+
+
+# [END documentai_toolbox_split_pdf]
diff --git a/samples/snippets/test_split_pdf_sample.py b/samples/snippets/test_split_pdf_sample.py
@@ -0,0 +1,42 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+import shutil
+
+import pytest
+from samples.snippets import split_pdf_sample
+
+document_path = "../../tests/unit/resources/splitter/procurement_splitter_output.json"
+pdf_path = "../../tests/unit/resources/procurement_multi_document.pdf"
+output_path = "resources/output/"
+
+
+def test_split_pdf_sample(capsys: pytest.CaptureFixture) -> None:
+    """Run the sample end to end; check its printed output and the split file."""
+    # exist_ok: a directory left behind by an interrupted run must not make
+    # this test fail with FileExistsError.
+    os.makedirs(output_path, exist_ok=True)
+    try:
+        current_directory = os.path.dirname(__file__)
+        rel_document_path = os.path.relpath(document_path, current_directory)
+        rel_pdf_path = os.path.relpath(pdf_path, current_directory)
+
+        split_pdf_sample.split_pdf_sample(
+            document_path=rel_document_path,
+            pdf_path=rel_pdf_path,
+            output_path=output_path,
+        )
+        out, _ = capsys.readouterr()
+
+        assert "Document Successfully Split" in out
+        assert "procurement_multi_document_pg1_invoice_statement.pdf" in out
+
+        # The directory is created above, so check for the split file itself.
+        assert os.path.exists(
+            os.path.join(
+                output_path, "procurement_multi_document_pg1_invoice_statement.pdf"
+            )
+        )
+    finally:
+        # Clean up even when an assertion fails so later runs start fresh.
+        shutil.rmtree(output_path)
diff --git a/setup.py b/setup.py
@@ -52,6 +52,7 @@
         "google-cloud-documentai >= 1.2.1, < 3.0.0dev",
         "google-cloud-storage >= 1.31.0, < 3.0.0dev",
         "numpy >= 1.18.1",
+        "pikepdf >= 6.2.9, < 8.0.0",
     ),
     python_requires=">=3.7",
     classifiers=[

diff --git a/testing/constraints-3.10.txt b/testing/constraints-3.10.txt
@@ -7,4 +7,5 @@ pandas
 proto-plus
 grpc-google-iam-v1
 google-cloud-documentai
-google-cloud-storage
+google-cloud-storage
+pikepdf
diff --git a/testing/constraints-3.11.txt b/testing/constraints-3.11.txt
@@ -7,4 +7,5 @@ pandas
 proto-plus
 grpc-google-iam-v1
 google-cloud-documentai
-google-cloud-storage
+google-cloud-storage
+pikepdf
diff --git a/testing/constraints-3.7.txt b/testing/constraints-3.7.txt
@@ -11,4 +11,5 @@ proto-plus== 1.22.0
 grpc-google-iam-v1==0.12.4
 google-cloud-documentai==1.2.1
 google-cloud-storage== 1.31.0
-numpy==1.18.1
+numpy==1.18.1
+pikepdf==6.2.9
diff --git a/testing/constraints-3.8.txt b/testing/constraints-3.8.txt
@@ -7,4 +7,5 @@ pandas
 proto-plus
 grpc-google-iam-v1
 google-cloud-documentai
-google-cloud-storage
+google-cloud-storage
+pikepdf
diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt
@@ -7,4 +7,5 @@ pandas
 proto-plus
 grpc-google-iam-v1
 google-cloud-documentai
-google-cloud-storage
+google-cloud-storage
+pikepdf
diff --git a/tests/unit/resources/procurement_multi_document.pdf b/tests/unit/resources/procurement_multi_document.pdf