docs(samples): Added Processor Version Samples (#382)

* docs(samples): Added Processor Version Samples To Be Published in documentation: https://cloud.google.com/document-ai/docs/manage-processor - `get_processor_version` - `list_processor_versions` - `set_default_processor_version` * docs(samples): Adjusted Bad Batch Input test to * docs(samples): Added Deploy/Undeploy Samples * docs(samples): Added process & batchProcess examples for processorVersions - Removed Processor Version from basic process and batchProcess examples - Removed Note about must create processors in the Cloud Console - Added note that processor must be created before running sample where missing * docs(samples): Adjusted Enable/Disable Processor Test to avoid Race Conditions * docs(samples): Added Delete Processor Version Sample - Also Fixed Spelling error in Undeploy Comments * docs(samples): Updated non-idempotent unit tests to use mocks - Also replaced test ocr processor id after making a breaking change to the project - Added `field_mask` to process_documents tests
googleapis · Sep 26, 2022 · f9ce801 · f9ce801
1 parent 6ef8b9d
commit f9ce801
Show file tree

Hide file tree

Showing 30 changed files with 935 additions and 36 deletions.
diff --git a/samples/snippets/batch_process_documents_processor_version_sample.py b/samples/snippets/batch_process_documents_processor_version_sample.py
@@ -0,0 +1,153 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# [START documentai_batch_process_documents_processor_version]
+import re
+
+from google.api_core.client_options import ClientOptions
+from google.cloud import documentai, storage
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID' # Example: aeb8cea219b7c272
+# processor_version_id = "YOUR_PROCESSOR_VERSION_ID" # Example: pretrained-ocr-v1.0-2020-09-23
+# gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
+# input_mime_type = "application/pdf"
+# gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
+# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX" # Format: directory/subdirectory/
+
+
+def batch_process_documents_processor_version(
+    project_id: str,
+    location: str,
+    processor_id: str,
+    processor_version_id: str,
+    gcs_input_uri: str,
+    input_mime_type: str,
+    gcs_output_bucket: str,
+    gcs_output_uri_prefix: str,
+    timeout: int = 300,
+) -> None:
+    """Batch process a Cloud Storage file with a specific processor version.
+
+    Arguments are described in the TODO block above this function. Waits up
+    to `timeout` seconds for the long running operation, then prints the
+    text of each JSON Document found under the output destinations listed
+    in the operation metadata.
+
+    Raises:
+        ValueError: If the operation metadata state is not SUCCEEDED.
+    """
+    # You must set the api_endpoint if you use a location other than 'us', e.g.:
+    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
+    client = documentai.DocumentProcessorServiceClient(client_options=opts)
+
+    # Load GCS Input URI into a List of document files
+    gcs_document = documentai.GcsDocument(
+        gcs_uri=gcs_input_uri, mime_type=input_mime_type
+    )
+    gcs_documents = documentai.GcsDocuments(documents=[gcs_document])
+    input_config = documentai.BatchDocumentsInputConfig(gcs_documents=gcs_documents)
+
+    # NOTE: Alternatively, specify a GCS URI Prefix to process an entire directory
+    # gcs_input_uri = "gs://bucket/directory/"
+    # gcs_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
+    # input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=gcs_prefix)
+
+    # Cloud Storage URI for the Output Directory. Separators are normalized so
+    # a prefix such as "directory/subdirectory/" does not yield "//" in the URI
+    output_bucket_uri = gcs_output_bucket.rstrip("/")
+    output_directory = str(gcs_output_uri_prefix).strip("/")
+    destination_uri = f"{output_bucket_uri}/{output_directory}/"
+    gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(
+        gcs_uri=destination_uri
+    )
+
+    # Where to write results
+    output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)
+
+    # The full resource name of the processor version
+    # e.g. projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
+    name = client.processor_version_path(
+        project_id, location, processor_id, processor_version_id
+    )
+    request = documentai.BatchProcessRequest(
+        name=name,
+        input_documents=input_config,
+        document_output_config=output_config,
+    )
+
+    # BatchProcess returns a Long Running Operation (LRO)
+    operation = client.batch_process_documents(request)
+
+    # Continually polls the operation until it is complete.
+    # This could take some time for larger files
+    # Format: projects/PROJECT_NUMBER/locations/LOCATION/operations/OPERATION_ID
+    print(f"Waiting for operation {operation.operation.name} to complete...")
+    operation.result(timeout=timeout)
+
+    # NOTE: Can also use callbacks for asynchronous processing
+    # def my_callback(future):
+    #   result = future.result()
+    # operation.add_done_callback(my_callback)
+
+    # Once the operation is complete,
+    # get output document information from operation metadata
+    metadata = documentai.BatchProcessMetadata(operation.metadata)
+    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
+        raise ValueError(f"Batch Process Failed: {metadata.state_message}")
+
+    storage_client = storage.Client()
+    print("Output files:")
+    # One process per Input Document
+    for process in metadata.individual_process_statuses:
+        # output_gcs_destination format: gs://BUCKET/PREFIX/OPERATION_NUMBER/INPUT_FILE_NUMBER/
+        # The Cloud Storage API requires the bucket name and URI prefix separately
+        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
+        if not matches:
+            print(
+                "Could not parse output GCS destination:",
+                process.output_gcs_destination,
+            )
+            continue
+
+        # Get List of Document Objects from the Output Bucket
+        output_bucket, output_prefix = matches.groups()
+        output_blobs = storage_client.list_blobs(output_bucket, prefix=output_prefix)
+
+        # Document AI may output multiple JSON files per source file
+        for blob in output_blobs:
+            # Document AI should only output JSON files, so require a .json suffix
+            if not blob.name.endswith(".json"):
+                print(
+                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
+                )
+                continue
+
+            # Download JSON File as bytes object and convert to Document Object
+            print(f"Fetching {blob.name}")
+            document = documentai.Document.from_json(
+                blob.download_as_bytes(), ignore_unknown_fields=True
+            )
+
+            # For a full list of Document object attributes, please reference this page:
+            # https://cloud.google.com/python/docs/reference/documentai/latest/google.cloud.documentai_v1.types.Document
+            # Read the text recognition output from the processor
+            print("The document contains the following text:")
+            print(document.text)
+
+
+# [END documentai_batch_process_documents_processor_version]
diff --git a/samples/snippets/batch_process_documents_processor_version_sample_test.py b/samples/snippets/batch_process_documents_processor_version_sample_test.py
@@ -0,0 +1,64 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+from uuid import uuid4
+
+from google.cloud import storage
+from google.cloud.exceptions import NotFound
+import pytest
+from samples.snippets import batch_process_documents_processor_version_sample
+
+location = "us"
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+processor_id = "90484cfdedb024f6"
+processor_version_id = "pretrained-form-parser-v1.0-2020-09-23"
+gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
+input_mime_type = "application/pdf"
+gcs_output_uri_prefix = str(uuid4())  # the sample's parameter is annotated as str
+BUCKET_NAME = f"document-ai-python-{uuid4()}"
+
+
+@pytest.fixture(scope="module")
+def test_bucket():
+    """Yield a module-scoped test bucket name; empty and delete the bucket after."""
+    gcs_client = storage.Client()
+    created_bucket = gcs_client.create_bucket(BUCKET_NAME)
+    yield created_bucket.name
+
+    try:
+        for leftover_blob in list(created_bucket.list_blobs()):
+            leftover_blob.delete()
+        created_bucket.delete()
+    except NotFound:
+        print("Bucket already deleted.")
+
+
+def test_batch_process_documents_processor_version(capsys, test_bucket):
+    """Run the sample end to end (no mocks) and check what it prints."""
+    sample = batch_process_documents_processor_version_sample
+    sample.batch_process_documents_processor_version(
+        project_id=project_id,
+        location=location,
+        processor_id=processor_id,
+        processor_version_id=processor_version_id,
+        gcs_input_uri=gcs_input_uri,
+        input_mime_type=input_mime_type,
+        gcs_output_bucket=f"gs://{test_bucket}",
+        gcs_output_uri_prefix=gcs_output_uri_prefix,
+    )
+    stdout, _ = capsys.readouterr()
+    for expected in ("operation", "Fetching", "text:"):
+        assert expected in stdout
diff --git a/samples/snippets/batch_process_documents_sample.py b/samples/snippets/batch_process_documents_sample.py
@@ -23,7 +23,6 @@
 # project_id = 'YOUR_PROJECT_ID'
 # location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
 # processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
-# processor_version = "pretrained" # Optional. Processor version to use
 # gcs_input_uri = "YOUR_INPUT_URI" # Format: gs://bucket/directory/file.pdf
 # input_mime_type = "application/pdf"
 # gcs_output_bucket = "YOUR_OUTPUT_BUCKET_NAME" # Format: gs://bucket
@@ -73,17 +72,8 @@ def batch_process_documents(
 
     # The full resource name of the processor, e.g.:
     # projects/project_id/locations/location/processors/processor_id
-    # You must create new processors in the Cloud Console first
     name = client.processor_path(project_id, location, processor_id)
 
-    # NOTE: Alternatively, specify the processor_version to specify a particular version of the processor to use
-    # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processorVersion}
-    #
-    # name = client.processor_version_path(
-    #     project_id, location, processor_id, processor_version
-    # )
-    #
-
     request = documentai.BatchProcessRequest(
         name=name,
         input_documents=input_config,

diff --git a/samples/snippets/batch_process_documents_sample_bad_input_test.py b/samples/snippets/batch_process_documents_sample_bad_input_test.py
@@ -44,4 +44,4 @@ def test_batch_process_documents_with_bad_input(capsys):
         out, _ = capsys.readouterr()
         assert "Failed" in out
     except Exception as e:
-        assert "Internal error" in e.message
+        assert "Failed" in e.message
diff --git a/samples/snippets/delete_processor_version_sample.py b/samples/snippets/delete_processor_version_sample.py
@@ -0,0 +1,58 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# [START documentai_delete_processor_version]
+
+from google.api_core.client_options import ClientOptions
+from google.api_core.exceptions import FailedPrecondition, InvalidArgument
+from google.cloud import documentai
+
+# TODO(developer): Uncomment these variables before running the sample.
+# project_id = 'YOUR_PROJECT_ID'
+# location = 'YOUR_PROCESSOR_LOCATION' # Format is 'us' or 'eu'
+# processor_id = 'YOUR_PROCESSOR_ID' # Create processor before running sample
+# processor_version_id = 'YOUR_PROCESSOR_VERSION_ID'
+
+
+def delete_processor_version_sample(
+    project_id: str, location: str, processor_id: str, processor_version_id: str
+):
+    """Delete a processor version; print the operation name and any caught error."""
+    # You must set the api_endpoint if you use a location other than 'us', e.g.:
+    endpoint = f"{location}-documentai.googleapis.com"
+    client = documentai.DocumentProcessorServiceClient(
+        client_options=ClientOptions(api_endpoint=endpoint)
+    )
+
+    # The full resource name of the processor version
+    # e.g.: projects/project_id/locations/location/processors/processor_id/processorVersions/processor_version_id
+    version_name = client.processor_version_path(
+        project_id, location, processor_id, processor_version_id
+    )
+
+    # The request fails if the processor version doesn't exist, is a
+    # pretrained version, or is the default processor version
+    try:
+        # Make DeleteProcessorVersion request
+        delete_operation = client.delete_processor_version(name=version_name)
+        # Print operation details
+        print(delete_operation.operation.name)
+        # Wait for operation to complete
+        delete_operation.result()
+    except (FailedPrecondition, InvalidArgument) as api_error:
+        print(api_error.message)
+
+
+# [END documentai_delete_processor_version]
diff --git a/samples/snippets/delete_processor_version_sample_test.py b/samples/snippets/delete_processor_version_sample_test.py
@@ -0,0 +1,47 @@
+# Copyright 2022 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import os
+
+import mock
+from samples.snippets import delete_processor_version_sample
+
+location = "us"
+project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
+processor_id = "aaaaaaaaa"  # placeholder ID; the delete call is mocked below
+processor_version_id = "xxxxxxxxxx"  # placeholder ID; the delete call is mocked below
+
+
+@mock.patch(
+    "google.cloud.documentai.DocumentProcessorServiceClient.delete_processor_version"
+)
+@mock.patch("google.api_core.operation.Operation")
+def test_delete_processor_version(
+    operation_mock, delete_processor_version_mock, capsys
+):
+    """Check the sample issues exactly one (mocked) delete call and prints output."""
+    # The patched client method hands the sample a mocked operation
+    delete_processor_version_mock.return_value = operation_mock
+
+    delete_processor_version_sample.delete_processor_version_sample(
+        project_id=project_id,
+        location=location,
+        processor_id=processor_id,
+        processor_version_id=processor_version_id,
+    )
+
+    stdout, _ = capsys.readouterr()
+    delete_processor_version_mock.assert_called_once()
+    assert "operation" in stdout