Skip to content
Permalink
Browse files
docs(samples): new Doc AI samples for v1beta3 (#44)
* batch_process_sample. changing from async to synchronous

* add quick start and process_document samples and tests

* add test and sample for batch_process

* add test and sample for batch_process

* resolve formatting

* use os.environ

* remove os.path.join

* move tests

* descriptive variable

* specific Exception, formatting

* parse all pages in process_document

* add more helpful comments

* remove unused imports

* better exception handling

* rename test files

* ran linter, removed nested function in batch predict

* refactor tests

* format imports

* format imports

* format imports

* serialize as Document object

* extract get_text helper function

* fix file path

* delete test bucket

* Update samples/snippets/batch_process_documents_sample_v1beta3_test.py

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>

* Update samples/snippets/batch_process_documents_sample_v1beta3_test.py

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>

* add more specific assertion in batch_process

* add more specific assertion in process_document and quickstart

* fix output_uri name

* Apply suggestions from code review to resolve exception

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>

* resolve exception

* lint

Co-authored-by: Leah E. Cole <6719667+leahecole@users.noreply.github.com>
  • Loading branch information
aribray and leahecole committed Oct 21, 2020
1 parent 5162674 commit cc8c58d1bade4be53fde08f6a3497eb3f79f63b1
Empty file.
Empty file.
@@ -0,0 +1,121 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# [START documentai_batch_process_document]
import re

from google.cloud import documentai_v1beta3 as documentai
from google.cloud import storage

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION' # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID' # Create processor in Cloud Console
# gcs_input_uri = "YOUR_INPUT_URI"
# gcs_output_uri = "YOUR_OUTPUT_BUCKET_URI"
# gcs_output_uri_prefix = "YOUR_OUTPUT_URI_PREFIX"


def batch_process_documents(
    project_id,
    location,
    processor_id,
    gcs_input_uri,
    gcs_output_uri,
    gcs_output_uri_prefix,
):
    """Batch-process one GCS document and print the extracted content.

    Submits a batch process request to the given processor, waits for the
    operation to finish, then downloads every blob written under the output
    prefix, parses it as a Document and prints its form fields and paragraphs.

    Args:
        project_id: Cloud project that owns the processor.
        location: Processor location; 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        gcs_input_uri: URI of the input file; it is submitted with the
            'application/pdf' MIME type.
        gcs_output_uri: Output bucket URI, e.g. "gs://my-bucket".
        gcs_output_uri_prefix: Object prefix under the output bucket where
            the results are written.

    Raises:
        ValueError: If the combined output URI is not of the form
            "gs://<bucket>/<prefix>".
    """
    destination_uri = f"{gcs_output_uri}/{gcs_output_uri_prefix}/"

    # Split the destination into bucket and object prefix up front, so that a
    # malformed URI fails with a clear message before the long-running
    # operation is started (instead of an AttributeError on None afterwards).
    match = re.match(r"gs://([^/]+)/(.+)", destination_uri)
    if match is None:
        raise ValueError(
            f"Output URI must look like 'gs://<bucket>/<prefix>', got: {destination_uri}"
        )
    output_bucket, prefix = match.groups()

    client = documentai.DocumentProcessorServiceClient()

    # 'mime_type' can be 'application/pdf', 'image/tiff',
    # and 'image/gif', or 'application/json'
    input_config = documentai.types.document_processor_service.BatchProcessRequest.BatchInputConfig(
        gcs_source=gcs_input_uri, mime_type="application/pdf"
    )

    # Where to write results
    output_config = documentai.types.document_processor_service.BatchProcessRequest.BatchOutputConfig(
        gcs_destination=destination_uri
    )

    # Location can be 'us' or 'eu'
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = documentai.types.document_processor_service.BatchProcessRequest(
        name=name,
        input_configs=[input_config],
        output_config=output_config,
    )

    operation = client.batch_process_documents(request)

    # Wait for the operation to finish
    operation.result()

    # Results are written to GCS; list everything under the output prefix.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(output_bucket)
    blob_list = list(bucket.list_blobs(prefix=prefix))
    print("Output files:")

    for i, blob in enumerate(blob_list):
        # Download the contents of this blob as a bytes object.
        blob_as_bytes = blob.download_as_bytes()
        document = documentai.types.Document.from_json(blob_as_bytes)

        print(f"Fetched file {i + 1}")

        # For a full list of Document object attributes, please reference this page:
        # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

        # Read the text recognition output from the processor
        for page in document.pages:
            for form_field in page.form_fields:
                field_name = get_text(form_field.field_name, document)
                field_value = get_text(form_field.field_value, document)
                print("Extracted key value pair:")
                print(f"\t{field_name}, {field_value}")
            # Iterate the paragraphs of *this* page; looping over
            # document.pages here would print whole-page layouts instead.
            for paragraph in page.paragraphs:
                paragraph_text = get_text(paragraph.layout, document)
                print(f"Paragraph text:\n{paragraph_text}")


# Extract shards from the text field
def get_text(doc_element, document) -> str:
    """Return the document text that a layout element points at.

    Document AI identifies form fields and layout elements by their offsets
    in the document text. This function converts those offsets into the
    corresponding text snippet.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
        document: Object exposing the full text as ``document.text``.

    Returns:
        The concatenation of the text covered by every segment, in order.
        An element without segments yields an empty string.
    """
    full_text = document.text
    # If a text segment spans several lines, it will be stored in different
    # text segments, so every segment is sliced with its own offsets.
    # A falsy start_index (unset/0) means "from the beginning of the text".
    return "".join(
        full_text[int(segment.start_index or 0):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_batch_process_document]
@@ -0,0 +1,62 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os
from uuid import uuid4

from google.cloud import storage
from google.cloud.exceptions import NotFound

import pytest

from samples.snippets import batch_process_documents_sample_v1beta3

# Inputs shared by the test below.
location = "us"
# Read from the environment; raises KeyError at import time if it is not set.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# Hard-coded ID of the processor the sample is run against.
processor_id = "90484cfdedb024f6"
gcs_input_uri = "gs://cloud-samples-data/documentai/invoice.pdf"
# NOTE(review): this is a UUID object rather than a str; the sample only
# interpolates it into an f-string, so it is stringified there.
gcs_output_uri_prefix = uuid4()
# Random suffix gives each test module run its own bucket name.
BUCKET_NAME = f"document-ai-python-{uuid4()}"


@pytest.fixture(scope="module")
def test_bucket():
    """Create a bucket for this test module and remove it afterwards.

    Yields:
        The name of the newly created bucket.
    """
    gcs = storage.Client()
    created = gcs.create_bucket(BUCKET_NAME)
    yield created.name

    # Teardown: delete every object first, then the bucket itself.
    try:
        for leftover in list(created.list_blobs()):
            leftover.delete()
        created.delete()
    except NotFound:
        print("Bucket already deleted.")


def test_batch_process_documents(capsys, test_bucket):
    """Run the batch sample against the test bucket and check its output."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "gcs_input_uri": gcs_input_uri,
        "gcs_output_uri": f"gs://{test_bucket}",
        "gcs_output_uri_prefix": gcs_output_uri_prefix,
    }
    batch_process_documents_sample_v1beta3.batch_process_documents(**sample_kwargs)

    captured, _ = capsys.readouterr()

    # The sample prints form fields, paragraphs and the invoice text.
    for expected in ("Extracted", "Paragraph", "Invoice"):
        assert expected in captured
@@ -37,24 +37,22 @@

TEST_CONFIG = {
# You can opt out from the test for specific Python versions.
'ignored_versions': ["2.7"],

"ignored_versions": ["2.7"],
# An envvar key for determining the project id to use. Change it
# to 'BUILD_SPECIFIC_GCLOUD_PROJECT' if you want to opt in using a
# build specific Cloud project. You can also use your own string
# to use your own Cloud project.
'gcloud_project_env': 'GOOGLE_CLOUD_PROJECT',
"gcloud_project_env": "GOOGLE_CLOUD_PROJECT",
# 'gcloud_project_env': 'BUILD_SPECIFIC_GCLOUD_PROJECT',

# A dictionary you want to inject into your test. Don't put any
# secrets here. These values will override predefined values.
'envs': {},
"envs": {},
}


try:
# Ensure we can import noxfile_config in the project's directory.
sys.path.append('.')
sys.path.append(".")
from noxfile_config import TEST_CONFIG_OVERRIDE
except ImportError as e:
print("No user noxfile_config found: detail: {}".format(e))
@@ -69,13 +67,13 @@ def get_pytest_env_vars():
ret = {}

# Override the GCLOUD_PROJECT and the alias.
env_key = TEST_CONFIG['gcloud_project_env']
env_key = TEST_CONFIG["gcloud_project_env"]
# This should error out if not set.
ret['GOOGLE_CLOUD_PROJECT'] = os.environ[env_key]
ret['GCLOUD_PROJECT'] = os.environ[env_key] # deprecated
ret["GOOGLE_CLOUD_PROJECT"] = os.environ[env_key]
ret["GCLOUD_PROJECT"] = os.environ[env_key] # deprecated

# Apply user supplied envs.
ret.update(TEST_CONFIG['envs'])
ret.update(TEST_CONFIG["envs"])
return ret


@@ -84,7 +82,7 @@ def get_pytest_env_vars():
ALL_VERSIONS = ["2.7", "3.6", "3.7", "3.8"]

# Any default versions that should be ignored.
IGNORED_VERSIONS = TEST_CONFIG['ignored_versions']
IGNORED_VERSIONS = TEST_CONFIG["ignored_versions"]

TESTED_VERSIONS = sorted([v for v in ALL_VERSIONS if v not in IGNORED_VERSIONS])

@@ -138,7 +136,7 @@ def lint(session):
args = FLAKE8_COMMON_ARGS + [
"--application-import-names",
",".join(local_names),
"."
".",
]
session.run("flake8", *args)

@@ -147,6 +145,7 @@ def lint(session):
# Black
#


@nox.session
def blacken(session):
session.install("black")
@@ -194,9 +193,9 @@ def py(session):
if session.python in TESTED_VERSIONS:
_session_tests(session)
else:
session.skip("SKIPPED: {} tests are disabled for this sample.".format(
session.python
))
session.skip(
"SKIPPED: {} tests are disabled for this sample.".format(session.python)
)


#
@@ -0,0 +1,88 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from google.cloud import documentai_v1beta3 as documentai

# [START documentai_process_document]

# TODO(developer): Uncomment these variables before running the sample.
# project_id = 'YOUR_PROJECT_ID'
# location = 'YOUR_PROJECT_LOCATION'  # Format is 'us' or 'eu'
# processor_id = 'YOUR_PROCESSOR_ID'  # Create processor in Cloud Console
# file_path = '/path/to/local/pdf'


def process_document_sample(
    project_id: str, location: str, processor_id: str, file_path: str
):
    """Send a local PDF to a Document AI processor and print its paragraphs.

    Args:
        project_id: Cloud project that owns the processor.
        location: Processor location; 'us' or 'eu'.
        processor_id: ID of a processor created in the Cloud Console.
        file_path: Path to the local file, which is sent with the
            'application/pdf' MIME type.
    """
    docai_client = documentai.DocumentProcessorServiceClient()

    # Full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    processor_name = (
        f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    )

    # Load the whole file into memory as raw bytes.
    with open(file_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()

    # Build the request inline and submit it to the processor.
    response = docai_client.process_document(
        request={
            "name": processor_name,
            "document": {"content": raw_bytes, "mime_type": "application/pdf"},
        }
    )
    processed = response.document

    print("Document processing complete.")

    # For a full list of Document object attributes, please reference this page:
    # https://googleapis.dev/python/documentai/latest/_modules/google/cloud/documentai_v1beta3/types/document.html#Document

    # Walk every page and print the text of each paragraph on it.
    print("The document contains the following paragraphs:")
    for page in processed.pages:
        for paragraph in page.paragraphs:
            paragraph_text = get_text(paragraph.layout, processed)
            print(f"Paragraph text: {paragraph_text}")


# Extract shards from the text field
def get_text(doc_element, document) -> str:
    """Return the document text that a layout element points at.

    Document AI identifies form fields and layout elements by their offsets
    in the document text. This function converts those offsets into the
    corresponding text snippet.

    Args:
        doc_element: Object exposing ``text_anchor.text_segments``; each
            segment carries ``start_index`` and ``end_index`` offsets.
        document: Object exposing the full text as ``document.text``.

    Returns:
        The concatenation of the text covered by every segment, in order.
        An element without segments yields an empty string.
    """
    full_text = document.text
    # If a text segment spans several lines, it will be stored in different
    # text segments, so every segment is sliced with its own offsets.
    # A falsy start_index (unset/0) means "from the beginning of the text".
    return "".join(
        full_text[int(segment.start_index or 0):int(segment.end_index)]
        for segment in doc_element.text_anchor.text_segments
    )


# [END documentai_process_document]
@@ -0,0 +1,37 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import os

from samples.snippets import process_document_sample_v1beta3


# Inputs shared by the test below.
location = "us"
# Read from the environment; raises KeyError at import time if it is not set.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# Hard-coded ID of the processor the sample is run against.
processor_id = "90484cfdedb024f6"
# Relative path to the sample invoice.
# NOTE(review): presumably resolved against the directory the tests run from; confirm.
file_path = "resources/invoice.pdf"


def test_process_documents(capsys):
    """Run the online-processing sample and check what it printed."""
    sample_kwargs = {
        "project_id": project_id,
        "location": location,
        "processor_id": processor_id,
        "file_path": file_path,
    }
    process_document_sample_v1beta3.process_document_sample(**sample_kwargs)

    captured, _ = capsys.readouterr()

    # The sample prints paragraph headers followed by the invoice text.
    for expected in ("Paragraph", "Invoice"):
        assert expected in captured

0 comments on commit cc8c58d

Please sign in to comment.