Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add Import Document from Batch Process Metadata & Operation #88

Merged
merged 33 commits into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
0f0f702
feat: Add utility functions for GCS URIs
holtskinner Mar 24, 2023
059f676
test: Add tests for gcs_uri functions.
holtskinner Mar 24, 2023
5dce30c
feat: Add Import Document from batch process metadata
holtskinner Mar 24, 2023
5486543
refactor: Moved `_get_storage_client` and `get_bytes` to utilities
holtskinner Mar 24, 2023
0639ac5
test: Attempt to fix mock patch
holtskinner Mar 24, 2023
ccdf954
test: Attempt to fix test import errors
holtskinner Mar 27, 2023
646ec40
Change utility import in converter_helpers.py
holtskinner Mar 27, 2023
9a2758f
Update utilities import
holtskinner Mar 27, 2023
3d48aaa
Added Inline samples for `from_document_path()` and `from_documentai_…
holtskinner Mar 27, 2023
0780e4a
feat: Add utility functions for GCS URIs
holtskinner Mar 24, 2023
92a4154
test: Add tests for gcs_uri functions.
holtskinner Mar 24, 2023
081d420
feat: Add Import Document from batch process metadata
holtskinner Mar 24, 2023
93eceda
refactor: Moved `_get_storage_client` and `get_bytes` to utilities
holtskinner Mar 24, 2023
fe9f83a
test: Attempt to fix mock patch
holtskinner Mar 24, 2023
4b4f8d6
test: Attempt to fix test import errors
holtskinner Mar 27, 2023
21fa38c
Change utility import in converter_helpers.py
holtskinner Mar 27, 2023
7955f91
Update utilities import
holtskinner Mar 27, 2023
0bad4d4
Added Inline samples for `from_document_path()` and `from_documentai_…
holtskinner Mar 27, 2023
2803d20
Merge branch 'lint_fix' of https://github.com/googleapis/python-docum…
holtskinner Mar 27, 2023
d75eddc
test: Add check for Failed BatchProcessMetadata
holtskinner Mar 27, 2023
bac637e
fix: Update imports based on Gal's feedback
holtskinner Mar 27, 2023
436508f
Merge branch 'main' into lint_fix
holtskinner Mar 29, 2023
74801fd
Merge branch 'main' into lint_fix
holtskinner Mar 30, 2023
f0a73d1
refactor: Rename `utilities.py` to `gcs_utilities.py`
holtskinner Apr 3, 2023
3f635a5
Add alias for gcs_utilities in `__init__.py`
holtskinner Apr 3, 2023
f076b8c
Update mock.patch for gcs_utilities in `test_converter.py`
holtskinner Apr 3, 2023
37d07e3
Removed alias for gcs_utilities. Changed Samples to follow
holtskinner Apr 3, 2023
d7690a6
Merge branch 'main' into lint_fix
holtskinner Apr 3, 2023
fe41729
Added `Document.from_batch_process_operation()`
holtskinner Apr 4, 2023
b9ddb55
Merge branch 'main' into lint_fix
holtskinner Apr 6, 2023
9133cc8
Merge branch 'main' of https://github.com/googleapis/python-documenta…
holtskinner Apr 6, 2023
e89056d
Fixed mock.patch for `get_bytes_images_mock`
holtskinner Apr 6, 2023
0712c84
Remove underscore from `get_bytes` in `get_bytes_images_mock`
holtskinner Apr 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/documentai_toolbox/utilities.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Document AI Toolbox Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. automodule:: google.cloud.documentai_toolbox.utilities.utilities
.. automodule:: google.cloud.documentai_toolbox.utilities.gcs_utilities
:members:
:private-members:
:noindex:
4 changes: 2 additions & 2 deletions google/cloud/documentai_toolbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
)

from .utilities import (
utilities,
gcs_utilities,
)

__all__ = (document, page, entity, converter, utilities)
__all__ = (document, page, entity, converter, gcs_utilities)
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@
_load_blocks_from_schema,
)

from google.cloud.documentai_toolbox import document, constants
from google.cloud.documentai_toolbox import constants
from google.cloud.documentai_toolbox.utilities import gcs_utilities

from google.cloud import documentai, storage


Expand Down Expand Up @@ -86,7 +88,6 @@ def _get_entity_content(
entity_id = 0

for block in blocks:

docai_entity = documentai.Document.Entity()
if block.confidence:
docai_entity.confidence = block.confidence
Expand Down Expand Up @@ -233,7 +234,7 @@ def _get_bytes(

"""

storage_client = document._get_storage_client()
storage_client = gcs_utilities._get_storage_client()
bucket = storage_client.bucket(bucket_name=bucket_name)
blobs = storage_client.list_blobs(bucket_or_name=bucket_name, prefix=prefix)

Expand Down Expand Up @@ -287,7 +288,7 @@ def _upload_file(
None.

"""
storage_client = document._get_storage_client()
storage_client = gcs_utilities._get_storage_client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(output_prefix)

Expand Down Expand Up @@ -494,7 +495,7 @@ def _convert_documents_with_config(
if file_check:
raise ValueError("gcs_prefix cannot contain file types")

storage_client = document._get_storage_client()
storage_client = gcs_utilities._get_storage_client()

blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,113 @@
"""Document AI utilities."""
import os
import re
from typing import Dict, List, Optional
from typing import Dict, List, Tuple

from google.api_core import client_info
from google.cloud import documentai
from google.cloud import storage
from google.cloud import documentai_toolbox

from google.cloud.documentai_toolbox import constants
from google.cloud.documentai_toolbox.wrappers.document import _get_storage_client


def _get_storage_client():
    r"""Returns a Cloud Storage client carrying the toolbox user agent.

    The custom ``ClientInfo`` tags outgoing requests so that usage of the
    Document AI Toolbox is attributable in request headers.

    Returns:
        storage.Client:
            A Storage client with the custom user agent header attached.

    """
    user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}"
    return storage.Client(
        client_info=client_info.ClientInfo(
            client_library_version=documentai_toolbox.__version__,
            user_agent=user_agent,
        )
    )


def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
    r"""Returns a list of bytes of json files from Cloud Storage.

    Args:
        gcs_bucket_name (str):
            Required. The name of the gcs bucket.

            Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`.
        gcs_prefix (str):
            Required. The prefix of the json files in the target_folder

            Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
    Returns:
        List[bytes]:
            A list of bytes.

    """
    storage_client = _get_storage_client()

    def _is_json(blob) -> bool:
        # A blob counts as JSON if it has a .json extension OR an explicit
        # JSON content type (either signal alone is accepted).
        return (
            blob.name.endswith(constants.JSON_EXTENSION)
            or blob.content_type == constants.JSON_MIMETYPE
        )

    return [
        blob.download_as_bytes()
        for blob in storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
        if _is_json(blob)
    ]


def split_gcs_uri(gcs_uri: str) -> Tuple[str, str]:
    r"""Splits a Cloud Storage uri into the bucket_name and prefix.

    Args:
        gcs_uri (str):
            Required. The full Cloud Storage URI.

            Format: `gs://{bucket_name}/{gcs_prefix}`.
    Returns:
        Tuple[str, str]:
            The Cloud Storage Bucket and Prefix.

    Raises:
        ValueError: If `gcs_uri` does not follow the
            `gs://{bucket_name}/{gcs_prefix}` format.

    """
    # `fullmatch` with DOTALL requires the entire uri to be consumed —
    # `re.match("gs://(.*?)/(.*)", ...)` would silently truncate a prefix at
    # the first newline. `[^/]+` rejects an empty bucket name (`gs:///x`),
    # which the lazy `(.*?)` previously accepted.
    matches = re.fullmatch(r"gs://([^/]+)/(.*)", gcs_uri, re.DOTALL)

    if not matches:
        raise ValueError(
            "gcs_uri must follow format 'gs://{bucket_name}/{gcs_prefix}'."
        )
    bucket, prefix = matches.groups()
    return str(bucket), str(prefix)


def create_gcs_uri(gcs_bucket_name: str, gcs_prefix: str) -> str:
    r"""Creates a Cloud Storage uri from the bucket_name and prefix.

    Inverse of `split_gcs_uri()`: joins the two components back into a
    single `gs://` URI without validating either part.

    Args:
        gcs_bucket_name (str):
            Required. The name of the gcs bucket.

            Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`bucket`.
        gcs_prefix (str):
            Required. The prefix of the files in the target_folder.

            Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
    Returns:
        str
            The full Cloud Storage uri.
            Format: `gs://{gcs_bucket_name}/{gcs_prefix}`

    """
    gcs_uri = f"gs://{gcs_bucket_name}/{gcs_prefix}"
    return gcs_uri


def list_gcs_document_tree(
gcs_bucket_name: str, gcs_prefix: str
) -> Dict[str, List[str]]:
r"""Returns a list path to files in Cloud Storage folder and prints the tree to terminal.
r"""Returns a list path to files in Cloud Storage folder.

Args:
gcs_bucket_name (str):
Expand Down Expand Up @@ -64,8 +159,10 @@ def list_gcs_document_tree(
return path_list


def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
r"""Prints a tree of filenames in Cloud Storage folder.
def print_gcs_document_tree(
gcs_bucket_name: str, gcs_prefix: str, files_to_display: int = 4
) -> None:
r"""Prints a tree of filenames in a Cloud Storage folder.

Args:
gcs_bucket_name (str):
Expand All @@ -76,13 +173,14 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
Required. The prefix of the json files in the target_folder.

Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
files_to_display (int):
Optional. The amount of files to display. Default is `4`.
Returns:
None.

"""
FILENAME_TREE_MIDDLE = "├──"
FILENAME_TREE_LAST = "└──"
FILES_TO_DISPLAY = 4

path_list = list_gcs_document_tree(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
Expand All @@ -93,18 +191,18 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
dir_size = len(files)
for idx, file_name in enumerate(files):
if idx == dir_size - 1:
if dir_size > FILES_TO_DISPLAY:
if dir_size > files_to_display:
print("│ ....")
print(f"{FILENAME_TREE_LAST}{file_name}\n")
break
if idx <= FILES_TO_DISPLAY:
if idx <= files_to_display:
print(f"{FILENAME_TREE_MIDDLE}{file_name}")


def create_batches(
gcs_bucket_name: str,
gcs_prefix: str,
batch_size: Optional[int] = constants.BATCH_MAX_FILES,
batch_size: int = constants.BATCH_MAX_FILES,
) -> List[documentai.BatchDocumentsInputConfig]:
"""Create batches of documents in Cloud Storage to process with `batch_process_documents()`.

Expand All @@ -117,7 +215,7 @@ def create_batches(
Required. The prefix of the json files in the `target_folder`

Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`.
batch_size (Optional[int]):
batch_size (int):
Optional. Size of each batch of documents. Default is `50`.

Returns:
Expand All @@ -143,7 +241,7 @@ def create_batches(
print(f"Skipping file {blob.name}. Invalid Mime Type {blob.content_type}.")
continue

if blob.size > constants.BATCH_MAX_FILE_SIZE:
if int(blob.size) > constants.BATCH_MAX_FILE_SIZE:
print(
f"Skipping file {blob.name}. File size must be less than {constants.BATCH_MAX_FILE_SIZE} bytes. File size is {blob.size} bytes."
)
Expand All @@ -159,7 +257,7 @@ def create_batches(

batch.append(
documentai.GcsDocument(
gcs_uri=f"gs://{gcs_bucket_name}/{blob.name}",
gcs_uri=create_gcs_uri(gcs_bucket_name, blob.name),
mime_type=blob.content_type,
)
)
Expand Down