feat: Add Import Document from Batch Process Metadata & Operation (#88)
* feat: Add utility functions for GCS URIs

- Updates to types to fix lint errors
- Add `files_to_display` optional parameter to `print_document_tree()`
- Other formatting improvements for docs.

* test: Add tests for gcs_uri functions.

* feat: Add Import Document from batch process metadata

* refactor: Moved `_get_storage_client` and `get_bytes` to utilities

- Resolves Circular dependency

* test: Attempt to fix mock patch

* test: Attempt to fix test import errors

* Change utility import in converter_helpers.py

* Update utilities import

* Added Inline samples for `from_document_path()` and `from_documentai_document()`

* test: Add check for Failed BatchProcessMetadata

* fix: Update imports based on Gal's feedback

* refactor: Rename `utilities.py` to `gcs_utilities.py`

* Add alias for gcs_utilities in `__init__.py`

* Update mock.patch for gcs_utilities in `test_converter.py`

* Removed alias for gcs_utilities. Changed Samples to follow

* Added `Document.from_batch_process_operation()` (usage sketch below)
- Gets operation information and passes it to `from_batch_process_metadata()`

* Fixed mock.patch for `get_bytes_images_mock`

* Remove underscore from `get_bytes` in `get_bytes_images_mock`
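
A minimal usage sketch of the new entry points; the method names come from this commit, while the project, location, and operation values are placeholders:

```python
from google.cloud.documentai_toolbox import document

# Hypothetical operation name from an earlier batch_process_documents() call.
operation_name = "projects/123456789/locations/us/operations/1234567890123456789"

# New in this commit: wrap all output documents of a finished batch
# operation. Internally this fetches the operation's BatchProcessMetadata
# and delegates to Document.from_batch_process_metadata().
wrapped_documents = document.Document.from_batch_process_operation(
    location="us", operation_name=operation_name
)

for doc in wrapped_documents:
    print(doc.gcs_bucket_name, doc.gcs_prefix)
```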
holtskinner committed Apr 7, 2023
1 parent 383e105 commit f95bbea
Showing 12 changed files with 500 additions and 166 deletions.
2 changes: 1 addition & 1 deletion docs/documentai_toolbox/utilities.rst
@@ -1,7 +1,7 @@
Document AI Toolbox Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. automodule:: google.cloud.documentai_toolbox.utilities.utilities
.. automodule:: google.cloud.documentai_toolbox.utilities.gcs_utilities
:members:
:private-members:
:noindex:
4 changes: 2 additions & 2 deletions google/cloud/documentai_toolbox/__init__.py
@@ -29,7 +29,7 @@
)

from .utilities import (
utilities,
gcs_utilities,
)

__all__ = (document, page, entity, converter, utilities)
__all__ = (document, page, entity, converter, gcs_utilities)
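
With the renamed module exported here, downstream code can import it straight from the package root; a small sketch using functions added later in this diff:

```python
from google.cloud.documentai_toolbox import gcs_utilities

bucket, prefix = gcs_utilities.split_gcs_uri("gs://my-bucket/folder/file.json")
print(bucket, prefix)  # my-bucket folder/file.json
```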
google/cloud/documentai_toolbox/converter/converter_helpers.py
@@ -28,7 +28,9 @@
_load_blocks_from_schema,
)

from google.cloud.documentai_toolbox import document, constants
from google.cloud.documentai_toolbox import constants
from google.cloud.documentai_toolbox.utilities import gcs_utilities

from google.cloud import documentai, storage


@@ -86,7 +88,6 @@ def _get_entity_content(
entity_id = 0

for block in blocks:

docai_entity = documentai.Document.Entity()
if block.confidence:
docai_entity.confidence = block.confidence
@@ -233,7 +234,7 @@ def _get_bytes(
"""

storage_client = document._get_storage_client()
storage_client = gcs_utilities._get_storage_client()
bucket = storage_client.bucket(bucket_name=bucket_name)
blobs = storage_client.list_blobs(bucket_or_name=bucket_name, prefix=prefix)

@@ -287,7 +288,7 @@ def _upload_file(
None.
"""
storage_client = document._get_storage_client()
storage_client = gcs_utilities._get_storage_client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(output_prefix)

@@ -494,7 +495,7 @@ def _convert_documents_with_config(
if file_check:
raise ValueError("gcs_prefix cannot contain file types")

storage_client = document._get_storage_client()
storage_client = gcs_utilities._get_storage_client()

blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix)
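
The `storage_client` changes above are the substance of the circular-dependency fix from the commit message: `converter_helpers.py` no longer imports the `document` wrapper just to build a storage client; both modules can now depend on the leaf `gcs_utilities` module instead. A simplified sketch of the resulting import direction (dependency arrows inferred from this diff):

```python
# converter/converter_helpers.py -> utilities/gcs_utilities.py
# wrappers/document.py           -> utilities/gcs_utilities.py
# utilities/gcs_utilities.py imports no toolbox wrappers (a leaf module)

from google.cloud.documentai_toolbox.utilities import gcs_utilities

storage_client = gcs_utilities._get_storage_client()
```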

google/cloud/documentai_toolbox/utilities/{utilities.py → gcs_utilities.py}
@@ -16,18 +16,113 @@
"""Document AI utilities."""
import os
import re
from typing import Dict, List, Optional
from typing import Dict, List, Tuple

from google.api_core import client_info
from google.cloud import documentai
from google.cloud import storage
from google.cloud import documentai_toolbox

from google.cloud.documentai_toolbox import constants
from google.cloud.documentai_toolbox.wrappers.document import _get_storage_client


def _get_storage_client():
r"""Returns a Storage client with custom user agent header.
Returns:
storage.Client.
"""
user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}"

info = client_info.ClientInfo(
client_library_version=documentai_toolbox.__version__,
user_agent=user_agent,
)

return storage.Client(client_info=info)


def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
r"""Returns a list of bytes of json files from Cloud Storage.
Args:
gcs_bucket_name (str):
Required. The name of the gcs bucket.
Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`{bucket_name}`.
gcs_prefix (str):
Required. The prefix of the json files in the target_folder
Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
Returns:
List[bytes]:
A list of bytes.
"""
result = []

storage_client = _get_storage_client()
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)

for blob in blob_list:
if (
blob.name.endswith(constants.JSON_EXTENSION)
or blob.content_type == constants.JSON_MIMETYPE
):
result.append(blob.download_as_bytes())

return result
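
A short usage sketch for `get_bytes` (bucket and prefix are hypothetical):

```python
# Download every Document AI JSON shard under a prefix as raw bytes.
json_shards = get_bytes(
    gcs_bucket_name="my-bucket", gcs_prefix="processed/invoice-1/"
)
print(f"Downloaded {len(json_shards)} JSON files")
```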


def split_gcs_uri(gcs_uri: str) -> Tuple[str, str]:
r"""Splits a Cloud Storage uri into the bucket_name and prefix.
Args:
gcs_uri (str):
Required. The full Cloud Storage URI.
Format: `gs://{bucket_name}/{gcs_prefix}`.
Returns:
Tuple[str, str]:
The Cloud Storage Bucket and Prefix.
"""
matches = re.match("gs://(.*?)/(.*)", gcs_uri)

if not matches:
raise ValueError(
"gcs_uri must follow format 'gs://{bucket_name}/{gcs_prefix}'."
)
bucket, prefix = matches.groups()
return str(bucket), str(prefix)


def create_gcs_uri(gcs_bucket_name: str, gcs_prefix: str) -> str:
r"""Creates a Cloud Storage uri from the bucket_name and prefix.
Args:
gcs_bucket_name (str):
Required. The name of the gcs bucket.
Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_bucket_name=`{bucket_name}`.
gcs_prefix (str):
Required. The prefix of the files in the target_folder.
Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
Returns:
str:
The full Cloud Storage URI.
Format: `gs://{gcs_bucket_name}/{gcs_prefix}`.
"""
return f"gs://{gcs_bucket_name}/{gcs_prefix}"


def list_gcs_document_tree(
gcs_bucket_name: str, gcs_prefix: str
) -> Dict[str, List[str]]:
r"""Returns a list path to files in Cloud Storage folder and prints the tree to terminal.
r"""Returns a list path to files in Cloud Storage folder.
Args:
gcs_bucket_name (str):
@@ -64,8 +159,10 @@ def list_gcs_document_tree(
return path_list


def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
r"""Prints a tree of filenames in Cloud Storage folder..
def print_gcs_document_tree(
gcs_bucket_name: str, gcs_prefix: str, files_to_display: int = 4
) -> None:
r"""Prints a tree of filenames in a Cloud Storage folder.
Args:
gcs_bucket_name (str):
@@ -76,13 +173,14 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
Required. The prefix of the json files in the target_folder.
Format: `gs://{bucket_name}/{optional_folder}/{target_folder}/` where gcs_prefix=`{optional_folder}/{target_folder}`.
files_to_display (int):
Optional. The number of files to display. Default is `4`.
Returns:
None.
"""
FILENAME_TREE_MIDDLE = "├──"
FILENAME_TREE_LAST = "└──"
FILES_TO_DISPLAY = 4

path_list = list_gcs_document_tree(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
@@ -93,18 +191,18 @@ def print_gcs_document_tree(gcs_bucket_name: str, gcs_prefix: str) -> None:
dir_size = len(files)
for idx, file_name in enumerate(files):
if idx == dir_size - 1:
if dir_size > FILES_TO_DISPLAY:
if dir_size > files_to_display:
print("│ ....")
print(f"{FILENAME_TREE_LAST}{file_name}\n")
break
if idx <= FILES_TO_DISPLAY:
if idx <= files_to_display:
print(f"{FILENAME_TREE_MIDDLE}{file_name}")


def create_batches(
gcs_bucket_name: str,
gcs_prefix: str,
batch_size: Optional[int] = constants.BATCH_MAX_FILES,
batch_size: int = constants.BATCH_MAX_FILES,
) -> List[documentai.BatchDocumentsInputConfig]:
"""Create batches of documents in Cloud Storage to process with `batch_process_documents()`.
@@ -117,7 +215,7 @@
Required. The prefix of the json files in the `target_folder`
Format: `gs://bucket/optional_folder/target_folder/` where gcs_prefix=`optional_folder/target_folder`.
batch_size (Optional[int]):
batch_size (int):
Optional. Size of each batch of documents. Default is `50`.
Returns:
@@ -143,7 +241,7 @@
print(f"Skipping file {blob.name}. Invalid Mime Type {blob.content_type}.")
continue

if blob.size > constants.BATCH_MAX_FILE_SIZE:
if int(blob.size) > constants.BATCH_MAX_FILE_SIZE:
print(
f"Skipping file {blob.name}. File size must be less than {constants.BATCH_MAX_FILE_SIZE} bytes. File size is {blob.size} bytes."
)
Expand All @@ -159,7 +257,7 @@ def create_batches(

batch.append(
documentai.GcsDocument(
gcs_uri=f"gs://{gcs_bucket_name}/{blob.name}",
gcs_uri=create_gcs_uri(gcs_bucket_name, blob.name),
mime_type=blob.content_type,
)
)
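
A hedged end-to-end sketch of feeding these batches to a processor; the request types are the standard `google-cloud-documentai` client API, while the processor name and bucket values are placeholders:

```python
from google.cloud import documentai

client = documentai.DocumentProcessorServiceClient()
processor_name = "projects/my-project/locations/us/processors/abcdef1234567890"

batches = create_batches(gcs_bucket_name="my-bucket", gcs_prefix="invoices/")

for input_config in batches:
    request = documentai.BatchProcessRequest(
        name=processor_name,
        input_documents=input_config,
        document_output_config=documentai.DocumentOutputConfig(
            gcs_output_config=documentai.DocumentOutputConfig.GcsOutputConfig(
                gcs_uri="gs://my-bucket/output/"
            )
        ),
    )
    operation = client.batch_process_documents(request=request)
    operation.result()  # block until this batch finishes
```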
