Skip to content

Commit

Permalink
chore: changed get_storage_client to add module name for analytics (#103)
Browse files Browse the repository at this point in the history

* chore: edit get_storage_client to add module name

* added module name to get_bytes

* fixed failing test
  • Loading branch information
galz10 committed Apr 13, 2023
1 parent ebf1c6c commit 60e1999
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ def _get_bytes(
annotation_file_prefix: str,
config_file_prefix: str,
config_path: str = None,
storage_client: storage.Client = None,
) -> List[bytes]:
r"""Downloads documents and returns them as bytes.
Expand All @@ -233,8 +234,9 @@ def _get_bytes(
List[bytes].
"""
if not storage_client:
storage_client = gcs_utilities._get_storage_client(module="get-bytes")

storage_client = gcs_utilities._get_storage_client()
bucket = storage_client.bucket(bucket_name=bucket_name)
blobs = storage_client.list_blobs(bucket_or_name=bucket_name, prefix=prefix)

Expand Down Expand Up @@ -273,6 +275,7 @@ def _upload_file(
bucket_name: str,
output_prefix: str,
file: str,
storage_client: storage.Client = None,
) -> None:
r"""Uploads the converted docproto to gcs.
Expand All @@ -288,7 +291,9 @@ def _upload_file(
None.
"""
storage_client = gcs_utilities._get_storage_client()
if not storage_client:
storage_client = gcs_utilities._get_storage_client(module="upload-file")

bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(output_prefix)

Expand All @@ -301,6 +306,7 @@ def _get_files(
input_bucket: str,
input_prefix: str,
config_path: str = None,
storage_client: storage.Client = None,
):
r"""Returns a list of Futures of documents as bytes.
Expand Down Expand Up @@ -340,6 +346,7 @@ def _get_files(
"annotation",
"config",
config_path,
storage_client,
)
downloads.append(download)

Expand Down Expand Up @@ -399,7 +406,9 @@ def _get_docproto_files(
return files, unique_types, did_not_convert


def _upload(files: dict, gcs_output_path: str) -> None:
def _upload(
files: dict, gcs_output_path: str, storage_client: storage.Client = None
) -> None:
r"""Upload converted document.proto to gcs location.
Args:
Expand Down Expand Up @@ -440,6 +449,7 @@ def _upload(files: dict, gcs_output_path: str) -> None:
output_bucket,
f"{output_prefix}/{key}.json",
files[key],
storage_client,
)
uploads.append(upload)

Expand Down Expand Up @@ -495,7 +505,7 @@ def _convert_documents_with_config(
if file_check:
raise ValueError("gcs_prefix cannot contain file types")

storage_client = gcs_utilities._get_storage_client()
storage_client = gcs_utilities._get_storage_client(module="config-converter")

blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix)

Expand All @@ -504,6 +514,7 @@ def _convert_documents_with_config(
input_prefix=input_prefix,
input_bucket=input_bucket,
config_path=config_path,
storage_client=storage_client,
)

f, _ = futures.wait(downloads)
Expand All @@ -525,7 +536,7 @@ def _convert_documents_with_config(
print(f"Did not convert {len(did_not_convert)} documents")
print(did_not_convert)

_upload(files, gcs_output_path)
_upload(files, gcs_output_path, storage_client)

print("-------- Finished Uploading --------")
print("-------- Schema Information --------")
Expand Down
20 changes: 16 additions & 4 deletions google/cloud/documentai_toolbox/utilities/gcs_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,25 @@
from google.cloud.documentai_toolbox import constants


def _get_storage_client():
def _get_storage_client(module: str = None):
r"""Returns a Storage client with custom user agent header.
Returns:
storage.Client.
"""

if module:
user_agent = (
f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}-{module}"
)

info = client_info.ClientInfo(
client_library_version=f"{documentai_toolbox.__version__}-{module}",
user_agent=user_agent,
)
return storage.Client(client_info=info)

user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}"

info = client_info.ClientInfo(
Expand Down Expand Up @@ -62,7 +74,7 @@ def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
"""
result = []

storage_client = _get_storage_client()
storage_client = _get_storage_client(module="get-bytes")
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)

for blob in blob_list:
Expand Down Expand Up @@ -143,7 +155,7 @@ def list_gcs_document_tree(
if file_check is not None:
raise ValueError("gcs_prefix cannot contain file types")

storage_client = _get_storage_client()
storage_client = _get_storage_client(module="list-document")
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)

path_list: Dict[str, List[str]] = {}
Expand Down Expand Up @@ -227,7 +239,7 @@ def create_batches(
f"Batch size must be less than {constants.BATCH_MAX_FILES}. You provided {batch_size}."
)

storage_client = _get_storage_client()
storage_client = _get_storage_client(module="create-batches")
blob_list = storage_client.list_blobs(gcs_bucket_name, prefix=gcs_prefix)
batches: List[documentai.BatchDocumentsInputConfig] = []
batch: List[documentai.GcsDocument] = []
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_converter_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def test_upload(mock_upload_file):
files["document_1"] = "Document"
converter_helpers._upload(files, gcs_output_path="gs://output/")

mock_upload_file.assert_called_with("output", "/document_1.json", "Document")
mock_upload_file.assert_called_with("output", "/document_1.json", "Document", None)


def test_upload_with_format_error():
Expand Down

0 comments on commit 60e1999

Please sign in to comment.