Skip to content

Commit

Permalink
Merge branch 'analytic-changes' of https://github.com/galz10/python-documentai-toolbox into analytic-changes
Browse files (browse the repository at this point in the history)
  • Loading branch information
galz10 committed Apr 26, 2023
2 parents 9d264a3 + f9be4ee commit 61e5ed8
Show file tree
Hide file tree
Showing 11 changed files with 304 additions and 53 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Changelog

## [0.6.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.5.0-alpha...v0.6.0-alpha) (2023-04-17)


### Features

* Add blocks to PageWrapper ([#107](https://github.com/googleapis/python-documentai-toolbox/issues/107)) ([df7dfe7](https://github.com/googleapis/python-documentai-toolbox/commit/df7dfe7b79d39010d5addb3fa861a9c803caae45))
* Added `form_fields_to_bigquery()` method ([#104](https://github.com/googleapis/python-documentai-toolbox/issues/104)) ([96abe22](https://github.com/googleapis/python-documentai-toolbox/commit/96abe220c9909bcc5642ea146c06fd082a2f8009))

## [0.5.0-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.4.1-alpha...v0.5.0-alpha) (2023-04-07)


Expand Down
33 changes: 16 additions & 17 deletions google/cloud/documentai_toolbox/utilities/gcs_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,33 +26,32 @@
from google.cloud.documentai_toolbox import constants


def _get_storage_client(module: str = None):
r"""Returns a Storage client with custom user agent header.
def _get_client_info(module: str = None) -> client_info.ClientInfo:
r"""Returns a custom user agent header.
Returns:
storage.Client.
client_info.ClientInfo.
"""
client_library_version = documentai_toolbox.__version__

if module:
user_agent = (
f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}-{module}"
)
client_library_version = f"{client_library_version}-{module}"

info = client_info.ClientInfo(
client_library_version=f"{documentai_toolbox.__version__}-{module}",
user_agent=user_agent,
)
return storage.Client(client_info=info)
return client_info.ClientInfo(
client_library_version=client_library_version,
user_agent=f"{constants.USER_AGENT_PRODUCT}/{client_library_version}",
)

user_agent = f"{constants.USER_AGENT_PRODUCT}/{documentai_toolbox.__version__}"

info = client_info.ClientInfo(
client_library_version=documentai_toolbox.__version__,
user_agent=user_agent,
)
def _get_storage_client(module: str = None) -> storage.Client:
r"""Returns a Storage client with custom user agent header.
return storage.Client(client_info=info)
Returns:
storage.Client.
"""
return storage.Client(client_info=_get_client_info(module))


def get_bytes(gcs_bucket_name: str, gcs_prefix: str) -> List[bytes]:
Expand Down
2 changes: 1 addition & 1 deletion google/cloud/documentai_toolbox/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
__version__ = "0.5.0-alpha"
__version__ = "0.6.0-alpha"
182 changes: 152 additions & 30 deletions google/cloud/documentai_toolbox/wrappers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,106 @@ def _get_batch_process_metadata(
return metadata


def _insert_into_dictionary_with_list(dic: Dict, key: str, value: str) -> Dict:
r"""Inserts value into a dictionary that can contain lists.
Args:
dic (Dict):
Required. The dictionary to insert into.
key (str):
Required. The key to be created or inserted into.
value (str):
Required. The value to be inserted.
Returns:
Dict:
The dictionary after adding the key value pair.
"""
existing_value = dic.get(key)

if existing_value:
# For duplicate keys,
# Change Type to a List if not already
if not isinstance(existing_value, list):
existing_value = [existing_value]

existing_value.append(value)
dic[key] = existing_value
else:
dic[key] = value

return dic


def _bigquery_column_name(input_string: str) -> str:
r"""Converts a string into a BigQuery column name.
https://cloud.google.com/bigquery/docs/schemas#column_names
Args:
input_string (str):
Required: The string to convert.
Returns:
str
The converted string.
"""
char_map: Dict[str, str] = {
r":|;|\(|\)|\[|\]|,|\.|\?|\!|\'|\n": "",
r"/| ": "_",
r"#": "num",
r"@": "at",
}

for key, value in char_map.items():
input_string = re.sub(key, value, input_string)

return input_string.lower()


def _dict_to_bigquery(
    dic: Dict,
    dataset_name: str,
    table_name: str,
    project_id: Optional[str],
) -> bigquery.job.LoadJob:
    r"""Loads dictionary to a BigQuery table.

    The dictionary is submitted as a single newline-delimited JSON row. The
    load job is configured so that new fields may be added to the table
    schema and existing fields may be relaxed.

    Args:
        dic (Dict):
            Required: The dictionary to insert.
        dataset_name (str):
            Required. Name of the BigQuery dataset.
        table_name (str):
            Required. Name of the BigQuery table.
        project_id (Optional[str]):
            Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.

    Returns:
        bigquery.job.LoadJob:
            The BigQuery LoadJob for adding the dictionary.
    """
    bq_client = bigquery.Client(
        project=project_id, client_info=gcs_utilities._get_client_info()
    )

    # `DatasetReference` requires a string project, so use the project the
    # client resolved (the explicit `project_id`, or the environment default
    # when `project_id` is None) rather than the raw, possibly-None argument.
    table_ref = bigquery.DatasetReference(
        project=bq_client.project, dataset_id=dataset_name
    ).table(table_name)

    job_config = bigquery.LoadJobConfig(
        schema_update_options=[
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
            bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
        ],
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    )

    return bq_client.load_table_from_json(
        json_rows=[dic],
        destination=table_ref,
        job_config=job_config,
    )


@dataclasses.dataclass
class Document:
r"""Represents a wrapped `Document`.
Expand Down Expand Up @@ -478,6 +578,49 @@ def get_form_field_by_name(self, target_field: str) -> List[FormField]:

return found_fields

def form_fields_to_dict(self) -> Dict:
    r"""Returns Dictionary of form fields in document.

    Form fields are collected from every page. Each field name is converted
    to a BigQuery column name and used as the key; a name that occurs more
    than once has its values gathered into a list.

    Returns:
        Dict:
            The Dict of the form field values indexed by field name.
    """
    collected: Dict = {}
    every_form_field = (
        field for page in self.pages for field in page.form_fields
    )

    for field in every_form_field:
        collected = _insert_into_dictionary_with_list(
            collected,
            _bigquery_column_name(field.field_name),
            field.field_value,
        )

    return collected

def form_fields_to_bigquery(
    self, dataset_name: str, table_name: str, project_id: Optional[str] = None
) -> bigquery.job.LoadJob:
    r"""Adds extracted form fields to a BigQuery table.

    The form fields are first flattened with `form_fields_to_dict()` and the
    resulting dictionary is handed to `_dict_to_bigquery()`.

    Args:
        dataset_name (str):
            Required. Name of the BigQuery dataset.
        table_name (str):
            Required. Name of the BigQuery table.
        project_id (Optional[str]):
            Optional. Project ID containing the BigQuery table. If not passed, falls back to the default inferred from the environment.

    Returns:
        bigquery.job.LoadJob:
            The BigQuery LoadJob for adding the form fields.
    """
    form_fields = self.form_fields_to_dict()
    return _dict_to_bigquery(
        dic=form_fields,
        dataset_name=dataset_name,
        table_name=table_name,
        project_id=project_id,
    )

def get_entity_by_type(self, target_type: str) -> List[Entity]:
r"""Returns the list of Entities of target_type.
Expand All @@ -502,20 +645,10 @@ def entities_to_dict(self) -> Dict:
"""
entities_dict: Dict = {}
for entity in self.entities:
entity_type = entity.type_.replace("/", "_")

existing_entity = entities_dict.get(entity_type)
if not existing_entity:
entities_dict[entity_type] = entity.mention_text
continue

# For entities that can have multiple (e.g. line_item)
# Change Entity Type to a List
if not isinstance(existing_entity, list):
existing_entity = [existing_entity]

existing_entity.append(entity.mention_text)
entities_dict[entity_type] = existing_entity
entity_type = _bigquery_column_name(entity.type_)
entities_dict = _insert_into_dictionary_with_list(
entities_dict, entity_type, entity.mention_text
)

return entities_dict

Expand All @@ -536,23 +669,12 @@ def entities_to_bigquery(
The BigQuery LoadJob for adding the entities.
"""
bq_client = bigquery.Client(project=project_id)
table_ref = bigquery.DatasetReference(
project=project_id, dataset_id=dataset_name
).table(table_name)

job_config = bigquery.LoadJobConfig(
schema_update_options=[
bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION,
bigquery.SchemaUpdateOption.ALLOW_FIELD_RELAXATION,
],
source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
)

return bq_client.load_table_from_json(
json_rows=[self.entities_to_dict()],
destination=table_ref,
job_config=job_config,
return _dict_to_bigquery(
self.entities_to_dict(),
dataset_name,
table_name,
project_id,
)

def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
Expand Down
47 changes: 47 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,21 @@ def _table_wrapper_from_documentai_table(
)


@dataclasses.dataclass
class Block:
    """Represents a wrapped documentai.Document.Page.Block.

    Pairs the original Document AI block with the text extracted for it, so
    callers can read the text without resolving the layout themselves.

    Attributes:
        documentai_block (google.cloud.documentai.Document.Page.Block):
            Required. The original google.cloud.documentai.Document.Page.Block object.
        text (str):
            Required. UTF-8 encoded text.
    """

    # The unmodified Document AI block this wrapper was built from.
    documentai_block: documentai.Document.Page.Block
    # Text of the block; `_get_blocks` fills this from the block's layout.
    text: str


@dataclasses.dataclass
class Paragraph:
"""Represents a wrapped documentai.Document.Page.Paragraph.
Expand Down Expand Up @@ -191,6 +206,32 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
return result_text


def _get_blocks(blocks: List[documentai.Document.Page.Block], text: str) -> List[Block]:
    r"""Returns a list of Block.

    Each Document AI block is wrapped together with the text that
    `_text_from_layout` extracts for the block's layout.

    Args:
        blocks (List[documentai.Document.Page.Block]):
            Required. A list of documentai.Document.Page.Block objects.
        text (str):
            Required. UTF-8 encoded text in reading order
            from the document.

    Returns:
        List[Block]:
            A list of Blocks, in the same order as the input.
    """
    return [
        Block(
            documentai_block=documentai_block,
            text=_text_from_layout(layout=documentai_block.layout, text=text),
        )
        for documentai_block in blocks
    ]


def _get_paragraphs(
paragraphs: List[documentai.Document.Page.Paragraph], text: str
) -> List[Paragraph]:
Expand Down Expand Up @@ -339,6 +380,10 @@ class Page:
Required. A list of visually detected text paragraphs
on the page. A collection of lines that a human
would perceive as a paragraph.
blocks (List[Block]):
Required. A list of visually detected text blocks
on the page. A collection of lines that a human
would perceive as a block.
tables (List[Table]):
Required. A list of visually detected tables on the
page.
Expand All @@ -350,6 +395,7 @@ class Page:
form_fields: List[FormField] = dataclasses.field(init=False, repr=False)
lines: List[Line] = dataclasses.field(init=False, repr=False)
paragraphs: List[Paragraph] = dataclasses.field(init=False, repr=False)
blocks: List[Block] = dataclasses.field(init=False, repr=False)
tables: List[Table] = dataclasses.field(init=False, repr=False)

def __post_init__(self):
Expand All @@ -369,4 +415,5 @@ def __post_init__(self):
self.paragraphs = _get_paragraphs(
paragraphs=self.documentai_page.paragraphs, text=self.text
)
self.blocks = _get_blocks(blocks=self.documentai_page.blocks, text=self.text)
self.tables = tables
5 changes: 5 additions & 0 deletions samples/snippets/entities_to_bigquery_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ def entities_to_bigquery_sample(
dataset_name=dataset_name, table_name=table_name, project_id=project_id
)

# Also supported:
# job = wrapped_document.form_fields_to_bigquery(
# dataset_name=dataset_name, table_name=table_name, project_id=project_id
# )

print("Document entities loaded into BigQuery")
print(f"Job ID: {job.job_id}")
print(f"Table: {job.destination.path}")
Expand Down
2 changes: 2 additions & 0 deletions samples/snippets/quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:

for idx, page in enumerate(wrapped_document.pages):
print(f"Page {idx}")
for block in page.blocks:
print(block.text)
for paragraph in page.paragraphs:
print(paragraph.text)

Expand Down
6 changes: 3 additions & 3 deletions samples/snippets/requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
pytest==7.3.0
mock==5.0.1
google-cloud-bigquery==3.9.0
pytest==7.3.1
mock==5.0.2
google-cloud-bigquery==3.10.0
2 changes: 1 addition & 1 deletion samples/snippets/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
google-cloud-bigquery==3.9.0
google-cloud-bigquery==3.10.0
google-cloud-documentai==2.15.0
google-cloud-storage==2.8.0
google-cloud-documentai-toolbox==0.4.1a0

0 comments on commit 61e5ed8

Please sign in to comment.