<a href="https://colab.research.google.com/github/deltorobarba/machinelearning/blob/main/ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-documentai -q

In [None]:
from google.cloud import documentai_v1 as documentai

In [None]:
# --- Online (synchronous) OCR of one local PDF ---

# Processor to call. project_id, location and processor_id are also read by
# the batch-processing cell later in the notebook.
project_id = "qwiklabs-gcp-00-d19555d21c13"
location = "us"  # e.g., 'us' or 'eu'
processor_id = "dc95d7a2e35aa05a"
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

# Document AI client, constructed without explicit options.
client = documentai.DocumentProcessorServiceClient()

# Load the PDF bytes from the working directory.
with open("sample-online-ocr.pdf", "rb") as image:
    image_content = image.read()

# Wrap the bytes in a RawDocument, then build the request around it.
raw_document = documentai.RawDocument(content=image_content, mime_type="application/pdf")
request = documentai.ProcessRequest(name=name, raw_document=raw_document)

# Send the request; `document` is what the following cells print and save.
result = client.process_document(request=request)
document = result.document

print("Document processing complete.")

Document processing complete.


In [None]:
# Show the OCR text of `document` under a fixed heading line.
print("The document contains the following text:", document.text, sep="\n")

The document contains the following text:
CHAPTER I
IN WHICH We Are Introduced to
Winnie-the-Pooh and Some
Bees, and the Stories Begin
HERE is Edward Bear, coming
downstairs now, bump, bump, bump, on the back
of his head, behind Christopher Robin. It is, as far
as he knows, the only way of coming downstairs,
but sometimes he feels that there really is another
way, if only he could stop bumping for a moment
and think of it. And then he feels that perhaps there
isn't. Anyhow, here he is at the bottom, and ready
to be introduced to you. Winnie-the-Pooh.
When I first heard his name, I said, just as you
are going to say, “But I thought he was a boy?"
“So did I," said Christopher Robin.
"Then you can't call him Winnie?"
"I don't."
"But you said-
"
"He's Winnie-ther-Pooh. Don't you know what
'ther' means?"
I
Digitized by
Google
WINNIE-THE-POOH
“Ah, yes, now I do," I said quickly; and I hope
you do too, because it is all the explanation you are
going to get.
Sometimes Winnie-the-Pooh likes a g

In [None]:
!pip install google-cloud-storage -q

In [None]:
# Write the OCR text of `document`, prefixed with a heading line, to a local file.
text_to_save = f"The document contains the following text:\n{document.text}"

# Local output file name
filename = "cepf_online_ocr.txt"

# The OCR text contains non-ASCII characters (e.g. curly quotes), so the
# encoding is pinned instead of depending on the platform default.
with open(filename, "w", encoding="utf-8") as file:
    file.write(text_to_save)

print(f"Text saved to {filename}")

Text saved to cepf_online_ocr.txt


In [None]:
from google.cloud import storage

In [None]:
# --- Upload the saved online-OCR text file to Cloud Storage ---

bucket_name = "qwiklabs-gcp-00-d19555d21c13-cepf-documentai"  # target bucket
destination_blob_name = "cepf_online_ocr.txt"  # object name inside the bucket
source_file_name = "cepf_online_ocr.txt"  # local file to upload

# storage_client is created here and reused by later upload cells.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)

print(f"File {source_file_name} uploaded to gs://{bucket_name}/{destination_blob_name}")

File cepf_online_ocr.txt uploaded to gs://qwiklabs-gcp-00-d19555d21c13-cepf-documentai/cepf_online_ocr.txt


In [None]:
# --- Input / output locations for the batch job ---

# Input: handed to Document AI as a GCS URI *prefix*.
# NOTE(review): the value is the full URI of one PDF used as a prefix —
# presumably it is meant to select just that file; confirm.
gcs_input_uri = "gs://qwiklabs-gcp-00-d19555d21c13-cepf-documentai/sample-batch-ocr.pdf"  # Replace with your GCS URI
input_prefix = documentai.GcsPrefix(gcs_uri_prefix=gcs_input_uri)
input_config = documentai.BatchDocumentsInputConfig(gcs_prefix=input_prefix)

# Output: GCS location (here the bucket root) where processed documents are stored.
gcs_output_uri = "gs://qwiklabs-gcp-00-d19555d21c13-cepf-documentai/"
gcs_output_config = documentai.DocumentOutputConfig.GcsOutputConfig(gcs_uri=gcs_output_uri)


In [None]:
# --- Submit the batch job and wait for it ---

# project_id, location and processor_id are not assigned in this cell; they
# come from the online-OCR cell earlier in the notebook, which must run first.
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

client = documentai.DocumentProcessorServiceClient()

# Wrap the GCS output settings in the DocumentOutputConfig the request expects.
output_config = documentai.DocumentOutputConfig(gcs_output_config=gcs_output_config)

request = documentai.BatchProcessRequest(
    name=name,
    input_documents=input_config,
    document_output_config=output_config,
)

# Start the job, then wait for the operation to complete.
operation = client.batch_process_documents(request=request)
operation.result()

print("Batch processing complete.")

Batch processing complete.


In [None]:
# Print the text held in `document` under a fixed heading line.
# NOTE(review): the batch cell does not assign `document`, so this still appears
# to be the result of the earlier online process_document call rather than the
# batch output written to GCS — confirm which text is meant to be shown here.
print("The document contains the following text:", document.text, sep="\n")

The document contains the following text:
CHAPTER I
IN WHICH We Are Introduced to
Winnie-the-Pooh and Some
Bees, and the Stories Begin
HERE is Edward Bear, coming
downstairs now, bump, bump, bump, on the back
of his head, behind Christopher Robin. It is, as far
as he knows, the only way of coming downstairs,
but sometimes he feels that there really is another
way, if only he could stop bumping for a moment
and think of it. And then he feels that perhaps there
isn't. Anyhow, here he is at the bottom, and ready
to be introduced to you. Winnie-the-Pooh.
When I first heard his name, I said, just as you
are going to say, “But I thought he was a boy?"
“So did I," said Christopher Robin.
"Then you can't call him Winnie?"
"I don't."
"But you said-
"
"He's Winnie-ther-Pooh. Don't you know what
'ther' means?"
I
Digitized by
Google
WINNIE-THE-POOH
“Ah, yes, now I do," I said quickly; and I hope
you do too, because it is all the explanation you are
going to get.
Sometimes Winnie-the-Pooh likes a g

In [None]:
# save locally
import re

from google.cloud import documentai_v1 as documentai


def natural_sort_key(name):
    """Return a sort key that compares runs of digits in `name` numerically.

    With this key "doc-2.json" sorts before "doc-10.json", whereas plain string
    ordering puts "doc-10.json" first.
    """
    # re.split with a capturing group alternates text / digit-run / text / ...,
    # so every odd index holds a digit run.
    pieces = re.split(r"(\d+)", name)
    return [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]


def save_text_locally(operation, filename="cepf_batch_ocr.txt"):
    """Saves the extracted text from a batch processing operation to a local file.

    The JSON output shards listed under each individual process status are
    downloaded, parsed as Document AI documents, and their text is concatenated
    in numeric shard order before being written to `filename`.

    Args:
        operation: The batch processing operation; its `metadata` is read to
            check the final state and to find the GCS output locations.
        filename: Path of the local text file to write (UTF-8).

    Raises:
        ValueError: If the batch metadata state is not SUCCEEDED.
    """

    metadata = documentai.BatchProcessMetadata(operation.metadata)

    if metadata.state != documentai.BatchProcessMetadata.State.SUCCEEDED:
        raise ValueError(f"Batch Process Failed: {metadata.state_message}")

    # One client is enough for every output location.
    storage_client = storage.Client()
    text_parts = []

    print("Output files:")
    for process in metadata.individual_process_statuses:
        matches = re.match(r"gs://(.*?)/(.*)", process.output_gcs_destination)
        if not matches:
            print(
                "Could not parse output GCS destination:",
                process.output_gcs_destination,
            )
            continue

        output_bucket, output_prefix = matches.groups()

        # Sort shards numerically: in name order "...-10.json" comes before
        # "...-2.json", which would scramble the page order of the saved text.
        output_blobs = sorted(
            storage_client.list_blobs(output_bucket, prefix=output_prefix),
            key=lambda blob: natural_sort_key(blob.name),
        )

        for blob in output_blobs:
            if blob.content_type != "application/json":
                print(
                    f"Skipping non-supported file: {blob.name} - Mimetype: {blob.content_type}"
                )
                continue

            print(f"Fetching {blob.name}")
            shard_document = documentai.Document.from_json(
                blob.download_as_bytes(), ignore_unknown_fields=True
            )

            text_parts.append(shard_document.text)

    # OCR text can contain non-ASCII characters, so the encoding is explicit.
    with open(filename, "w", encoding="utf-8") as f:
        f.write("".join(text_parts))

    print(f"Text saved to {filename}")


# Write the OCR text of the finished batch `operation` to the local file
# (requires the `operation` object created by the batch-processing cell).
save_text_locally(operation, filename="cepf_batch_ocr.txt")

Output files:
Fetching 145357395788435402/0/sample-batch-ocr-0.json
Fetching 145357395788435402/0/sample-batch-ocr-1.json
Fetching 145357395788435402/0/sample-batch-ocr-10.json
Fetching 145357395788435402/0/sample-batch-ocr-11.json
Fetching 145357395788435402/0/sample-batch-ocr-12.json
Fetching 145357395788435402/0/sample-batch-ocr-13.json
Fetching 145357395788435402/0/sample-batch-ocr-14.json
Fetching 145357395788435402/0/sample-batch-ocr-15.json
Fetching 145357395788435402/0/sample-batch-ocr-16.json
Fetching 145357395788435402/0/sample-batch-ocr-17.json
Fetching 145357395788435402/0/sample-batch-ocr-18.json
Fetching 145357395788435402/0/sample-batch-ocr-2.json
Fetching 145357395788435402/0/sample-batch-ocr-3.json
Fetching 145357395788435402/0/sample-batch-ocr-4.json
Fetching 145357395788435402/0/sample-batch-ocr-5.json
Fetching 145357395788435402/0/sample-batch-ocr-6.json
Fetching 145357395788435402/0/sample-batch-ocr-7.json
Fetching 145357395788435402/0/sample-batch-ocr-8.json
Fetch

In [None]:
# --- Upload the saved batch-OCR text file to Cloud Storage ---

bucket_name = "qwiklabs-gcp-00-d19555d21c13-cepf-documentai"  # target bucket
destination_blob_name = "cepf_batch_ocr.txt"  # object name inside the bucket
source_file_name = "cepf_batch_ocr.txt"  # local file to upload

# storage_client is not created in this cell; it is the client from the
# earlier upload cell.
bucket = storage_client.bucket(bucket_name)

blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(source_file_name)

print(f"File {source_file_name} uploaded to gs://{bucket_name}/{destination_blob_name}")

File cepf_batch_ocr.txt uploaded to gs://qwiklabs-gcp-00-d19555d21c13-cepf-documentai/cepf_batch_ocr.txt


In [None]:
# Save the text held in `document` (with a heading line) under the batch file
# name, then upload that file to Cloud Storage.
# NOTE(review): neither the batch job nor save_text_locally() assigns the global
# `document`; it still appears to be the online-processing result, so this cell
# replaces the batch text previously written to cepf_batch_ocr.txt — confirm
# that this is intended.
text_to_save = f"The document contains the following text:\n{document.text}"

# Local output file name
filename = "cepf_batch_ocr.txt"

# The OCR text contains non-ASCII characters (e.g. curly quotes), so the
# encoding is pinned instead of depending on the platform default.
with open(filename, "w", encoding="utf-8") as file:
    file.write(text_to_save)

print(f"Text saved to {filename}")

# bucket_name and storage_client are reused from the earlier upload cells.

# Object name in Cloud Storage
destination_blob_name = "cepf_batch_ocr.txt"

# Local file to upload
source_file_name = "cepf_batch_ocr.txt"

# Upload the file
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)

blob.upload_from_filename(source_file_name)

print(f"File {source_file_name} uploaded to gs://{bucket_name}/{destination_blob_name}")

Text saved to cepf_batch_ocr.txt
File cepf_batch_ocr.txt uploaded to gs://qwiklabs-gcp-00-d19555d21c13-cepf-documentai/cepf_batch_ocr.txt


In [None]:
# --- Online processing of the intake form with a second processor ---

# Same project and location as before, different processor id.
project_id = "qwiklabs-gcp-00-d19555d21c13"
location = "us"  # e.g., 'us' or 'eu'
processor_id = "c52b9af951178250"
name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

client = documentai.DocumentProcessorServiceClient()

# Load the PDF bytes from the working directory.
with open("sample-intake-form.pdf", "rb") as image:
    image_content = image.read()

# Wrap the bytes in a RawDocument, then build the request around it.
raw_document = documentai.RawDocument(content=image_content, mime_type="application/pdf")
request = documentai.ProcessRequest(name=name, raw_document=raw_document)

# Send the request and keep the returned document.
result = client.process_document(request=request)
document = result.document

print("Document processing complete.")

In [None]:
import pandas as pd
from google.cloud import documentai_v1 as documentai


def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Processes a document using the Document AI Online Processing API.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint
            "{location}-documentai.googleapis.com".
        processor_id: ID of the processor to call. It must already exist
            (create it in the Cloud Console first).
        file_path: Path of the local file to send.
        mime_type: MIME type of that file, e.g. "application/pdf".

    Returns:
        The `document` field of the process response.
    """

    # Address the endpoint that matches the processor's location.
    opts = {"api_endpoint": f"{location}-documentai.googleapis.com"}

    # Instantiates a client
    documentai_client = documentai.DocumentProcessorServiceClient(client_options=opts)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    resource_name = documentai_client.processor_path(project_id, location, processor_id)

    # Read the file into memory. Only the read needs the handle, so the file
    # is closed before the network request is made.
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load Binary Data into Document AI RawDocument Object
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)

    # Configure the process request
    request = documentai.ProcessRequest(name=resource_name, raw_document=raw_document)

    # Use the Document AI client to process the file
    result = documentai_client.process_document(request=request)

    return result.document


def trim_text(text: str) -> str:
    """
    Remove extra space characters from text (blank, newline, tab, etc.)

    Leading and trailing whitespace is dropped and every internal run of
    whitespace is collapsed to a single space, so the result is one line.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text; an empty string if `text` is empty or only whitespace.
    """
    # split() without a separator splits on any whitespace run and discards
    # empty pieces, so tabs and repeated blanks are handled as well as newlines.
    return " ".join(text.split())


# --- Run the form processor and tabulate the extracted fields ---

PROJECT_ID = "qwiklabs-gcp-00-d19555d21c13"
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = "c52b9af951178250"  # Create processor in Cloud Console

# Local file in the current working directory and its MIME type.
# Supported file types: https://cloud.google.com/document-ai/docs/processors-list
FILE_PATH = "sample-intake-form.pdf"
MIME_TYPE = "application/pdf"

document = online_process(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    file_path=FILE_PATH,
    mime_type=MIME_TYPE,
)

# Flatten the form fields of every page into one list, in page order.
form_fields = [field for page in document.pages for field in page.form_fields]

# Field names / values with surrounding whitespace cleaned up, plus the
# confidence reported for each.
names = [trim_text(field.field_name.text_anchor.content) for field in form_fields]
name_confidence = [field.field_name.confidence for field in form_fields]
values = [trim_text(field.field_value.text_anchor.content) for field in form_fields]
value_confidence = [field.field_value.confidence for field in form_fields]

# One row per form field, printed in tabular form.
df = pd.DataFrame(
    {
        "Field Name": names,
        "Field Name Confidence": name_confidence,
        "Field Value": values,
        "Field Value Confidence": value_confidence,
    }
)

print(df)

                                           Field Name  Field Name Confidence  \
0   Are you currently taking any medication? (If y...               0.943337   
1                                           _Phone #:               0.932336   
2                                                Zip:               0.914201   
3                                               City:               0.900499   
4                                              State:               0.893907   
5                                                DOB:               0.885175   
6                                             Gender:               0.882370   
7                                               Name:               0.872789   
8                                     Marital Status:               0.852380   
9   Describe your medical concerns (symptoms, diag...               0.843905   
10                                              Date:               0.829963   
11                                      

In [None]:
# --- Upload the form-parser table to Cloud Storage as CSV ---

bucket_name = "qwiklabs-gcp-00-d19555d21c13-cepf-documentai"  # target bucket
destination_blob_name = "cepf_form_parser.csv"  # object name inside the bucket

# Serialise the DataFrame to a CSV string; index=False leaves the index out.
csv_string = df.to_csv(index=False)

# Create a storage client and address the destination object.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)

# Upload straight from memory, with no intermediate local file.
blob.upload_from_string(csv_string, content_type="text/csv")

print(f"DataFrame saved as gs://{bucket_name}/{destination_blob_name}")

DataFrame saved as gs://qwiklabs-gcp-00-d19555d21c13-cepf-documentai/cepf_form_parser.csv
