In [None]:
import json
import re
from google.cloud import vision
from google.cloud import storage

In [None]:
# MIME type of the source document; passed to vision.InputConfig further down.
mime_type = "application/pdf"

In [None]:
# Number of pages grouped into each JSON output file
# (passed to vision.OutputConfig as batch_size).
batch_size = 1

In [None]:
# Vision API client; used later to submit the asynchronous file annotation request.
client = vision.ImageAnnotatorClient()

In [None]:
# GCS URI of the PDF to annotate.
gcs_source_uri = "gs://idl-dsos-transcript/pdf_files/lfbp0045.pdf"
# GCS URI handed to vision.GcsDestination; the same URI is parsed later
# (bucket + prefix) to list the JSON output files.
gcs_destination_uri = "gs://idl-dsos-transcript/json_files/"

In [None]:
# Wrap the source and destination URIs in the Vision API location types.
gcs_source = vision.GcsSource(uri=gcs_source_uri)
gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)

# The single feature requested for the file: document text detection.
detection_type = vision.Feature.Type.DOCUMENT_TEXT_DETECTION
feature = vision.Feature(type_=detection_type)

In [None]:
# Input description: the GCS source object together with its MIME type.
input_config = vision.InputConfig(
    gcs_source=gcs_source,
    mime_type=mime_type,
)

In [None]:
# Output description: the GCS destination and how many pages go into
# each JSON output file.
output_config = vision.OutputConfig(
    gcs_destination=gcs_destination,
    batch_size=batch_size,
)

In [None]:
# Bundle the requested feature with the input and output configuration
# into one file-annotation request.
async_request = vision.AsyncAnnotateFileRequest(
    features=[feature],
    input_config=input_config,
    output_config=output_config,
)

# Submit the request (a batch of one); the call hands back an operation
# object whose result is awaited below.
operation = client.async_batch_annotate_files(requests=[async_request])

print("Waiting for the operation to finish.")
# Wait for completion, passing a 480-second timeout. Kept as the cell's last
# expression so the notebook still displays the returned value.
operation.result(timeout=480)

In [None]:
# Once the request has completed and the output has been written to GCS,
# resolve the bucket and object prefix that hold the output files.
storage_client = storage.Client()

# Split "gs://<bucket>/<prefix>" into its two parts. The prefix is optional so
# that a bucket-root destination ("gs://bucket" or "gs://bucket/") yields an
# empty prefix instead of a failed match.
match = re.match(r"gs://([^/]+)/?(.*)", gcs_destination_uri)
if match is None:
    # Fail with a readable message rather than an AttributeError on None.
    raise ValueError(
        "gcs_destination_uri is not a valid gs:// URI: {!r}".format(
            gcs_destination_uri
        )
    )
bucket_name = match.group(1)
prefix = match.group(2)

bucket = storage_client.get_bucket(bucket_name)

In [None]:
def _page_order_key(blob_name):
    """Return a sort key that orders Vision output files by starting page.

    Output objects are named like ``output-<start>-to-<end>.json``. Sorting
    those names as plain strings puts ``output-10-to-10.json`` before
    ``output-2-to-2.json``, so the starting page is compared as an integer.
    Names that do not follow the pattern are placed after all matching names
    and ordered alphabetically among themselves.
    """
    page_range = re.search(r"output-(\d+)-to-(\d+)\.json$", blob_name)
    if page_range is None:
        return (1, 0, blob_name)
    return (0, int(page_range.group(1)), blob_name)


# List every object under the destination prefix, skipping "directory"
# placeholder objects (names ending in "/"), and order them by page number.
# Note: everything under the prefix is listed, so objects that were already
# stored there before this run are included as well.
blob_list = sorted(
    (blob for blob in bucket.list_blobs(prefix=prefix) if not blob.name.endswith("/")),
    key=lambda blob: _page_order_key(blob.name),
)
print("Output files:")
for blob in blob_list:
    print(blob.name)

In [None]:
# Take the first listed output file, download it and parse its JSON payload.
output = blob_list[0]
raw_bytes = output.download_as_bytes()
json_string = raw_bytes.decode("utf-8")
response = json.loads(json_string)

# First per-page response stored in that file, and its full-text annotation.
responses = response["responses"]
first_page_response = responses[0]
annotation = first_page_response["fullTextAnnotation"]

In [None]:
# Print the recognised text of every page response in every output file.
for output in blob_list:
    json_string = output.download_as_bytes().decode("utf-8")
    response = json.loads(json_string)

    for r in response["responses"]:
        # A page response may carry no "fullTextAnnotation" (e.g. a page on
        # which no text was detected); print an empty string for it instead
        # of aborting the whole dump with a KeyError.
        print(r.get("fullTextAnnotation", {}).get("text", ""))