In [None]:
import PyPDF2
import os

import base64
from google.cloud import vision
from google.cloud import storage
import io
import json
import re
import pandas as pd

In [None]:
# Count the combined number of pages of all PDFs in a local directory.
# NOTE(review): PdfFileReader / numPages are the legacy PyPDF2 names; newer
# releases expose PdfReader / len(reader.pages) — confirm the installed version.
directory = 'test_files/'
total_pages = 0

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # Skip sub-directories and anything else that is not a regular file.
    if not os.path.isfile(f):
        continue
    try:
        # Open the very path that was listed (joined with `directory`) and
        # let the context manager close the handle once the count is read.
        with open(f, 'rb') as file:
            readpdf = PyPDF2.PdfFileReader(file)
            total_pages += readpdf.numPages
    except Exception as e:
        # Best effort: report the unreadable file and why, then keep counting.
        print(f'{filename}: {e}')
print(total_pages)

In [None]:
# Expose the credentials file to the Google Cloud client libraries through
# the GOOGLE_APPLICATION_CREDENTIALS environment variable.
GOOGLE_CREDENTIALS_PATH = "credentials/google/gt-vip-ocr-aca80f32ce4e.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDENTIALS_PATH

In [None]:
# https://cloud.google.com/vision/docs/samples/vision-text-detection-pdf-gcs#vision_text_detection_pdf_gcs-python
def async_detect_document(gcs_source_uri, gcs_destination_uri,
                          batch_size=100, timeout=420):
    """Run asynchronous Cloud Vision OCR on a PDF stored in GCS.

    Submits a DOCUMENT_TEXT_DETECTION request for the source file, blocks
    until the operation has finished, and then prints the names of the
    output objects found under the destination prefix.

    Args:
        gcs_source_uri: ``gs://`` URI of the input file. It is submitted with
            MIME type ``application/pdf``.
        gcs_destination_uri: ``gs://<bucket>/<prefix>`` URI under which the
            JSON output files are written.
        batch_size: How many pages are grouped into each JSON output file.
        timeout: Seconds to wait for the OCR operation to finish.

    Returns:
        None. The output object names are only printed.

    Raises:
        ValueError: If ``gcs_destination_uri`` does not look like
            ``gs://<bucket>/<prefix>``.
    """
    import re
    from google.cloud import vision
    from google.cloud import storage

    # Parse the destination first so that a malformed URI fails immediately
    # instead of after the OCR operation has already been run.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://<bucket>/<prefix>, got: '
            f'{gcs_destination_uri!r}')
    bucket_name, prefix = match.groups()

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=timeout)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, filtering out folders.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

In [None]:
# https://towardsdatascience.com/how-to-extract-the-text-from-pdfs-using-python-and-the-google-cloud-vision-api-7a0a798adc13
def write_to_text(gcs_destination_uri):
    """Concatenate the OCR text of every page stored under a GCS prefix.

    Lists the objects below ``gcs_destination_uri``, parses each one as JSON
    and joins the ``fullTextAnnotation.text`` of every entry in its
    ``responses`` list, in listing order. Despite the name, nothing is
    written; the text is returned.

    Args:
        gcs_destination_uri: ``gs://<bucket>/<prefix>`` URI whose objects are
            read.

    Returns:
        str: The concatenated text of all pages that have an annotation;
        an empty string when there are none.

    Raises:
        ValueError: If ``gcs_destination_uri`` does not look like
            ``gs://<bucket>/<prefix>``.
    """
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://<bucket>/<prefix>, got: '
            f'{gcs_destination_uri!r}')
    bucket_name, prefix = match.groups()

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    # List objects with the given prefix, filtering out folder placeholders
    # (names ending in '/'), which carry no JSON payload to parse.
    blob_list = [blob for blob in bucket.list_blobs(prefix=prefix)
                 if not blob.name.endswith('/')]
    print('Output files:')
    for blob in blob_list:
        print(blob.name)

    page_texts = []
    for blob in blob_list:
        response = json.loads(blob.download_as_string())

        # Each output file holds one response per page of its batch.
        for page_response in response['responses']:
            annotation = page_response.get('fullTextAnnotation')
            if annotation is None:
                # A page without text has no annotation; skip it so that the
                # previous page's text is not appended a second time.
                print("No annotation for this page.")
                continue
            page_texts.append(annotation['text'])

    return ''.join(page_texts)


In [None]:
# https://stackoverflow.com/questions/68740510/python-read-all-files-as-gcs-uri-in-google-cloud-storage
def get_gcs_bucket_contents(bucket_name, directory, extension):
    """Return the names of bucket objects matching a directory and extension.

    Args:
        bucket_name: Name of the GCS bucket to list.
        directory: Text that must occur somewhere in the object name.
        extension: Suffix the object name must end with (e.g. ``'.pdf'``).
            Matching on the suffix keeps derived objects such as
            ``x.pdf.json`` out of the result.

    Returns:
        list[str]: Matching object names, in listing order.
    """
    client = storage.Client()
    return [
        blob.name
        for blob in client.list_blobs(bucket_name)
        if directory in blob.name and blob.name.endswith(extension)
    ]

In [None]:
# Collect the object names in the archive bucket that contain 'input_dir'
# and '.pdf', then display them as the cell output.
bucket_list = get_gcs_bucket_contents(
    bucket_name='ivanallenarchive',
    directory='input_dir',
    extension='.pdf',
)
bucket_list

In [None]:
# Resume from previously saved results when the parquet file exists;
# otherwise start from an empty frame with the expected columns.
# The explicit existence check (instead of a blanket ``except Exception``)
# lets a corrupt file or a missing parquet engine surface as an error rather
# than silently replacing earlier results with an empty frame.
ocr_compare_path = 'ocr_compare_df.parquet.gzip'
if os.path.exists(ocr_compare_path):
    ocr_compare_df = pd.read_parquet(ocr_compare_path, engine='pyarrow')
else:
    ocr_compare_df = pd.DataFrame({'item_filename': [], 'Google_Vision': []})

In [None]:
# OCR every PDF in ``bucket_list`` and append its text to ``ocr_compare_df``.
# NOTE(review): every listed item is processed; slice ``bucket_list``
# explicitly if only a subset should be run.
for item in bucket_list:
    # Derive the bare document name with plain string operations: as a regex,
    # '.pdf' would match any character followed by "pdf" anywhere in the name.
    output_file_name = item.replace('input_dir/', '')
    if output_file_name.endswith('.pdf'):
        output_file_name = output_file_name[:-len('.pdf')]

    gsurl_s = f'gs://ivanallenarchive/{item}'
    gsurl_d = f'gs://ivanallenarchive/output_dir/{output_file_name}'
    async_detect_document(gsurl_s, gsurl_d)
    doc_text = write_to_text(gsurl_d)
    new_text_dict = {'item_filename'    :  output_file_name , 'Google_Vision': doc_text}
    # Append after each document so the rows gathered so far survive a
    # failure later in the loop; ignore_index keeps the row labels unique.
    ocr_compare_df = pd.concat(
        [ocr_compare_df, pd.DataFrame.from_records([new_text_dict])],
        ignore_index=True)


In [None]:
# Save the collected OCR results as a gzip-compressed parquet file.
parquet_path = 'raw_data_df.parquet.gzip'
ocr_compare_df.to_parquet(parquet_path, compression='gzip')

In [None]:
# Export the collected OCR results to an Excel workbook.
excel_path = 'test.xlsx'
ocr_compare_df.to_excel(excel_path)

In [None]:
# Run OCR for one specific PDF and append its text to ``ocr_compare_df``.
item = 'input_dir/0a57f7837a091ce523ddbca495a38198.pdf'
# Derive the bare document name with plain string operations: as a regex,
# '.pdf' would match any character followed by "pdf" anywhere in the name.
output_file_name = item.replace('input_dir/', '')
if output_file_name.endswith('.pdf'):
    output_file_name = output_file_name[:-len('.pdf')]

gsurl_s = f'gs://ivanallenarchive/{item}'
gsurl_d = f'gs://ivanallenarchive/output_dir/{output_file_name}'
async_detect_document(gsurl_s, gsurl_d)
doc_text = write_to_text(gsurl_d)
new_text_dict = {'item_filename'    :  output_file_name , 'Google_Vision': doc_text}
# ignore_index keeps the row labels unique after the append.
ocr_compare_df = pd.concat(
    [ocr_compare_df, pd.DataFrame.from_records([new_text_dict])],
    ignore_index=True)

In [None]:
# Re-read OCR output that already exists in GCS for one document; no new
# OCR request is made here.
# The document name is set explicitly so the row label always matches the
# output being read, whatever ``output_file_name`` an earlier cell left behind.
# NOTE(review): this appends another row for the same document if a previous
# cell already added one — confirm that is intended.
output_file_name = '0a57f7837a091ce523ddbca495a38198'
gsurl_d = f'gs://ivanallenarchive/output_dir/{output_file_name}output-1-to-6'

doc_text = write_to_text(gsurl_d)
new_text_dict = {'item_filename'    :  output_file_name , 'Google_Vision': doc_text}
# ignore_index keeps the row labels unique after the append.
ocr_compare_df = pd.concat(
    [ocr_compare_df, pd.DataFrame.from_records([new_text_dict])],
    ignore_index=True)

In [None]:
# Export the collected OCR results to an Excel workbook.
excel_path = 'test.xlsx'
ocr_compare_df.to_excel(excel_path)