In [None]:
import io
import time

from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import ComputerVisionOcrErrorException
from msrest.authentication import CognitiveServicesCredentials
from PIL import Image


from pdf2image import convert_from_path
import os
import pandas as pd



def perform_ocr_(
        stream,
        endpoint="https://vipocrinstance1.cognitiveservices.azure.com/",
        subscription_key='PLACEHOLDER'  # Replace with your Azure subscription key,
):
    """Submit a binary stream to the Azure Read API and yield the text of each page.

    This is a generator: nothing is sent to the service until it is iterated.

    Args:
        stream: binary stream handed to ``client.read_in_stream``.
        endpoint: URL of the Computer Vision resource.
        subscription_key: key used to build the service credentials.
            NOTE(review): prefer passing the key in from the environment
            rather than editing the default in the notebook.

    Yields:
        One string per page result; every recognised line is its words
        joined by single spaces and terminated by a newline.

    Raises:
        Exception: if the read request returns nothing, or if the operation
            finishes without an analysis result (e.g. a failed operation).
    """
    credentials = CognitiveServicesCredentials(subscription_key)
    client = ComputerVisionClient(endpoint, credentials)

    read_response = client.read_in_stream(stream, language='en', raw=True)
    if not read_response:
        raise Exception('Could not send OCR request to Azure')

    # The operation id is the last path segment of the Operation-Location header.
    operation_id = read_response.headers["Operation-Location"].split("/")[-1]

    # Poll once per second until the operation leaves the pending states.
    while True:
        results = client.get_read_result(operation_id)
        if results.status not in ['notStarted', 'running']:
            break
        time.sleep(1)

    # A finished operation without a result payload cannot be iterated; fail
    # with a readable message instead of an AttributeError on None.
    if results.analyze_result is None:
        raise Exception(f'Azure OCR operation ended with status {results.status!r}')

    for text_result in results.analyze_result.read_results:
        yield "".join(
            " ".join(word.text for word in line.words).strip() + "\n"
            for line in text_result.lines
        )


def image_to_byte_stream(image):
    """Encode ``image`` as PNG and return the bytes in a new in-memory stream.

    The image is saved into a scratch buffer, which is closed afterwards; the
    returned ``io.BytesIO`` is a separate object positioned at offset 0.
    """
    scratch = io.BytesIO()
    try:
        image.save(scratch, format='PNG')
        png_bytes = scratch.getvalue()
    finally:
        scratch.close()
    return io.BytesIO(png_bytes)


def perform_ocr(data):
    """Run OCR on a PDF, JPG or PNG input and return its text page by page.

    Args:
        data: a file path (str), raw bytes, a PIL.Image, or a binary stream
            (any object with a ``read`` attribute).
    Returns:
        A list with one string per page, as produced by ``perform_ocr_``.
        If the service raises ``ComputerVisionOcrErrorException`` the
        placeholder ``[[]]`` is returned instead.
    Raises:
        Exception: if ``data`` is none of the supported input kinds.
    """
    stream = None
    opened_here = False  # only close streams this function created from a path
    try:
        if isinstance(data, str):
            stream = open(data, 'rb')
            opened_here = True
        elif isinstance(data, bytes):
            stream = io.BytesIO(data)
        elif isinstance(data, Image.Image):
            stream = image_to_byte_stream(data)
        elif hasattr(data, 'read'):
            stream = data

        if stream is None:
            raise Exception(f'Could not open {data}')

        # perform_ocr_ is a generator; list() drains it here, so the stream
        # is no longer needed once this line returns.
        return list(perform_ocr_(stream))
    except ComputerVisionOcrErrorException:
        return [[]]
    finally:
        if opened_here:
            stream.close()




In [None]:
# Load the worksheet (first column becomes the index) and start with an
# empty-string 'azure_png' column; the frame is shown as the cell output.
ocr_df = pd.read_excel('ocr_df_azure_Test.xlsx', index_col=0).assign(azure_png='')
ocr_df

In [None]:
# Note, free tier is limited to 20 calls per minute
# OCR every row whose 'azure_png' text is still missing or very short, one
# page image at a time. After each pass the frame is saved; if any row came
# back short, wait and try again, up to MAX_PASSES passes in total.
MAX_PASSES = 5  # bound the retries so a PDF that never yields text cannot loop forever
call_cnt = 0
for pass_no in range(MAX_PASSES):
    short_cnt = 0  # rows that still have (almost) no text after this pass
    for index, row in ocr_df.iterrows():
        # str() so that non-string cell values are measured too
        # (presumably NaN after an Excel round-trip — verify).
        if len(str(row['azure_png'])) >= 5:
            continue
        print(index)
        directory = 'test_files'
        scratch_dir = 'scratch_dir_2'
        # NOTE(review): nothing below writes to scratch_dir — confirm whether
        # this cleanup is still needed.
        for filename in os.listdir(scratch_dir):
            os.remove(os.path.join(scratch_dir, filename))

        pdf_text = ''
        images = convert_from_path(f'''{directory}/{row['item_filename']}.pdf''')
        for image in images:
            # Throttle per OCR call (one call per page image), not per row,
            # so a long PDF cannot run past the counter.
            if call_cnt >= 20:
                time.sleep(60)
                call_cnt = 0
            time.sleep(1)
            ocr_text = perform_ocr(image)
            call_cnt += 1
            for line in ocr_text:
                if len(line) > 0:
                    pdf_text = pdf_text + "\n" + str(line)
        ocr_df.at[index, 'azure_png'] = pdf_text
        if len(pdf_text) < 5:
            short_cnt += 1
            print(index, 'short!')

    # Persist progress before any long wait.
    ocr_df.to_excel('ocr_df_azure_Test.xlsx')
    if short_cnt == 0:
        break
    if pass_no < MAX_PASSES - 1:
        time.sleep(300)



In [None]:
# Reload the worksheet (first column becomes the index) and start with an
# empty-string 'azure_pdf' column; the frame is shown as the cell output.
ocr_df = pd.read_excel('ocr_df_azure_Test.xlsx', index_col=0).assign(azure_pdf='')
ocr_df

In [None]:
# Note, free tier is limited to 20 calls per minute
# Send each PDF to perform_ocr as a file path and store the concatenated
# page text in 'azure_pdf'.
call_cnt = 0
for index, row in ocr_df.iterrows():
    # Crude throttle: after 20 calls, pause for a minute and restart the count.
    if call_cnt >= 20:
        time.sleep(60)
        call_cnt = 0

    # Rows that already hold text are left untouched.
    if len(ocr_df.at[index, 'azure_pdf']) != 0:
        continue

    directory = 'test_files'
    scratch_dir = 'scratch_dir_2'
    time.sleep(5)

    ocr_text = perform_ocr(f'''{directory}/{row['item_filename']}.pdf''')
    # Each non-empty page is prefixed with a newline and appended in order.
    pdf_text = "".join("\n" + str(page) for page in ocr_text if len(page) > 0)
    ocr_df.at[index, 'azure_pdf'] = pdf_text
    call_cnt += 1
    print(index, len(ocr_text), len(pdf_text))

In [None]:
# Spot-check: show the 'azure_pdf' text stored for the row labelled 0
# (assumes index label 0 exists in the frame — TODO confirm).
ocr_df.at[0, 'azure_pdf']

In [None]:
# Write the frame back to the same workbook it was read from, overwriting it.
ocr_df.to_excel('ocr_df_azure_Test.xlsx')