In [None]:
import json
import os
import re
import subprocess
import xml.etree.ElementTree as ET

from google.cloud import documentai_v1beta3 as documentai
from google.cloud.documentai_toolbox import document
from google.cloud.documentai_v1beta3 import Document
from google.cloud.documentai_v1beta3 import types

In [None]:
def get_documentai_from_google(config, source_file_path):
    """Run a Document AI processor on an image and save the result as JSON.

    Inputs that are not JPEG files are first converted to JPEG with the
    external ``convert`` command (for TIFF inputs only frame ``[0]`` is
    converted). The JPEG bytes are sent to the processor described by
    ``config`` and the returned document is written next to the JPEG file
    with a ``.json`` extension.

    Args:
        config: Mapping providing the keys ``project_id``, ``location``,
            ``processor_id`` and ``GOOGLE_APPLICATION_CREDENTIALS``.
        source_file_path: Path of the image to process.

    Returns:
        A ``(success_flag, target_json_path)`` tuple. Any exception is printed
        and reported as ``success_flag == False``; ``target_json_path`` is
        ``""`` unless the failure happened while writing the JSON file.

    Side effects:
        Sets ``os.environ['GOOGLE_APPLICATION_CREDENTIALS']`` from ``config``
        and may create a ``.jpg`` file next to the source image.
    """
    success_flag = True
    target_json_path = ""

    try:
        path_root, file_ext = os.path.splitext(source_file_path)
        normalized_ext = file_ext.lower()

        if normalized_ext in (".jpg", ".jpeg"):
            target_file_path = source_file_path
        else:
            # Build the target from the split root: str.replace() would rewrite
            # every occurrence of the extension in the path and garbles the
            # path entirely when there is no extension.
            target_file_path = path_root + ".jpg"

            convert_input = source_file_path
            if normalized_ext in (".tif", ".tiff"):
                # Select only the first frame of a (possibly multi-page) TIFF.
                convert_input += "[0]"

            # An argument list avoids shell interpretation of the paths
            # (spaces, quotes, metacharacters); check=True turns a failed
            # conversion into an error instead of silently continuing.
            subprocess.run(["convert", convert_input, target_file_path], check=True)

        with open(target_file_path, "rb") as image_file:
            image_content = image_file.read()

        raw_document = types.Document(content=image_content, mime_type='image/jpeg')

        project_id = config["project_id"]
        location = config["location"]
        processor_id = config["processor_id"]
        name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"

        # Process the document
        request = {"name": name, "document": raw_document}

        # Set the credentials path before the client is constructed.
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = config["GOOGLE_APPLICATION_CREDENTIALS"]
        client = documentai.DocumentProcessorServiceClient()

        result = client.process_document(request=request)
        json_string = Document.to_json(result.document)

        target_json_path = os.path.splitext(target_file_path)[0] + ".json"

        with open(target_json_path, 'w') as f:
            json.dump(json.loads(json_string), f)

    except Exception as error:
        # Best-effort contract: report the failure through the flag, not by raising.
        print(error)
        success_flag = False
    return success_flag, target_json_path

In [None]:
def convert_document_to_hocr(document_path: str, document_title: str) -> "tuple[bool, str]":
    """Convert a saved Document AI JSON file to an hOCR file next to it.

    The document at ``document_path`` is loaded with the Document AI toolbox,
    exported as hOCR, passed through ``correct_hocr_string`` and written to
    the same path with the extension replaced by ``.hocr``.

    Args:
        document_path: Path of the Document AI JSON file.
        document_title: Title passed to the hOCR export.

    Returns:
        A ``(success_flag, target_file_path)`` tuple. Any exception is printed
        and reported as ``success_flag == False``; ``target_file_path`` is
        ``""`` if the failure happened before the output path was computed.
    """
    success_flag = True
    target_file_path = ""

    try:
        wrapped_document = document.Document.from_document_path(document_path=document_path)

        # Converting wrapped_document to hOCR format
        hocr_string = wrapped_document.export_hocr_str(title=document_title)

        updated_hocr_string = correct_hocr_string(hocr_string)

        # Build the output path from the split root: str.replace() would
        # rewrite every occurrence of the extension inside the path.
        target_file_path = os.path.splitext(document_path)[0] + ".hocr"

        # `with` closes the handle even if the write fails. The markup is
        # produced as UTF-8, so write it as UTF-8 regardless of the locale.
        with open(target_file_path, "w", encoding="utf-8") as hocr_file:
            hocr_file.write(updated_hocr_string)
    except Exception as error:
        # Best-effort contract: report the failure through the flag, not by raising.
        print(error)
        success_flag = False
    return success_flag, target_file_path

In [None]:
def correct_hocr_string(hocr_string: str) -> str:
    """Post-process an hOCR document and return the corrected markup.

    The following adjustments are applied to every element:

    * Elements whose ``class`` contains ``ocr_line`` have their direct text
      cleared (child elements are kept) and ``"; baseline 0 0"`` appended to
      an existing ``title``.
    * Elements whose ``name`` contains ``ocr-system`` get
      ``content="scribeocr"``.
    * Any other element with a ``title`` containing ``bbox x0 y0 x1 y1`` has
      the coordinates reordered so that ``x0 <= x1`` and ``y0 <= y1``. Text
      before and after the bbox in the title is preserved.

    Args:
        hocr_string: hOCR markup; it must be well-formed XML.

    Returns:
        The modified markup with the ``html:`` namespace prefix removed from
        element tags.

    Raises:
        xml.etree.ElementTree.ParseError: If ``hocr_string`` is not
            well-formed XML.
    """
    root = ET.fromstring(hocr_string)
    for element in root.iter():
        attrib = element.attrib
        if 'ocr_line' in attrib.get('class', '').split():
            # Remove text within ocr_line elements but keep the child spans
            element.text = ''

            # Append "; baseline 0 0" to the title attribute
            if 'title' in attrib:
                attrib['title'] += "; baseline 0 0"
        elif 'ocr-system' in attrib.get('name', '').split():
            # Update the content attribute to 'scribeocr'
            element.set('content', 'scribeocr')
        elif 'title' in attrib:
            title = attrib['title']
            match = re.search(r'bbox (\d+) (\d+) (\d+) (\d+)', title)
            if match:
                x0, y0, x1, y1 = map(int, match.groups())
                # Normalize so the first corner is top-left, the second bottom-right.
                x0, x1 = sorted((x0, x1))
                y0, y1 = sorted((y0, y1))
                # Splice the bbox back in place so other title properties
                # before or after it are not lost. No lookup of `class` is
                # needed here, so elements without one no longer raise KeyError.
                attrib['title'] = (
                    title[:match.start()]
                    + f'bbox {x0} {y0} {x1} {y1}'
                    + title[match.end():]
                )
    modified_hocr_string = ET.tostring(root, encoding='utf-8').decode('utf-8')
    # ElementTree serializes XHTML-namespaced elements as `html:tag`. Strip the
    # prefix only where it opens or closes a tag, so "html:" occurring in
    # recognized text or attribute values is left untouched.
    return re.sub(r'(</?)html:', r'\1', modified_hocr_string)