In [None]:
%pip install ipympl
%pip install reportlab>=3.6.2
%pip install PyPDF2
%pip install ocrmypdf
%pip install pdf2jpg
%pip install PyMuPDF

In [1]:
from tempfile import TemporaryDirectory
import os
os.environ['USE_TORCH'] = '1'
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from PIL import Image
from PyPDF2 import PdfMerger
from ocrmypdf.hocrtransform import HocrTransform
import fitz

In [2]:
# --- Input documents -------------------------------------------------------
# The two redacted exhibit PDFs that the OCR loop further down iterates over.
new_orig_pdf_path = Path('amendments/New_Exhibit_Redacted.pdf')
old_orig_pdf_path = Path('amendments/Old_Exhibit_Redacted.pdf')

# --- Architecture names ----------------------------------------------------
# Reference lists of detection / recognition architecture names. They are not
# read by the predictor call below; they only document the available choices.
det_arch_options = [
    'linknet_resnet18',
    'linknet_resnet34',
    'linknet_resnet50',
    'db_resnet50',
    'db_mobilenet_v3_large',
    'fast_tiny',
    'fast_small',
    'fast_base',
]

reco_arch_options = [
    'crnn_vgg16_bn',
    'crnn_mobilenet_v3_small',
    'crnn_mobilenet_v3_large',
    'sar_resnet31',
    'master',
    'vitstr_small',
    'vitstr_base',
    'parseq',
]

# --- OCR model -------------------------------------------------------------
# Pretrained predictor built from one detection and one recognition
# architecture named in the lists above.
predictor = ocr_predictor(
    det_arch='fast_base',
    reco_arch='crnn_vgg16_bn',
    pretrained=True,
)

In [3]:
def convert_pdf_page_to_image(fitz_doc, image_path, idx, zoom=4):
    """Render a single page of an open PyMuPDF document and save it to disk.

    Args:
        fitz_doc: Open document object; its ``load_page`` method is used.
        image_path: Destination handed to the rendered pixmap's ``save``.
        idx: Index of the page to load from ``fitz_doc``.
        zoom: Scale factor applied to both axes of the render matrix.
    """
    scale = fitz.Matrix(zoom, zoom)
    rendered = fitz_doc.load_page(idx).get_pixmap(matrix=scale)
    rendered.save(image_path)

def write_ocr_text_to_pdfa(pdf_outpath, hocr_path, image_path, dpi=1000):
    """Combine an hOCR file and a page image into a single-page PDF.

    Args:
        pdf_outpath: Path of the PDF to write.
        hocr_path: Path of the hOCR XML file describing the recognised text.
        image_path: Path of the page image passed along with the hOCR data.
        dpi: Resolution forwarded to ``HocrTransform``. Defaults to 1000, the
            value this function previously hard-coded.
            NOTE(review): confirm 1000 matches the pixel grid the hOCR
            coordinates were produced in.
    """
    hocr = HocrTransform(hocr_filename=hocr_path, dpi=dpi)

    # Produce the OCR'd PDF from the hOCR data and the page image.
    hocr.to_pdf(
        out_filename=pdf_outpath,
        image_filename=image_path,
    )
    
def write_hocr_xml_file(hocr_path, page_xml):
    """Write one page's hOCR XML to ``hocr_path`` as UTF-8 text.

    Args:
        hocr_path: Destination file path.
        page_xml: Sequence whose first element is the XML payload, either as
            UTF-8 encoded bytes or as an already-decoded string. Any further
            elements are ignored.
    """
    payload = page_xml[0]
    if isinstance(payload, (bytes, bytearray)):
        payload = payload.decode('utf-8')
    # Pin the file encoding. Without it the text is re-encoded with the
    # platform default, which can fail or corrupt non-ASCII characters on
    # systems whose default is not UTF-8.
    with open(hocr_path, 'w', encoding='utf-8') as f:
        f.write(payload)

def create_pdfa_with_ocr(output_dir, output_stem, ocr_xml, fitz_doc, zoom=4):
    """Build ``<output_stem>_docTR.pdf`` in ``output_dir`` from per-page OCR output.

    For every entry of ``ocr_xml`` the function writes the hOCR XML, renders
    the matching page of ``fitz_doc`` to a PNG, turns both into a one-page PDF,
    and finally merges all one-page PDFs into the output file. Intermediate
    files live in a temporary directory created under the current working
    directory and are removed when the function returns.

    Args:
        output_dir: Directory for the merged PDF (``str`` or ``Path``).
        output_stem: File-name stem used for the intermediate and final files.
        ocr_xml: Iterable of per-page entries; each entry is passed unchanged
            to ``write_hocr_xml_file``.
        fitz_doc: Open PyMuPDF document the page images are rendered from.
        zoom: Render scale passed to ``convert_pdf_page_to_image``
            (default 4, the value previously hard-coded here).
    """
    output_path = Path(output_dir) / f'{output_stem}_docTR.pdf'

    with TemporaryDirectory(dir=Path(os.getcwd())) as tmpdir:
        tmppath = Path(tmpdir)
        merger = PdfMerger()
        try:
            for idx, page_xml in enumerate(ocr_xml):
                hocr_path   = tmppath / f'{output_stem}_hocr_page{idx}.xml'
                image_path  = tmppath / f'{output_stem}_image_page{idx}.png'
                pdf_outpath = tmppath / f'{output_stem}_docTR_page{idx}.pdf'
                write_hocr_xml_file(hocr_path, page_xml)
                convert_pdf_page_to_image(fitz_doc, image_path, idx, zoom=zoom)
                write_ocr_text_to_pdfa(pdf_outpath, hocr_path, image_path)
                merger.append(pdf_outpath)
            merger.write(output_path)
        finally:
            # Always close the merger, and do it while the temporary directory
            # still exists, so anything it holds on the per-page PDFs is
            # released before those files are deleted, even if a page failed.
            merger.close()

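# NOTE: disabled alternative implementation, kept commented out. It reuses the
# name write_ocr_text_to_pdfa with a different signature, so rename it before
# re-enabling, otherwise it would shadow the active function of the same name.
# def write_ocr_text_to_pdfa(ocr_text, doc_pages, output_base_path):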
#     # returns: list of tuple where the first element is the (bytes) xml string and the second is the ElementTree
#     xml_outputs = ocr_text.export_as_xml()

#     # init the above parser
#     parser = HocrParser()

#     merger = PdfMerger()
#     with TemporaryDirectory(dir= Path(os.getcwd())) as tmpdir:
#         tmppath = Path(tmpdir)
#         for i, (xml, img) in enumerate(zip(xml_outputs, doc_pages)):
#             xml_element_tree = xml[1]
#             pdf_file = str(tmppath / f'{i}.pdf')
#             parser.export_pdfa(pdf_file, hocr=xml_element_tree, image=img)
#             merger.append(pdf_file)
#         merger.write(f'{output_base_path}_docTR.pdf' )
#         merger.close()

In [5]:
# Run OCR on each exhibit and write "<stem>_docTR.pdf" next to the source PDF.
# After the loop, `ocr_text` holds the result for the last path in the list.
for pdf_file in [old_orig_pdf_path, new_orig_pdf_path]:
    # Not used within this cell; kept so the name stays defined for other cells.
    output_base_path = pdf_file.parent / pdf_file.stem
    doc_pages = DocumentFile.from_pdf(pdf_file)
    ocr_text = predictor(doc_pages)
    ocr_xml = ocr_text.export_as_xml()
    # Open the PDF only for the rendering step and close it deterministically
    # instead of leaving one open handle behind per iteration.
    with fitz.open(pdf_file) as fitz_doc:
        create_pdfa_with_ocr(output_dir  = pdf_file.parent,
                             output_stem = pdf_file.stem,
                             ocr_xml = ocr_xml,
                             fitz_doc = fitz_doc)
    

In [6]:
# `ocr_text` is whatever the PDF loop in the previous cell assigned last. When
# the cells run in order that is the result for new_orig_pdf_path, so only
# that document is exported here.
ocr_dict = ocr_text.export()

In [18]:
# List every word on the first exported page with the edges of its box.
# geo[0] supplies the left/top values and geo[1] the right/bottom values; each
# is rounded to three decimals before printing.
for block in ocr_dict['pages'][0]['blocks']:
    for line in block['lines']:
        for word in line['words']:
            geo = word['geometry']
            text = word['value']
            left, right = np.round(geo[0][0], 3), np.round(geo[1][0], 3)
            top, bottom = np.round(geo[0][1], 3), np.round(geo[1][1], 3)
            print(f'left: {left}, right: {right}, top: {top}, bottom: {bottom}, {text}')


left: 0.47, right: 0.524, top: 0.065, bottom: 0.078, Exhibit
left: 0.521, right: 0.553, top: 0.065, bottom: 0.079, A-4
left: 0.369, right: 0.438, top: 0.133, bottom: 0.146, GROUND
left: 0.438, right: 0.494, top: 0.133, bottom: 0.146, SPACE
left: 0.492, right: 0.615, top: 0.133, bottom: 0.146, REQUIREMENTS
left: 0.5, right: 0.542, top: 0.16, bottom: 0.174, 10.00'
left: 0.5, right: 0.528, top: 0.175, bottom: 0.186, N/A
left: 0.367, right: 0.436, top: 0.188, bottom: 0.202, BACKUP
left: 0.434, right: 0.495, top: 0.188, bottom: 0.202, POWER
left: 0.495, right: 0.618, top: 0.188, bottom: 0.202, REQUIREMENTS
left: 0.494, right: 0.528, top: 0.201, bottom: 0.215, Fuel
left: 0.525, right: 0.564, top: 0.202, bottom: 0.217, Type:
left: 0.563, right: 0.591, top: 0.202, bottom: 0.214, N/A
left: 0.419, right: 0.47, top: 0.216, bottom: 0.229, UTILITY
left: 0.467, right: 0.563, top: 0.217, bottom: 0.227, REQUIREMENTS
left: 0.035, right: 0.073, top: 0.146, bottom: 0.16, Total
left: 0.069, right: 0.113, 

In [20]:
# Dimensions stored in the export for the first page.
# NOTE(review): presumably (height, width) in pixels; confirm against the docTR docs.
ocr_dict['pages'][0]['dimensions']

(1584, 1224)