In [None]:
# Import System Packages
import os

In [None]:
# Import Parsing Packages
from pdf2image import convert_from_path
import pytesseract
import cv2
import numpy as np
import pandas as pd
import re
from PIL import Image

In [None]:
# Import LLM Packages
from langchain.document_loaders import PDFMinerPDFasHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI

In [None]:
# Pull the OpenAI API key out of the process environment.
# Indexing os.environ raises KeyError right here when OPENAI_API_KEY is not set.
OPENAI_KEY = os.environ["OPENAI_API_KEY"]

In [None]:
# Helper function to show an image with cv2
def cv_show_img(title, image, wait=0):
    """Show ``image`` in a cv2 window named ``title``, then close all cv2 windows.

    Args:
        title: Window name handed to ``cv2.namedWindow`` and ``cv2.imshow``.
        image: Image handed to ``cv2.imshow``.
        wait: Delay handed to ``cv2.waitKey``. With the default ``0`` OpenCV
            waits for a key press indefinitely.

    Raises:
        Whatever the cv2 calls raise (e.g. when ``image`` is ``None``). The
        windows are still torn down before the exception propagates.
    """
    # Created outside the try block: if window creation itself fails there is
    # nothing to clean up.
    cv2.namedWindow(title)
    try:
        cv2.startWindowThread()
        cv2.imshow(title, image)
        cv2.waitKey(wait)
    finally:
        # Always tear the window down, even when imshow/waitKey raised or the
        # kernel was interrupted; otherwise the window is left orphaned.
        # NOTE: destroyAllWindows closes every open cv2 window, not only `title`.
        # The surrounding waitKey(1) calls presumably give the GUI event loop a
        # chance to actually process the close request.
        cv2.waitKey(1)
        cv2.destroyAllWindows()
        cv2.waitKey(1)

# Helper function to show multiple images at the same time
def cv_show_mult_img(titleArr, imageArr, wait=0):
    """Show several images at once, one cv2 window per title, then close them all.

    Args:
        titleArr: Sequence of window names; one window is opened per entry.
        imageArr: Sequence of images; ``imageArr[i]`` is shown in the window
            named ``titleArr[i]``. Entries beyond ``len(titleArr)`` are ignored.
        wait: Delay handed to ``cv2.waitKey`` once all windows are open. With
            the default ``0`` OpenCV waits for a key press indefinitely.

    Raises:
        IndexError: If ``imageArr`` has fewer entries than ``titleArr``.
        Whatever the cv2 calls raise. In every case the windows opened so far
        are torn down before the exception propagates.
    """
    try:
        for idx, window_title in enumerate(titleArr):
            cv2.namedWindow(window_title)
            cv2.startWindowThread()
            cv2.imshow(window_title, imageArr[idx])
        cv2.waitKey(wait)
    finally:
        # Always close the windows, even when a failure happened part-way
        # through the loop; otherwise already-opened windows are left orphaned.
        # The surrounding waitKey(1) calls presumably give the GUI event loop a
        # chance to actually process the close request.
        cv2.waitKey(1)
        cv2.destroyAllWindows()
        cv2.waitKey(1)

In [None]:
# Retrieve PDF Name, Metadata, and create Output Directories
pdf_name = 'Sample_Inpt_HR'
pdf_path = f'./records/{pdf_name}.pdf'

# Directory for the per-page images of the PDF.
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create
# race; it still raises FileExistsError if the path exists but is not a directory.
image_directory = f'./records/{pdf_name}_Images'
os.makedirs(image_directory, exist_ok=True)

# Directory for the Tesseract text output.
pytesseract_output_directory = f'./records/{pdf_name}_Tesseract'
os.makedirs(pytesseract_output_directory, exist_ok=True)



In [None]:
# Render every page of the PDF to a JPEG inside image_directory.
# The second positional argument of convert_from_path is the DPI, spelled out here by keyword.
pdf_pages = convert_from_path(
    pdf_path,
    dpi=500,
    output_folder=image_directory,
    fmt='jpg',
    output_file='page',
)

In [None]:
# Create Tesseract Output for each PDF page image
# sorted() makes the processing order deterministic; os.listdir() returns
# directory entries in arbitrary order.
for page in sorted(os.listdir(image_directory)):
    img = cv2.imread(f'{image_directory}/{page}')
    if img is None:
        # cv2.imread returns None (it does not raise) for entries it cannot
        # decode, e.g. stray non-image files in the directory.
        print(f'Skipping {page}: not a readable image')
        continue

    # Process with Tesseract
    # OpenCV loads pixels in BGR order while pytesseract assumes RGB, so
    # convert a copy for OCR and keep the BGR original for display.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    pytesseract_output = pytesseract.image_to_string(img_rgb, lang='eng')
    print(pytesseract_output)
    # Explicit UTF-8 so non-ASCII OCR output does not depend on the platform's
    # default text encoding.
    with open(f'{pytesseract_output_directory}/{page}.txt', 'w', encoding='utf-8') as f:
        f.write(pytesseract_output)

    # Display the page; with cv_show_img's default wait=0 this blocks until a
    # key is pressed before moving on to the next page.
    cv_show_img(page, img)
