In [2]:
# Testing notebook for classifying PDFs as text-based or image-based.
# The classification determines which algorithm we will use to extract text from a document.

# Imports
import os
import sys

# PDF Libraries
import fitz

In [11]:
def classifier(pdf_file_path):
    '''
        Classify a PDF as text-based or image-based.

        Each page is split into layout blocks. A page counts as an image page
        when it has image blocks covering a non-zero area and no text blocks
        covering any area. If even a single page is an image page, the whole
        PDF is classified as image-based. Better safe than sorry.
        Pages with no blocks at all are ignored.

        Parameters:
            pdf_file_path: path of the PDF file to inspect.

        Returns:
            "image" if any page is an image page, otherwise "text".

        Raises:
            OSError if the file cannot be opened or read; whatever fitz.open
            raises if the bytes cannot be parsed as a PDF.
    '''
    # block_type value PyMuPDF stores in the last field of an image block.
    image_block_type = 1

    # Read the raw bytes first so the file handle is released immediately.
    with open(pdf_file_path, 'rb') as file:
        pdf_bytes = file.read()

    # The context manager closes the document on every exit path,
    # including the early return below.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page in pdf:
            img_area = 0.0
            text_area = 0.0

            # Block layout: (x0, y0, x1, y1, text, block_no, block_type).
            # NOTE(review): whether image blocks are reported at all depends on
            # the text-extraction flags in effect for the installed PyMuPDF
            # version - confirm.
            for b in page.get_text("blocks"):
                area = fitz.Rect(b[:4]).get_area()
                # Decide by block type rather than by searching the text for
                # '<image', which misfires on text that contains that literal.
                if b[6] == image_block_type:
                    img_area += area
                else:
                    text_area += area

            if text_area == 0.0 and img_area > 0.0:
                return "image"

    return "text"


In [12]:
# Walk the records directory and print each PDF's name next to its classification
for file in os.listdir('records'):
    # Skip anything that is not a PDF
    if not file.endswith('.pdf'):
        continue
    print(file, classifier(os.path.join('records', file)))

Sample_Inpt_HR.pdf image
Sample_HR_Denial_Letter.pdf text


In [22]:
# Function to retrieve all of the text in a text-based PDF
def get_pdf_text(pdf_file_path):
    '''
        Extract the text blocks of a PDF together with their positions.

        Parameters:
            pdf_file_path: path of the PDF file to read.

        Returns:
            A list with one entry per page, in page order. Each entry is a
            list of dicts, one per text block on that page (image blocks are
            skipped), with the keys:
                'left', 'top', 'right', 'bottom': block edges divided by the
                    page width / height, i.e. expressed as a fraction of the
                    page size
                'text': the text of the block

        Raises:
            OSError if the file cannot be opened or read; whatever fitz.open
            raises if the bytes cannot be parsed as a PDF.
    '''
    # block_type value PyMuPDF stores in the last field of an image block.
    image_block_type = 1

    # Read the raw bytes first so the file handle is released immediately.
    with open(pdf_file_path, 'rb') as file:
        pdf_bytes = file.read()

    pages = []

    # The context manager closes the document once all pages are processed.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page in pdf:
            page_rect = page.bound()
            page_width = page_rect.width
            page_height = page_rect.height

            page_blocks = []
            # Block layout: (x0, y0, x1, y1, text, block_no, block_type).
            for b in page.get_text("blocks"):
                # Filter by block type rather than by searching the text for
                # '<image', which would drop text containing that literal.
                if b[6] == image_block_type:
                    continue
                page_blocks.append({
                    'left': b[0] / page_width,
                    'top': b[1] / page_height,
                    'right': b[2] / page_width,
                    'bottom': b[3] / page_height,
                    'text': b[4]
                })

            pages.append(page_blocks)

    return pages

In [25]:
# Walk the records directory; for every PDF classified as text-based,
# extract its text blocks and print them page by page
for file in os.listdir('records'):
    # Skip anything that is not a PDF
    if not file.endswith('.pdf'):
        continue

    pdf_path = os.path.join('records', file)
    # Image-based PDFs need a different extraction path, so leave them out here
    if classifier(pdf_path) != 'text':
        continue

    print(file)
    pdf_text_output = get_pdf_text(pdf_path)
    for page_index, page_blocks in enumerate(pdf_text_output):
        print(f'\tPage: {page_index+1}')
        for block in page_blocks:
            print(f'\t\t{block["left"]}, {block["top"]}, {block["right"]}, {block["bottom"]}: {block["text"]}')
    print('\n\n')

                

Sample_HR_Denial_Letter.pdf
	Page: 1
		0.11764705882352941, 0.10655628551136363, 0.880282532935049, 0.16355572324810605: Important: This notice explains your right to appeal our decision. Read this notice
carefully. If you need help, you can call one of the numbers listed on the last page under
“Get help & more information.”

		0.23529411764705882, 0.20965568465415876, 0.6942595936893637, 0.2322251290986032: Notice of Denial of Medical Coverage

		0.11764705882352941, 0.25135687625769415, 0.8682669845281863, 0.26828395959102747: Date: 12/15/1999
Member Number: 12345

		0.11764705882352941, 0.3142186945134943, 0.3017578125, 0.33114577784682764: Name: John Brown

		0.11764705882352941, 0.3771631260110874, 0.8294527141097324, 0.41738313617128314: Your request was denied. We’ve denied the payment of right inguinal hernia
repair listed below requested by your doctor:

		0.11764705882352941, 0.46340048433554293, 0.4619179021299275, 0.4831487482244318: Why did we deny your request?

		0.11764