In [1]:
# Testing Notebook for classifying PDFs as Text-Based or Image-Based
# Important for determining which algorithm we will use for text extraction of documents

# Imports
import os
import sys

# PDF Libraries
import fitz

In [2]:
def classifier(pdf_file_path):
    '''
        Classify a PDF as text-based or image-based using PyMuPDF.

        Parameters:
            pdf_file_path: path of the PDF file to inspect.

        Returns:
            "image" as soon as one page has image blocks but no text blocks,
            otherwise "text".

        If even a single page is an image, the PDF will be classified as image-based.
        Better safe than sorry.
    '''
    # Read the bytes first so the file handle is released before parsing
    with open(pdf_file_path, 'rb') as file:
        pdf_bytes = file.read()

    # Use the document as a context manager so it is closed on every exit path
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page in pdf:
            img_area = 0.0
            text_area = 0.0

            # Each block is (x0, y0, x1, y1, text, block_no, block_type)
            # NOTE(review): whether image blocks show up in "blocks" output may
            # depend on the installed PyMuPDF version / extraction flags - confirm.
            for b in page.get_text("blocks"):
                block_area = fitz.Rect(b[:4]).get_area()
                # block_type 1 marks an image block; checking the type instead of
                # searching the text for '<image' keeps real text that happens to
                # contain that substring from being counted as an image
                if b[6] == 1:
                    img_area += block_area
                else:
                    text_area += block_area

            if text_area == 0.0 and img_area > 0.0:
                return "image"

    return "text"


In [3]:
# Classify every PDF found in the records directory and print "<name> <class>"
records_dir = 'records'
for pdf_name in os.listdir(records_dir):
    if not pdf_name.endswith('.pdf'):
        continue  # ignore anything that is not a PDF
    print(pdf_name, classifier(os.path.join(records_dir, pdf_name)))

Sample_Inpt_HR.pdf image
Sample_HR_Denial_Letter.pdf text


In [4]:
# Function to retrieve all of the text in a text-based PDF
def get_pdf_text(pdf_file_path):
    '''
        Extract the text blocks of a PDF using PyMuPDF.

        Parameters:
            pdf_file_path: path of the PDF file to read.

        Returns:
            A list with one entry per page. Each entry is a list of dicts with
            the keys 'left', 'top', 'right', 'bottom' (block bounds divided by
            the page width / height) and 'text' (the text of the block).
            Image blocks are left out.
    '''
    # Read the bytes first so the file handle is released before parsing
    with open(pdf_file_path, 'rb') as file:
        pdf_bytes = file.read()

    pages = []

    # Use the document as a context manager so it is closed on every exit path
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf:
        for page in pdf:
            page_blocks = []
            page_rect = page.bound()
            page_width = page_rect.width
            page_height = page_rect.height

            # Each block is (x0, y0, x1, y1, text, block_no, block_type)
            for b in page.get_text("blocks"):
                # block_type 1 marks an image block; checking the type instead of
                # searching the text for '<image' keeps text blocks that contain
                # that substring from being dropped
                if b[6] == 1:
                    continue

                page_blocks.append({
                    'left': b[0] / page_width,
                    'top': b[1] / page_height,
                    'right': b[2] / page_width,
                    'bottom': b[3] / page_height,
                    'text': b[4]
                })

            pages.append(page_blocks)

    return pages

In [5]:
# Walk the PDFs in the records directory and, for every text-based one,
# print its extracted text blocks grouped by page
for pdf_name in os.listdir('records'):
    if not pdf_name.endswith('.pdf'):
        continue
    if classifier(os.path.join('records', pdf_name)) != 'text':
        continue

    print(pdf_name)
    extracted_pages = get_pdf_text(os.path.join('records', pdf_name))
    for page_number, page_blocks in enumerate(extracted_pages, start=1):
        print(f'\tPage: {page_number}')
        for block in page_blocks:
            print(f'\t\t{block["left"]}, {block["top"]}, {block["right"]}, {block["bottom"]}: {block["text"]}')
    print('\n\n')

                

Sample_HR_Denial_Letter.pdf
	Page: 1
		0.11764705882352941, 0.10655628551136363, 0.880282532935049, 0.16355572324810605: Important: This notice explains your right to appeal our decision. Read this notice
carefully. If you need help, you can call one of the numbers listed on the last page under
“Get help & more information.”

		0.23529411764705882, 0.20965568465415876, 0.6942595936893637, 0.2322251290986032: Notice of Denial of Medical Coverage

		0.11764705882352941, 0.25135687625769415, 0.8682669845281863, 0.26828395959102747: Date: 12/15/1999
Member Number: 12345

		0.11764705882352941, 0.3142186945134943, 0.3017578125, 0.33114577784682764: Name: John Brown

		0.11764705882352941, 0.3771631260110874, 0.8294527141097324, 0.41738313617128314: Your request was denied. We’ve denied the payment of right inguinal hernia
repair listed below requested by your doctor:

		0.11764705882352941, 0.46340048433554293, 0.4619179021299275, 0.4831487482244318: Why did we deny your request?

		0.11764

In [6]:
# Trying a new classification method with a new package
import pdfplumber

In [7]:
def classifier(pdf_file_path, min_text_coverage=0.05):
    '''
        Classify a PDF as text-based or image-based using pdfplumber.

        Note: this redefinition replaces the PyMuPDF-based classifier that
        appears earlier in the notebook.

        Parameters:
            pdf_file_path: path of the PDF file to inspect.
            min_text_coverage: minimum fraction of a page's area that must be
                covered by word boxes plus table cells for the page to count
                as text (default 0.05).

        Returns:
            "image" as soon as one page falls below min_text_coverage,
            otherwise "text".

        If even a single page is an image, the PDF will be classified as image-based.
        Better safe than sorry.
    '''
    # Open PDF file
    with pdfplumber.open(pdf_file_path) as pdf:
        for page in pdf.pages:
            # Areas are measured per page: a running total across pages would
            # let earlier text pages hide a later scanned page
            page_area = float(page.width) * float(page.height)
            if page_area <= 0.0:
                # Degenerate page: there is no area to measure coverage against
                continue

            text_area = 0.0
            for word in page.extract_words():
                x0, y0, x1, y1 = map(float, (word['x0'], word['top'], word['x1'], word['bottom']))
                text_area += (x1 - x0) * (y1 - y0)

            table_area = 0.0
            # find_tables() exposes cell bounding boxes; extract_tables() only
            # returns the cell text, which cannot be turned into an area
            for table in page.find_tables():
                for cell in table.cells:
                    if cell is not None:
                        x0, y0, x1, y1 = map(float, cell)
                        table_area += (x1 - x0) * (y1 - y0)

            if (text_area + table_area) / page_area < min_text_coverage:
                return "image"

    return "text"

In [8]:
# Classify every PDF found in the records directory and print "<name> <class>"
records_dir = 'records'
for pdf_name in os.listdir(records_dir):
    if not pdf_name.endswith('.pdf'):
        continue  # ignore anything that is not a PDF
    print(pdf_name, classifier(os.path.join(records_dir, pdf_name)))

Sample_Inpt_HR.pdf image
Sample_HR_Denial_Letter.pdf text


In [20]:
# Function to retrieve all of the text & tables in a text-based PDF
def get_pdf_content(pdf_file_path):
    '''
        Extract the text lines and table cells of a PDF using pdfplumber.

        Parameters:
            pdf_file_path: path of the PDF file to read.

        Returns:
            A dict with two keys, each holding one entry per page:
              'text':   list of line dicts for the page
              'tables': list of tables for the page; each table is a list of
                        rows and each row is a list of cell dicts
            Line and cell dicts have the keys 'left', 'top', 'right', 'bottom'
            (bounds divided by the page width / height) and 'text'.
            Cell positions that have no bounding box are skipped.
    '''
    text = []
    tables = []

    # Open PDF file
    with pdfplumber.open(pdf_file_path) as pdf:

        for page in pdf.pages:
            page_height = page.height
            page_width = page.width

            text_arr = []
            for line in page.extract_text_lines():
                text_arr.append({
                    'left': line['x0'] / page_width,
                    'top': line['top'] / page_height,
                    'right': line['x1'] / page_width,
                    'bottom': line['bottom'] / page_height,
                    'text': line['text']
                })

            # find_tables() gives access to cell bounding boxes; extract_tables()
            # only returns the cell text, so it cannot provide positions.
            # The page's tables go into their own name so the result list
            # 'tables' is not overwritten on every page.
            table_arr = []
            for table in page.find_tables():
                rows_arr = []
                # table.extract() yields the cell text row by row, in the same
                # order as the bounding boxes in table.rows
                for row, row_text in zip(table.rows, table.extract()):
                    row_arr = []  # fresh list per row so rows do not share cells
                    for bbox, cell_text in zip(row.cells, row_text):
                        if bbox is None:
                            continue
                        row_arr.append({
                            'left': bbox[0] / page_width,
                            'top': bbox[1] / page_height,
                            'right': bbox[2] / page_width,
                            'bottom': bbox[3] / page_height,
                            'text': cell_text
                        })
                    rows_arr.append(row_arr)
                table_arr.append(rows_arr)

            text.append(text_arr)
            tables.append(table_arr)

    return {'text': text, 'tables': tables}


In [21]:
# Loop over PDFs in the records directory
# If it's a text-based PDF, extract its text lines and tables and print them
for file in os.listdir('records'):
    if file.endswith('.pdf'):
        pdf_path = os.path.join('records', file)
        if classifier(pdf_path) == 'text':
            # Call the pdfplumber-based extractor: the recorded output of this
            # cell is a dict with a 'text' key, which get_pdf_content returns
            # (get_pdf_text returns a plain list of pages)
            print(get_pdf_content(pdf_path))

{'text': [[{'left': 0.11764705882352941, 'top': 0.108331853655303, 'right': 0.8282207381372548, 'bottom': 0.12348336880681815, 'text': 'Important: This notice explains your right to appeal our decision. Read this notice'}, {'left': 0.11764705882352941, 'top': 0.12836803042297976, 'right': 0.8802945033333334, 'bottom': 0.14351954557449492, 'text': 'carefully. If you need help, you can call one of the numbers listed on the last page under'}, {'left': 0.11764705882352941, 'top': 0.14840420845328278, 'right': 0.38136680460784317, 'bottom': 0.16355572360479795, 'text': '“Get help & more information.”'}, {'left': 0.23529411764705882, 'top': 0.212023109837067, 'right': 0.6942596070148304, 'bottom': 0.23222513067040026, 'text': 'Notice of Denial of Medical Coverage'}, {'left': 0.11764705882352941, 'top': 0.2531324470896465, 'right': 0.8682789549019608, 'bottom': 0.2682839622411617, 'text': 'Date: 12/15/1999 Member Number: 12345'}, {'left': 0.11764705882352941, 'top': 0.31599426527146457, 'righ