# Aim : Code to convert pdf to text

So there are a number of libraries that can be used to convert pdf to text.

1. pymupdf4llm

Let's go through each of them.

## Loading our pdf files

In [6]:
# Let's load all the pdf files from the directory

import os
import dotenv

dotenv.load_dotenv()

folder_path = os.getenv("CLAIM_LOCATION")

pdf_files = [folder_path + "/" + f for f in os.listdir(folder_path) if f.endswith(".pdf")]

print(pdf_files)

['/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Compliance Report 4.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Compliance Report 1.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Compliance Report 2.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Compliance Report 3.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Sleep Study Report 3.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Sleep Study Report 2.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Sleep Study Report 1.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Sleep Study Report 4.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Order 3.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Delivery Ticket 2.pdf', '/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Physician Notes 1.pdf', '/Users/deveshsurve/UNIVERSITY/P

# First, let's try pymupdf4llm

In [2]:
def extract_text_pymupdf4llm(pdf_file):
    import pymupdf4llm
    md_text = pymupdf4llm.to_markdown(pdf_file)
    return md_text


In [None]:
md_text = extract_text_pymupdf4llm(pdf_files[1])
print(md_text)


# Next, let's try pymupdf


In [4]:
def extract_text_pymupdf(pdf_file):
  import pymupdf # imports the pymupdf library

  total_text = ""

  doc = pymupdf.open(pdf_file) # open a document
  for page in doc: # iterate the document pages
        text = page.get_text() # get plain text encoded as UTF-8
        total_text += text
  return total_text

total_text = extract_text_pymupdf(pdf_files[1])

In [None]:
print(total_text)

# Next, let's try pdfminer-six

In [6]:
def extract_text_pdfminer(pdf_file):    
    from pdfminer.high_level import extract_text
    text = extract_text(pdf_file)
    return text

text = extract_text_pdfminer(pdf_files[1])


In [None]:
print(text)

# Next, let's try llama-index


In [None]:
def extract_text_llama_index(pdf_file):
    from llama_parse import LlamaParse
    from llama_index.core import SimpleDirectoryReader
    import nest_asyncio; nest_asyncio.apply()

    # set up parser
    parser = LlamaParse(
        result_type="markdown"  # "markdown" and "text" are available
    )

    # use SimpleDirectoryReader to parse our file
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(input_files=[pdf_file], file_extractor=file_extractor).load_data()
    return documents[0].text

text = extract_text_llama_index(pdf_files[1])


In [None]:
print(text)

# Finally, Pypdf which also used in langchain


In [10]:
def extract_text_pypdf(pdf_file):
    from pypdf import PdfReader

    reader = PdfReader(pdf_file)
    number_of_pages = len(reader.pages)
    page = reader.pages[0]
    text = page.extract_text()
    return text

text = extract_text_pypdf(pdf_files[1])


In [None]:
print(text)

# Next, let's try pytesseract

In [24]:
def extract_text_ocr(pdf_file):
    from pdf2image import convert_from_path
    import pytesseract
    images = convert_from_path(pdf_file, dpi=300)
    text = pytesseract.image_to_string(images[0])
    return text

text = extract_text_ocr(pdf_files[7])   
print([text])


['SPLIT (NIGHT FRREPORT MUBISQONS STF Dl SOSTTAIRERG ENE BIRER\n\nNanna: StudyDBiate?/22/2B)32023\n\nDOB: Gender: Male Medira IRRedE\n\nHeight: 5"11\'1" Weigfit: 2220/t@ lbs BMI: 3088s Sleep SSpeiciaiis\n\nPhysician): ReiseceddoffraanD DO Scorimyl Tet h0gdanedves RPSGT\nType oof Tas G@PEALNIGMGHT\n\nChief CGomiplain$n orig apayne ated\n\nMedicatiosscimmanton mikktithist|4ishfish soily sitatvasketinppirilh hydroontnetiniaziglejocsannineaspinirB 18h ang,\niprattoopionbrdonoamide\n\ni ; ogra DGOjEN sleepforosisix monaore\nhowe wit aebysicion igexiew raed dhterpret atin Afbilowiymemanatercs wane: costck 3-2, FaLbM 1 Aa oe AM DJORIMA, OL Ma) BOC, Ml,\nLOC-M2, GhimE EMGR RA LAT hanasphessu ce tkenmistosnoressensoRiP (chest, abdtonten) pullseooniatetryodpadyitiposiAdinsaoniggofollews\nthe reno mencatiationéittre AAmerncahcatcade nt 5 of Sléep cid iinessAH I (Apa ai yalypoqarsdbnsite )isifveheumbenbé aporap remathipyqojeaeas\nwith aat40/ciecdetatuicatiper hoounfafistee|s MEMSqnequicethe fenite

# Time and Quality Analsis 

For the time comparison, let's run our script which calls all these functions for all our files and measures the time taken

In [20]:
# Let's create a function to call all these functions and measure the time taken
import time

def calculate_time(pdf_file, function):
    print(f"Running {function.__name__}")
    start_time = time.time()
    text = function(pdf_file)
    end_time = time.time()
    print(f"Time taken: {end_time - start_time} seconds")
    return text, end_time - start_time


In [25]:
# list_of_functions = [extract_text_pymupdf4llm, extract_text_pymupdf, extract_text_pdfminer, extract_text_llama_index, extract_text_pypdf]
list_of_functions = [extract_text_ocr]
results = {}

for function in list_of_functions:
    function_name = function.__name__
    results[function_name] = {
        'total_time': 0,
        'per_file_times': {},
        'texts': {}
    }
    
    for pdf_file in pdf_files:
        text, time_taken = calculate_time(pdf_file, function)
        file_name = pdf_file.split('/')[-1]  # Get just the filename
        
        results[function_name]['per_file_times'][file_name] = time_taken
        results[function_name]['total_time'] += time_taken
        results[function_name]['texts'][file_name] = text

# Save results to JSON
import json
with open('extraction_results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)


Running extract_text_ocr
Time taken: 6.5277791023254395 seconds
Running extract_text_ocr
Time taken: 2.1395387649536133 seconds
Running extract_text_ocr
Time taken: 2.6687958240509033 seconds
Running extract_text_ocr
Time taken: 2.1920359134674072 seconds
Running extract_text_ocr
Time taken: 2.2098207473754883 seconds
Running extract_text_ocr
Time taken: 3.272150993347168 seconds
Running extract_text_ocr
Time taken: 3.0979392528533936 seconds
Running extract_text_ocr
Time taken: 3.54331111907959 seconds
Running extract_text_ocr
Time taken: 2.209604263305664 seconds
Running extract_text_ocr
Time taken: 2.1238410472869873 seconds
Running extract_text_ocr
Time taken: 6.477976083755493 seconds
Running extract_text_ocr
Time taken: 2.482509136199951 seconds
Running extract_text_ocr
Time taken: 2.0024020671844482 seconds
Running extract_text_ocr
Time taken: 3.73122501373291 seconds
Running extract_text_ocr
Time taken: 1.8468129634857178 seconds
Running extract_text_ocr
Time taken: 4.731431961

In [1]:
# Load the results
import json
with open('extraction_results.json', 'r', encoding='utf-8') as f:
    results = json.load(f)

# Print summary of total times
for function_name, data in results.items():
    print(f"\n{function_name} total time: {data['total_time']:.2f} seconds")




extract_text_pymupdf4llm total time: 19.03 seconds

extract_text_pymupdf total time: 0.09 seconds

extract_text_pdfminer total time: 2.24 seconds

extract_text_llama_index total time: 763.31 seconds

extract_text_pypdf total time: 0.32 seconds

extract_text_ocr total time: 30.01 seconds


In [2]:
results.keys()

dict_keys(['extract_text_pymupdf4llm', 'extract_text_pymupdf', 'extract_text_pdfminer', 'extract_text_llama_index', 'extract_text_pypdf', 'extract_text_ocr'])

In [3]:
import plotly.express as px

for file_name in list(results['extract_text_pymupdf4llm']['texts'].keys())[:5]:
    word_counts = {function_name: len(data['texts'][file_name].split()) for function_name, data in results.items()}

    # Generate a horizontal bar chart
    fig = px.bar(x=list(word_counts.values()), y=list(word_counts.keys()), orientation='h',
                labels={'x': 'Word Count', 'y': 'Method'}, title=f"Word Count by Method for {file_name}")
    fig.show()


# So we see that llama_index and pytessract are the best performing method

In [2]:
# Replace the text in the results json creating a final json file

final_results = results['extract_text_ocr']

import json
with open('selected_results.json', 'w', encoding='utf-8') as f:
    json.dump(final_results, f, ensure_ascii=False, indent=4)

NameError: name 'final_results' is not defined

### Special Case : Skewed Image : Sleep Report 4

In [14]:
pdf_files[7]

'/Users/deveshsurve/UNIVERSITY/PROJECT/classify-pdf/data_files/Sleep Study Report 4.pdf'

In [18]:
import fitz  # PyMuPDF
from pdf2image import convert_from_path
import cv2
import numpy as np
from PIL import Image

# Step 1: Extract PDF as Images
pdf_path = pdf_files[7]
images = convert_from_path(pdf_path, dpi=300)

# Step 2: Deskew and Enhance Contrast
def deskew_image(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    edges = cv2.Canny(gray, 50, 150)
    coords = np.column_stack(np.where(edges > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = gray.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    deskewed = cv2.warpAffine(gray, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return deskewed

def enhance_contrast(image):
    _, thresholded = cv2.threshold(image, 128, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresholded

processed_images = [deskew_image(img) for img in images]

# Step 3: Save Back as PDF
pil_images = [Image.fromarray(img) for img in processed_images]
pil_images[0].save(folder_path + "/" + "Sleep Study Report 4 Deskew.pdf", save_all=True, append_images=pil_images[1:])
