# In [7]:
import os
import re
import requests
from spellchecker import SpellChecker
import pytesseract
from tqdm import tqdm
from PIL import Image
import fitz  # PyMuPDF

def correct_grammar(text, server_url="http://127.0.0.1:8008/completion", timeout=(10, 600)):
    """Ask a completion server to fix grammar and spelling in ``text``.

    This is a best-effort helper: on any failure the original ``text`` is
    returned unchanged, so a caller never loses its input.

    Args:
        text: The text to correct.
        server_url: Endpoint that accepts a JSON body with a ``prompt`` key.
        timeout: Passed straight to ``requests.post``; either a single number
            of seconds or a ``(connect, read)`` tuple. The read timeout is
            generous because generating a completion can be slow.

    Returns:
        The string found under the ``content`` key of the JSON response, or
        the original ``text`` when the request fails, the server answers with
        a non-200 status, or the response carries no usable text.
    """
    # Nothing to correct: skip the round-trip entirely.
    if not text or not text.strip():
        return text

    # Prepare the payload for the request
    payload = {"prompt": f"Please correct the grammar and spelling in the following text and provide only the corrected version: {text}"}

    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.post(server_url, json=payload, timeout=timeout)
    except requests.RequestException as e:
        print("Error communicating with the server:", e)
        return text

    if response.status_code != 200:
        print("Server returned an error:", response.status_code, response.text)
        return text

    try:
        body = response.json()
    except ValueError as e:
        # The body was not valid JSON.
        print("Error communicating with the server:", e)
        return text

    # Extract the corrected text from the 'content' key
    corrected_text = body.get('content') if isinstance(body, dict) else None
    if not isinstance(corrected_text, str) or not corrected_text.strip():
        # Falling back keeps the input instead of replacing it with nothing.
        print("Server response contained no corrected text; keeping the original.")
        return text
    return corrected_text

# Shared spell checker; built lazily because constructing one per call is
# wasted work when many documents are assessed.
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared SpellChecker, creating it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = SpellChecker()
    return _SPELL_CHECKER


def assess_ocr_quality(text):
    """Classify OCR output into a coarse quality bucket.

    Args:
        text: The OCR'd (and possibly corrected) text.

    Returns:
        One of:
        ``'number_heavy'`` - more than half of all characters are digits;
        ``'unusable'`` - fewer than 20 non-blank-padded characters, no word
        tokens at all, or more than 50% of the word tokens are unknown to
        the spell checker;
        ``'okay'`` - passes the checks above but contains at least one
        character that is neither a word character nor whitespace;
        ``'good'`` - everything else.
    """
    if not text:
        return 'unusable'

    # Number Heavy Check
    num_ratio = sum(c.isdigit() for c in text) / len(text)
    if num_ratio > 0.5:
        return 'number_heavy'

    # Basic Length Check (surrounding whitespace does not count as content)
    if len(text.strip()) < 20:  # arbitrary minimum length
        return 'unusable'

    # Strip leading/trailing punctuation from each token so that "word," or
    # "(word)" is looked up as "word" instead of being flagged as unknown.
    tokens = [
        token
        for token in (re.sub(r'^\W+|\W+$', '', raw) for raw in text.split())
        if token
    ]
    if not tokens:
        return 'unusable'

    # Spell Checking: unknown() yields a set, so count offending tokens
    # individually to compare like with like against len(tokens).
    unknown = _get_spell_checker().unknown(tokens)
    misspelled_count = sum(1 for token in tokens if token.lower() in unknown)
    if misspelled_count > 0.5 * len(tokens):  # more than 50% of words are misspelled
        return 'unusable'

    # Simple Coherence Check (can be improved with NLP techniques)
    if re.search(r'[^\w\s]', text):  # regex for non-word, non-space characters
        return 'okay'

    # If none of the above, classify as good
    return 'good'

# Directory that holds the input PDFs.
source_dir = "/Users/garfieldgreglim/Documents/JQ/Knowledgebase 2/pdf"
# Output directory for each quality label; the keys are used to look up
# where a document's text file is written.
target_dirs = {
    'good': "/Users/garfieldgreglim/Documents/JQ/Knowledgebase 2/pdf_ocrs/good_ocr",
    'okay': "/Users/garfieldgreglim/Documents/JQ/Knowledgebase 2/pdf_ocrs/okay_ocr",
    'unusable': "/Users/garfieldgreglim/Documents/JQ/Knowledgebase 2/pdf_ocrs/unusable_ocr",
    'number_heavy': "/Users/garfieldgreglim/Documents/JQ/Knowledgebase 2/pdf_ocrs/number_heavy_ocr"
}

# isdir (not exists) so that a regular file at this path is rejected too.
if not os.path.isdir(source_dir):
    print("Source directory does not exist.")
    # SystemExit is always available, unlike the interactive exit() helper.
    raise SystemExit(1)

# exist_ok avoids the check-then-create race and keeps reruns harmless.
for target_dir in target_dirs.values():
    os.makedirs(target_dir, exist_ok=True)

# Get list of PDF files (sorted so runs process files in a stable order)
pdf_files = sorted(f for f in os.listdir(source_dir) if f.lower().endswith(".pdf"))

# Process each file with a progress bar.
# For every PDF: render each page to an image, OCR it, send the combined text
# through correct_grammar, classify it with assess_ocr_quality, and write it
# to the directory registered for that quality label.
for filename in tqdm(pdf_files, desc="Processing PDFs"):
    pdf_path = os.path.join(source_dir, filename)
    try:
        page_texts = []
        # The context manager closes the document even when OCR fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                page_texts.append(pytesseract.image_to_string(image))
        text = "".join(page_texts)

        text = correct_grammar(text)
        quality = assess_ocr_quality(text)
        out_path = os.path.join(target_dirs[quality], filename + ".txt")
        # Explicit UTF-8 so non-ASCII OCR output does not depend on the locale.
        with open(out_path, "w", encoding="utf-8") as text_file:
            text_file.write(text)
    except Exception as e:
        # Per-file boundary: report the failure and continue with the next PDF.
        print(f"Error processing {filename}: {e}")


# Output: Processing PDFs: 100%|████████████████████████| 103/103 [31:38<00:00, 18.43s/it]
