In [37]:
import easyocr
import cv2
import numpy as np
import torch
import os

## Module Two: CPU-only OCR with EasyOCR
---

In [38]:
# Force CPU-only execution: blank out CUDA_VISIBLE_DEVICES and make torch
# report that CUDA is unavailable.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Save the real function only the first time this cell runs. On a re-run
# torch.cuda.is_available is already the patched lambda, and capturing it
# again would lose the genuine implementation.
if "original_is_available" not in globals():
    original_is_available = torch.cuda.is_available
torch.cuda.is_available = lambda: False


image_path = '../datasets/Page_Level_Test_Set/image.png'
image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file is missing or
# cannot be decoded; fail here with a clear message rather than inside cvtColor.
if image is None:
    raise FileNotFoundError(f"Could not read input image: {image_path}")
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Simple binary thresholding (less memory intensive)
_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)

preprocessed_path = 'preprocessed.jpg'
# cv2.imwrite reports failure through its boolean return value; without this
# check OCR could silently run against a stale or missing file.
if not cv2.imwrite(preprocessed_path, thresh):
    raise OSError(f"Could not write preprocessed image: {preprocessed_path}")

print("Initializing EasyOCR with CPU...")
# download_enabled=False: the English model files must already be available
# locally, because the reader is not allowed to fetch them.
reader = easyocr.Reader(['en'], gpu=False, download_enabled=False)

print("Performing OCR...")
results = reader.readtext(preprocessed_path,
                          paragraph=False,
                          decoder='greedy',
                          batch_size=1,
                          text_threshold=0.7,
                          width_ths=0.7,
                          height_ths=0.7)

# Print raw results
print("Raw OCR Results:")
print("-" * 50)
# With paragraph=False each result unpacks as (bounding box, text, confidence).
for bbox, text, conf in results:
    print(f"Text: {text} | Confidence: {conf:.2f}")

Using CPU. Note: This module is much faster with a GPU.


Initializing EasyOCR with CPU...
Performing OCR...
Raw OCR Results:
--------------------------------------------------
Text: Gone With The Wind | Confidence: 0.97
Text: Chapter One | Confidence: 1.00
Text: Scarlett O'Hara was not beautiful; but men seldom realized it when caught | Confidence: 0.61
Text: by hcr charm as thc Tarlcton twins Wcrc: In hcr facc wcrc too sharply | Confidence: 0.60
Text: blendledl the clelicale {ealures of Hier molher; a Coasl arislocral o French | Confidence: 0.32
Text: descent; and the heavy ones of her florid Irish father, But it was an aresting | Confidence: 0.76
Text: face, pointed of chin; square of jaw. Iler eyes were | Confidence: 0.53
Text: green without a | Confidence: 0.92
Text: touch of hazel, starred with bristly black lashes and slightly tilted at the ends. | Confidence: 0.58
Text: Abovc   thcm, hcr  thick   black   brows   slantcd   upward, cutting a startling | Confidence: 0.34
Text: oblique line in her  magolia-whiite skin-that  skin $0 prized

## Post-OCR Processing
---

In [39]:
import re
from difflib import get_close_matches


In [40]:
from spellchecker import SpellChecker
# Spell-checker instance created with default arguments.
# NOTE(review): `spell` is not referenced in the cells visible here; presumably
# a later correction step uses it. Confirm before removing.
spell = SpellChecker()


In [41]:

def post_process(results):
    """Clean OCR detections, print a report, and return the cleaned data.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed from each detection's text, and the remainder is stripped of
    surrounding whitespace. Detections whose text is not a ``str`` or that
    end up empty after cleaning are dropped.

    Args:
        results: Iterable of ``(bbox, text, conf)`` triples.

    Returns:
        A tuple ``(processed_results, concatenated_text)``:
        ``processed_results`` is a list of dicts with the keys ``'bbox'``,
        ``'text'`` (cleaned) and ``'confidence'``; ``concatenated_text`` is
        the cleaned texts joined by single spaces, in input order.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')

    kept = []
    for box, raw_text, score in results:
        # Guard clauses: skip non-string payloads and anything that is empty
        # once punctuation and surrounding whitespace are gone.
        if not isinstance(raw_text, str):
            continue
        cleaned = disallowed.sub('', raw_text).strip()
        if not cleaned:
            continue
        kept.append({
            'bbox': box,
            'text': cleaned,
            'confidence': score
        })

    # Each kept text is already stripped and non-empty, so a plain join gives
    # the space-separated text with no leading or trailing whitespace.
    joined = " ".join(entry['text'] for entry in kept)

    wide_rule = "-" * 50
    narrow_rule = "-" * 30

    print("\nPost-processed OCR Results:")
    print(wide_rule)
    for entry in kept:
        print(f"Text: {entry['text']}")
        print(f"Confidence: {entry['confidence']:.2f}")
        print(narrow_rule)

    print("\nConcatenated Text:")
    print(wide_rule)
    print(joined)

    return kept, joined

# Clean the raw OCR detections, then write the joined text to disk as UTF-8.
final_results, concatenated_text = post_process(results)
with open('ocr_output.txt', mode='w', encoding='utf-8') as out_file:
    out_file.write(concatenated_text)
print("\nOutput saved to ocr_output.txt")



Post-processed OCR Results:
--------------------------------------------------
Text: Gone With The Wind
Confidence: 0.97
------------------------------
Text: Chapter One
Confidence: 1.00
------------------------------
Text: Scarlett OHara was not beautiful but men seldom realized it when caught
Confidence: 0.61
------------------------------
Text: by hcr charm as thc Tarlcton twins Wcrc In hcr facc wcrc too sharply
Confidence: 0.60
------------------------------
Text: blendledl the clelicale ealures of Hier molher a Coasl arislocral o French
Confidence: 0.32
------------------------------
Text: descent and the heavy ones of her florid Irish father But it was an aresting
Confidence: 0.76
------------------------------
Text: face pointed of chin square of jaw Iler eyes were
Confidence: 0.53
------------------------------
Text: green without a
Confidence: 0.92
------------------------------
Text: touch of hazel starred with bristly black lashes and slightly tilted at the ends
Confidence: