In [1]:
# Import dependencies
import pytesseract
import cv2
from PIL import Image
from langdetect import detect, DetectorFactory, detect_langs
from spellchecker import SpellChecker
import numpy as np
import re
import pandas as pd

In [2]:
# Read the image
image_path = "law_text.png"
image = cv2.imread(image_path)
# cv2.imread does not raise when the file is missing or cannot be decoded; it
# returns None. Fail here with a clear message instead of letting the None
# reach cv2.cvtColor in the preprocessing step.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path!r}")

In [3]:
# Build the grayscale version of the image that is fed to OCR, and keep a copy on disk.
processed_path = "law_text_processed.png"
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Last expression of the cell: imwrite's boolean result is shown as the cell output.
cv2.imwrite(processed_path, gray)

True

In [4]:
# Extract word-level OCR data from the grayscale image.
# "--psm 3" selects fully automatic page segmentation; "-l eng+lat" loads the
# English and Latin language models.
custom_config = "--psm 3 -l eng+lat"
text = pytesseract.image_to_data(
    gray,
    config=custom_config,
    output_type=pytesseract.Output.DATAFRAME,
)

# Keep only rows whose text is present and not just whitespace, restricted to
# the columns used downstream. One .loc selection replaces the chained
# text[mask][cols] indexing; the strip() test already rules out empty strings,
# so a separate "text != ''" pass is not needed.
has_word = text["text"].notna() & (text["text"].str.strip() != "")
text_cleaned = text.loc[has_word, ['line_num', 'word_num', 'conf', 'text']].dropna()

In [5]:
# Preview the filtered word-level OCR table (pandas abbreviates long frames when printed).
print(text_cleaned)
    

     line_num  word_num       conf      text
4           1         1  89.937950       the
5           1         2  92.879509  purpofe,
6           1         3  96.927704        as
7           1         4  94.146339      alfo
8           1         5  65.091309       Wb.
..        ...       ...        ...       ...
324        45         2  82.389305   feemeth
325        45         3  60.388592      that
326        45         4  62.285683       he:
327        45         5  84.209106      that
328        45         6  81.661011    fteal-

[281 rows x 4 columns]


In [6]:
# Identify the language of the OCR output.
# Fix langdetect's seed so repeated runs of this cell give the same answer.
DetectorFactory.seed = 0

# Plain-text OCR pass over the same grayscale image, using the same Tesseract options.
text_string = pytesseract.image_to_string(gray, config=custom_config)

detected_language = detect(text_string)
print("Detected Language:", detected_language)

language_probabilities = detect_langs(text_string)
print("Detected Language Probabilities:", language_probabilities)

Detected Language: en
Detected Language Probabilities: [en:0.9999971679660928]


In [14]:
# Spell checker used to clean up the OCR tokens
spell = SpellChecker()


def corrected_word(word):
    """Return the spell checker's correction for ``word``.

    Falls back to ``word`` itself when the checker returns nothing usable
    (``None`` or an empty string).
    """
    return spell.correction(word) or word

# Run every OCR token through corrected_word and store the result next to the original text.
text_cleaned['corrected_text'] = text_cleaned['text'].map(corrected_word)

print(text_cleaned)

     line_num  word_num       conf      text corrected_text
4           1         1  89.937950       the            the
5           1         2  92.879509  purpofe,        purpose
6           1         3  96.927704        as             as
7           1         4  94.146339      alfo           also
8           1         5  65.091309       Wb.             wb
..        ...       ...        ...       ...            ...
324        45         2  82.389305   feemeth        feemeth
325        45         3  60.388592      that           that
326        45         4  62.285683       he:             he
327        45         5  84.209106      that           that
328        45         6  81.661011    fteal-          steal

[281 rows x 5 columns]


In [15]:
# Write the OCR table to an Excel workbook, leaving out the DataFrame's row index.
excel_filename = "law_text.xlsx"
text_cleaned.to_excel(excel_writer=excel_filename, engine="openpyxl", index=False)