In [4]:
import cv2
import numpy as np
import pytesseract
import fitz  # PyMuPDF for PDF to image conversion
import camelot  # For table extraction from PDFs
import pandas as pd  # Import pandas for DataFrame creation
import tkinter as tk
from tkinter import Canvas, Scrollbar, Frame
from PIL import Image, ImageTk

In [11]:
# Tell pytesseract where the Tesseract executable lives (machine-specific; edit as needed).
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Input document to run OCR on (machine-specific absolute path).
pdf_path = r'D:\smarterdevOCRScanner\templates\LCP\WorksOrder-21.pdf'

# Open the document with PyMuPDF and take its first page (pages are 0-indexed).
pdf_document = fitz.open(pdf_path)
page = pdf_document.load_page(0)

# Rasterise the page at 1300 DPI. This yields a very large bitmap, and every
# later step (thresholding, denoising, OCR) scales with its pixel count.
pix = page.get_pixmap(dpi=1300)

# Wrap the raw samples in a PIL image.
# NOTE(review): assumes the pixmap is plain 3-channel RGB (no alpha); confirm
# if get_pixmap is ever called with alpha or a different colorspace.
pixmap_size = (pix.width, pix.height)
img = Image.frombytes("RGB", pixmap_size, pix.samples)

# OpenCV works in BGR channel order, so swap channels from PIL's RGB.
rgb_pixels = np.array(img)
img_cv = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

# Preprocessing step 1: collapse the BGR page image to a single gray channel.
img_gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)

# Preprocessing step 2: binarise with a Gaussian-weighted adaptive threshold.
img_thresh = cv2.adaptiveThreshold(
    img_gray,
    255,                             # value assigned to pixels that pass the threshold
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    11,                              # neighbourhood (block) size in pixels
    2,                               # constant subtracted from the weighted mean
)

# Preprocessing step 3: non-local-means denoising of the binarised image.
img_denoised = cv2.fastNlMeansDenoising(
    img_thresh,
    None,   # no preallocated destination
    30,     # filter strength
    7,      # template window size
    21,     # search window size
)

# Tesseract options: OEM 3 (default engine selection) and PSM 6
# (treat the page as a single uniform block of text).
config = '--oem 3 --psm 6'

# Run OCR over the cleaned image and keep the plain-text result.
text_cv = pytesseract.image_to_string(img_denoised, lang='eng', config=config)

# Uncomment to inspect the recognised text:
#print("Extracted Text:")
#print(text_cv)

# Ask Tesseract for its tab-separated layout table. Note that this call uses
# pytesseract's default language/config, not the `config` string used for
# image_to_string.
data = pytesseract.image_to_data(img_denoised)

# One list of fields per TSV line; the first line is the header.
dataList = [tsv_line.split('\t') for tsv_line in data.split('\n')]
header, *records = dataList

# Rows shorter than the header (e.g. a trailing blank line) are padded with
# None by the DataFrame constructor and removed by dropna().
df = pd.DataFrame(records, columns=header).dropna()

# Columns that hold numbers; everything arrives from the TSV as strings.
col_int = ['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf']

# Parse each numeric column (unparseable cells become NaN), replace NaN with 0,
# then cast to int. Fractional values such as confidences are truncated by the cast.
df[col_int] = (
    df[col_int]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .astype(int)
)

# Draw on a copy so the rendered page in img_cv stays untouched.
image = img_cv.copy()

# Which layout level to outline: 'page', 'block', 'para', 'line' or 'word'.
level = 'word'

# Level name -> (value of the 'level' column for that level, BGR box colour).
box_styles = {
    'page': (1, (0, 0, 0)),
    'block': (2, (255, 0, 0)),
    'para': (3, (0, 255, 0)),
    'line': (4, (0, 0, 255)),
    'word': (5, (0, 255, 0)),
}

# An unrecognised level name draws nothing.
if level in box_styles:
    wanted_level, box_color = box_styles[level]
    box_columns = ['level', 'left', 'top', 'width', 'height', 'conf', 'text']
    for row_level, x, y, w, h, _conf, txt in df[box_columns].values:
        if row_level != wanted_level:
            continue
        cv2.rectangle(image, (x, y), (x + w, y + h), box_color, 20)
        # Only word boxes are labelled with their recognised text.
        if level == 'word':
            cv2.putText(image, txt, (x, y), cv2.FONT_HERSHEY_PLAIN, 1, (255, 0, 0), 2)

# Maximum size of the on-screen preview, in pixels.
desired_width = 1600
desired_height = 1000

# Fit the annotated image inside the desired_width x desired_height box while
# keeping its aspect ratio. Both limits are checked: picking the limit from
# "wider than tall" alone lets landscape images with an aspect ratio below
# desired_width / desired_height overflow the height limit.
src_height, src_width = image.shape[:2]
aspect_ratio = src_width / src_height
width_scale = desired_width / src_width
height_scale = desired_height / src_height

if width_scale <= height_scale:
    # Width is the binding constraint.
    scale = width_scale
    new_width = desired_width
    new_height = max(1, int(src_height * scale))
else:
    # Height is the binding constraint.
    scale = height_scale
    new_height = desired_height
    new_width = max(1, int(src_width * scale))

# INTER_AREA is OpenCV's recommended filter for shrinking and keeps thin box
# lines from aliasing away; fall back to bilinear when enlarging.
interpolation = cv2.INTER_AREA if scale < 1 else cv2.INTER_LINEAR
resized_image = cv2.resize(image, (new_width, new_height), interpolation=interpolation)

# Show the preview and block until a key is pressed; always tear the window
# down, even if the wait is interrupted.
try:
    cv2.imshow("Bounding Box", resized_image)
    cv2.waitKey(0)
finally:
    cv2.destroyAllWindows()