In [None]:
from pdf2image import convert_from_path
import os

from common.path_setup import data_dir

# Path to the PDF file
pdf_path = os.path.join(data_dir, 'electoral_rolls.pdf')

# Page of the PDF to rasterise. Kept in one place so the conversion call and
# the output filename cannot drift apart.
page_number = 3

# Convert only the requested page of the PDF to an image
images = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
if not images:
    # Fail with a clear message instead of an IndexError on images[0]
    raise ValueError(f"No image was produced for page {page_number} of {pdf_path}")

# Make sure the output folder exists; saving does not create missing directories
image_dir = os.path.join(data_dir, 'pdf_images')
os.makedirs(image_dir, exist_ok=True)

# Save the rendered page as a PNG
image_path = os.path.join(image_dir, f'page_{page_number}.png')
images[0].save(image_path, 'PNG')

In [66]:
import cv2
import numpy as np
from PIL import ImageEnhance, ImageFilter
from PIL import Image


# Contour-area window used to keep only region-sized boxes and drop noise.
# Both bounds are exclusive, matching the original `> 30000 and < 250000` test.
MIN_REGION_AREA = 30000
MAX_REGION_AREA = 250000

# Load the image. cv2.imread returns None (it does not raise) when the file
# is missing or unreadable, so check explicitly.
image = cv2.imread(image_path)
if image is None:
    raise FileNotFoundError(f"Could not read image at {image_path}")

# Convert to gray scale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Use adaptive thresholding to highlight the regions of interest.
# THRESH_BINARY_INV makes dark strokes white so they are picked up as contours.
thresh = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)

# Find contours on the thresholded image
contours, _hierarchy = cv2.findContours(
    thresh, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)

# Filter contours based on area to remove noise.
# Each contour's area is computed once and reused for both lists.
contour_areas = [(cnt, cv2.contourArea(cnt)) for cnt in contours]
kept = [(cnt, area) for cnt, area in contour_areas
        if MIN_REGION_AREA < area < MAX_REGION_AREA]
filtered_contours = [cnt for cnt, _area in kept]
areas = [area for _cnt, area in kept]

# We'll store the bounding boxes of the regions of interest here,
# as (left, top, right, bottom) tuples
bounding_boxes = []

# Draw on a copy so `image` stays unannotated for any later processing
outlined_image = image.copy()

# For each contour, find the bounding rectangle and store it
for contour in filtered_contours:
    x, y, w, h = cv2.boundingRect(contour)
    bounding_boxes.append((x, y, x + w, y + h))

    # Draw a green rectangle around the contour for visualization
    cv2.rectangle(outlined_image, (x, y), (x + w, y + h), (0, 255, 0), 2)

# Save the image with outlined regions of interest.
# NOTE(review): the filename says "page_2" although the loaded image is
# page_3.png -- confirm whether this should be renamed.
outlined_image_path = os.path.join(data_dir, 'pdf_images', 'outlined_images', 'page_2_final.png')
os.makedirs(os.path.dirname(outlined_image_path), exist_ok=True)
if not cv2.imwrite(outlined_image_path, outlined_image):
    # cv2.imwrite reports failure through its return value
    raise IOError(f"Could not write outlined image to {outlined_image_path}")

print(len(filtered_contours))
print(sorted(areas))

30
[93517.5, 93520.5, 93539.0, 93539.0, 93544.0, 93546.5, 93634.5, 93643.0, 93654.5, 93657.5, 93667.0, 93667.0, 93911.0, 93935.0, 94011.5, 94014.0, 94015.0, 94034.5, 94034.5, 94035.5, 94038.5, 94039.0, 94115.0, 94136.5, 94136.5, 94435.0, 94515.0, 94534.5, 94535.5, 94536.0]


In [55]:
import pytesseract
from pytesseract import Output

# Collected OCR text, keyed by a region identifier
ocr_results = dict()


def ocr_on_roi(image, contour, index):
    """Run Tesseract OCR on the bounding box of ``contour`` inside ``image``.

    Args:
        image: Array-like image that supports 2-D ``[rows, cols]`` slicing.
        contour: Contour passed to ``cv2.boundingRect`` to locate the region.
        index: Accepted for call compatibility; not used by this function.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    left, top, width, height = cv2.boundingRect(contour)

    # Crop the page down to the contour's bounding rectangle
    row_span = slice(top, top + height)
    col_span = slice(left, left + width)
    cropped = image[row_span, col_span]

    # '--psm 6' is the Tesseract page-segmentation mode passed through as-is
    raw_text = pytesseract.image_to_string(
        cropped, config='--psm 6', output_type=Output.STRING
    )
    return raw_text.strip()

# Reuse the grayscale page computed right after loading, before any outlines
# are drawn, so OCR never sees annotation strokes.
gray_image = gray

# Process only a small slice of the detected regions (not all, to save time).
# `max_regions` is the exclusive stop index of the slice, so with the values
# below three regions (indices 2, 3 and 4) are processed.
first_region = 2
max_regions = 5
selected_contours = filtered_contours[first_region:max_regions]

for i, contour in enumerate(selected_contours):
    text = ocr_on_roi(gray_image, contour, i)  # Use grayscale image for OCR
    if text:  # If text was detected, save it with an identifier
        # Keys are numbered from 0 within the slice, not by contour index
        ocr_results[f"Region_{i}"] = text

ocr_results  # Display the results from the OCR process

{'Region_0': 'Photo is\nAvailable',
 'Region_1': 'Photo is\nAvailable',
 'Region_2': 'Photo is\nAvailable'}