In [24]:
import os
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
import pandas as pd
import cv2

# Tell pytesseract where the Tesseract OCR executable lives. The value below is
# a hard-coded Windows install location; edit it to match your machine, or make
# sure the executable is reachable through your PATH.
pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'

# Function to preprocess image for better OCR accuracy
def preprocess_image(image):
    """Return a grayscale, contrast-boosted, denoised and inverted copy of *image*.

    Args:
        image: A PIL image (typically the cropped license plate region).

    Returns:
        A new PIL image in mode 'L' ready to be handed to the OCR engine.
    """
    # Work on a single luminance channel
    grayscale = image.convert('L')

    # Boost contrast with an enhancement factor of 2
    boosted = ImageEnhance.Contrast(grayscale).enhance(2)

    # A median filter (default kernel) smooths out speckle noise
    denoised = boosted.filter(ImageFilter.MedianFilter())

    # NOTE: this step is a plain colour inversion of the grayscale image;
    # no thresholding/binarisation is performed here.
    return ImageOps.invert(denoised)

# Hypothetical function to detect license plates in an image
def hypothetical_license_plate_detection(image):
    """Find the first edge contour whose bounding box is shaped like a license plate.

    The image is converted to grayscale, run through a Canny edge detector
    (thresholds 50/150) and its external contours are scanned in the order
    OpenCV returns them. The first contour whose bounding box has a
    width/height ratio between 2.5 and 5 (inclusive) is reported.

    Args:
        image: A BGR image array as returned by ``cv2.imread``, or ``None``
            when the file could not be read.

    Returns:
        A ``(xmin, ymin, xmax, ymax)`` tuple for the first matching contour,
        or ``None`` if *image* is ``None`` or no contour matches.
    """
    # cv2.imread signals an unreadable/missing file by returning None; treat
    # that as "no plate found" instead of crashing inside cv2.cvtColor.
    if image is None:
        return None

    # Convert image to grayscale (assuming color image input)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Perform edge detection (example using Canny edge detector)
    edges = cv2.Canny(gray, 50, 150)

    # findContours returns (contours, hierarchy) on some OpenCV versions and
    # (image, contours, hierarchy) on others; the contour list is always the
    # second-to-last element.
    contours = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        # Basic aspect ratio check; adjust the bounds to the plate format
        # you expect in your images.
        aspect_ratio = w / h
        if 2.5 <= aspect_ratio <= 5:
            return x, y, x + w, y + h

    # No plate-shaped region found
    return None


# Function to extract license plate text from an image
def extract_license_plate(img_path, bbox, save_cropped=False, cropped_dir='cropped_images'):
    """Crop a license plate region out of an image file and OCR its text.

    Args:
        img_path: Path of the image file to read.
        bbox: Plate bounding box ordered as ``(ymin, xmin, ymax, xmax)``.
            Note the y-first ordering; it is rearranged internally into the
            ``(left, upper, right, lower)`` box passed to ``Image.crop``.
        save_cropped: When true, write the preprocessed crop to *cropped_dir*
            for inspection.
        cropped_dir: Directory that receives the saved crop; created if it
            does not exist. The file keeps the base name of *img_path*.

    Returns:
        The text recognised by Tesseract, with surrounding whitespace stripped.
    """
    ymin, xmin, ymax, xmax = bbox

    # Open the file in a context manager so its handle is released as soon as
    # the pixel data we need has been copied into the preprocessed image.
    with Image.open(img_path) as image:
        cropped_image = image.crop((xmin, ymin, xmax, ymax))
        preprocessed_image = preprocess_image(cropped_image)

    # Optionally save the preprocessed crop (what the OCR engine actually sees)
    if save_cropped:
        # exist_ok avoids the check-then-create race of os.path.exists()
        os.makedirs(cropped_dir, exist_ok=True)
        cropped_image_path = os.path.join(cropped_dir, os.path.basename(img_path))
        preprocessed_image.save(cropped_image_path)

    # --psm 8 treats the image as a single word; the whitelist restricts the
    # output to upper-case letters and digits.
    custom_config = r'--oem 3 --psm 8 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    license_plate_text = pytesseract.image_to_string(preprocessed_image, config=custom_config)

    return license_plate_text.strip()

# Path to the directory containing the test images
test_image_dir = "C:/Users/GVJai/Desktop/Project/soul_page/test/"

# Collect one result row per image file found in the test directory
test_license_plates = []

# sorted() keeps the CSV row order stable between runs
for img_file in sorted(os.listdir(test_image_dir)):
    # Compare extensions case-insensitively so files such as "IMG.JPG" are not
    # skipped. Adjust the tuple based on your image file types.
    if not img_file.lower().endswith(('.jpg', '.png')):
        continue

    img_path = os.path.join(test_image_dir, img_file)
    img = cv2.imread(img_path)

    # cv2.imread returns None for unreadable files, and the detector returns
    # None when it finds no plate-shaped region; neither case should abort
    # the whole batch.
    detection = hypothetical_license_plate_detection(img) if img is not None else None

    if detection is None:
        print("No license plate region found, leaving text empty for:", img_file)
        license_plate_text = ''
    else:
        xmin, ymin, xmax, ymax = detection
        # extract_license_plate expects the box as (ymin, xmin, ymax, xmax)
        bbox = (ymin, xmin, ymax, xmax)
        license_plate_text = extract_license_plate(img_path, bbox, save_cropped=True)

    test_license_plates.append({'img_id': img_file, 'extracted_text': license_plate_text})

# Convert the results to a DataFrame; naming the columns explicitly keeps the
# CSV header intact even when no images were processed.
test_extracted_text_df = pd.DataFrame(test_license_plates, columns=['img_id', 'extracted_text'])

# Save the extracted text to a CSV file
test_extracted_text_csv_path = "C:/Users/GVJai/Desktop/Project/soul_page/test_extracted_text.csv"
test_extracted_text_df.to_csv(test_extracted_text_csv_path, index=False)

print("Extracted text from test data saved to CSV file:", test_extracted_text_csv_path)


Extracted text from test data saved to CSV file: C:/Users/GVJai/Desktop/Project/soul_page/test_extracted_text.csv
