Uncomment and run to install the required dependencies

In [None]:
# !apt-get install tesseract-ocr
# !pip install pdf2image
# !pip install pytesseract
# !apt-get install -y poppler-utils

# Import necessary libraries

In [None]:
import os
from pdf2image import convert_from_path
from pytesseract import image_to_string
from PIL import Image, ImageEnhance, ImageFilter
import re


# Function to enhance an image with contrast and sharpening

In [None]:
def enhance_image(image, contrast_factor):
    """Return ``image`` after a contrast adjustment followed by sharpening.

    Args:
        image: PIL image to process.
        contrast_factor: Value handed to ``ImageEnhance.Contrast(...).enhance``.

    Returns:
        The contrast-adjusted image with ``ImageFilter.SHARPEN`` applied.
    """
    # Contrast first, then sharpen the contrast-adjusted result.
    boosted = ImageEnhance.Contrast(image).enhance(contrast_factor)
    return boosted.filter(ImageFilter.SHARPEN)

# Function to convert a PDF to a list of images

In [None]:
def convert_pdf_to_img(pdf_file):
    """Convert the PDF at ``pdf_file`` to images via pdf2image.

    Args:
        pdf_file: Path of the PDF to convert.

    Returns:
        Whatever ``convert_from_path`` returns for that path (the caller
        iterates over it page by page).
    """
    pages = convert_from_path(pdf_file)
    return pages


# Function to extract text from an image using Tesseract OCR

In [None]:
def convert_image_to_text(image):
    """Run Tesseract OCR on ``image`` and return the recognised text.

    Args:
        image: Image to pass to ``pytesseract.image_to_string``.

    Returns:
        The result of ``image_to_string(image)``.
    """
    return image_to_string(image)

# Function to extract parcel number from text using regular expressions

In [None]:

def extract_parcel_number(text):
    """Pull the parcel number out of OCR'd text.

    The pattern is a placeholder: it looks for the literal, upper-case
    phrase ``PARCEL NUMBER`` followed by optional whitespace and a run of
    digits. Replace it with the real extraction logic for your documents.

    Args:
        text: Text to search.

    Returns:
        The digit string that follows ``PARCEL NUMBER``, or ``None`` when
        the pattern does not occur in ``text``.
    """
    match = re.search(r'PARCEL NUMBER\s*(\d+)', text)
    return match.group(1) if match else None

# Function to OCR a PDF at increasing contrast levels and extract its parcel number (the file itself is renamed in a later cell)

In [None]:
def process_pdf_and_rename(pdf_file):
    """OCR a PDF at increasing contrast levels until a parcel number is found.

    Despite its name, this function does not rename anything: it only
    returns the OCR text and the parcel number, and the caller performs the
    rename.

    Each attempt enhances every page with the current contrast factor, runs
    OCR on it and searches that page's text for a parcel number. The search
    stops at the first page that yields one. If no page does, the contrast
    factor is raised and all pages are tried again.

    Args:
        pdf_file: Path of the PDF to process.

    Returns:
        A ``(final_text, parcel_number)`` tuple. ``final_text`` is the
        concatenated OCR text of the pages examined during the last attempt
        (up to and including the page where the number was found).
        ``parcel_number`` is the extracted string, or ``None`` when no
        attempt found one.
    """
    contrast_factor = 1.2  # Initial contrast enhancement factor
    max_contrast_factor = 2.0  # Maximum contrast enhancement factor to try
    contrast_step = 0.3  # With the values above this tries 1.2, 1.5 and 1.8
    parcel_number = None
    final_text = ''

    # Rasterise the PDF once: the page images do not depend on the contrast
    # factor, so converting inside the retry loop only repeated expensive
    # work. Reuse is safe as long as enhance_image returns new images rather
    # than modifying its input (Pillow's enhance/filter calls do so per its
    # documentation).
    images = convert_pdf_to_img(pdf_file)

    while contrast_factor <= max_contrast_factor:
        page_texts = []  # Text of this attempt only; earlier attempts are discarded

        for img in images:
            text = convert_image_to_text(enhance_image(img, contrast_factor))
            page_texts.append(text)

            # Search only the current page's text, not the accumulated text.
            parcel_number = extract_parcel_number(text)
            if parcel_number:
                return ''.join(page_texts), parcel_number

        final_text = ''.join(page_texts)
        contrast_factor += contrast_step  # Increase contrast factor and try again

    return final_text, parcel_number


# Function to rename a PDF file

In [None]:

def rename_pdf_file(old_path, new_name):
    """Rename a PDF in place to ``LOC 4_GAKARARA_<new_name>.pdf``.

    The file stays in its current folder. If the target name is already
    taken by another file, a numeric suffix (``_2``, ``_3``, ...) is appended
    instead of overwriting it. A plain ``os.rename`` would silently replace
    the existing file on POSIX systems and raise on Windows.

    Args:
        old_path: Current path of the PDF.
        new_name: Identifier to embed in the new file name (without
            extension).

    Returns:
        The path the file was actually renamed to.

    Raises:
        OSError: Propagated from ``os.rename`` (e.g. ``FileNotFoundError``
            when ``old_path`` does not exist).
    """
    folder_path = os.path.dirname(old_path)
    base_name = f'LOC 4_GAKARARA_{new_name}'
    new_path = os.path.join(folder_path, f'{base_name}.pdf')

    # The file already carries the requested name: nothing to do, and it
    # must not be treated as a collision with itself.
    if os.path.abspath(new_path) == os.path.abspath(old_path):
        return new_path

    # Never clobber a different file that already owns the target name.
    suffix = 1
    while os.path.exists(new_path):
        suffix += 1
        new_path = os.path.join(folder_path, f'{base_name}_{suffix}.pdf')

    os.rename(old_path, new_path)
    return new_path


# Specify the folder path containing the PDF files

In [None]:
# Folder whose PDF files are OCR'd and renamed by the loop below.
# NOTE(review): the path looks like a Google Drive mount inside Colab — adjust it for your environment.
folder_path = r'/content/drive/MyDrive/Data Science/TRAY 1_2381-2398'

# Iterate through the PDF files in the specified folder

In [None]:

# Maps each original file name to the base name it was renamed to in this run.
renamed_files = {}
# Base names already handed out in this run; a set keeps collision checks cheap.
used_names = set()
# Next number to try for files in which no parcel number was found.
counter = 1

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the fallback numbering) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    # Accept upper-case extensions such as '.PDF' as well.
    if not filename.lower().endswith('.pdf'):
        continue

    file_path = os.path.join(folder_path, filename)
    extracted_text, parcel_number = process_pdf_and_rename(file_path)

    # Print extracted text for reference
    print("Extracted Text:")
    print(extracted_text)

    if parcel_number:
        print("Parcel Number:", parcel_number)
        new_name = f'{parcel_number}'  # Use the parcel number as the new file name
        # Two PDFs can yield the same parcel number. Give later ones a numeric
        # suffix so they do not map to the same target file as the first.
        duplicate = 1
        while new_name in used_names:
            duplicate += 1
            new_name = f'{parcel_number}_{duplicate}'
        note = ''
    else:
        # No parcel number found: fall back to the next unused '1_pn_<n>' name.
        new_name = f'1_pn_{counter}'
        while new_name in used_names:
            counter += 1
            new_name = f'1_pn_{counter}'
        note = ' (Parcel Number not found)'

    # rename_pdf_file adds the 'LOC 4_GAKARARA_' prefix and the '.pdf'
    # extension, so the name on disk is longer than the one printed here.
    rename_pdf_file(file_path, new_name)
    renamed_files[filename] = new_name  # Record the renamed file
    used_names.add(new_name)
    print(f"Renamed PDF to: {new_name}.pdf{note}")
