In [9]:
import cv2
import numpy as np
import fitz  # PyMuPDF for PDF to image conversion
import os
from glob import glob
from PIL import Image
import re  # Import regex to sanitize filenames

# Folder that holds the source PDFs to convert
pdf_folder = r'D:\smarterdevOCRScanner\templates\AlanDeMel'

# Folder the rendered page images are written to.
# NOTE: update this together with pdf_folder on every run so that images from
# one template set are not written into another set's folder.
image_output_folder = r'D:\smarterdevOCRScanner\templates\AlanDeMelImages'

# Create the output folder (and any missing parents). exist_ok=True makes this
# a no-op when the folder is already there and avoids the check-then-create race.
os.makedirs(image_output_folder, exist_ok=True)

# Function to sanitize filenames (removes/replace invalid characters)
def sanitize_filename(filename):
    """Return ``filename`` with every forbidden character swapped for '_'.

    The forbidden set is the nine characters listed in ``forbidden`` below;
    all other characters are passed through unchanged.
    """
    forbidden = '<>:"/\\|?*'
    # One translation table entry per forbidden character, each mapping to '_'
    table = str.maketrans(forbidden, '_' * len(forbidden))
    return filename.translate(table)

# Collect every PDF directly inside the input folder (non-recursive)
pdf_paths = glob(os.path.join(pdf_folder, '*.pdf'))

# Render each PDF page by page and write one PNG per page
for pdf_path in pdf_paths:
    # Base name of the PDF without its extension, made safe for output filenames
    pdf_name = sanitize_filename(os.path.splitext(os.path.basename(pdf_path))[0])

    # Open through a context manager so the document is closed even when
    # rendering a page raises part-way through the file.
    with fitz.open(pdf_path) as pdf_document:
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)

            # Render the full page at 700 DPI. alpha=False is spelled out
            # because the "RGB" frombytes call below needs exactly 3 bytes
            # per pixel.
            pix = page.get_pixmap(dpi=700, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

            # Output name: <sanitized pdf name>_page_<1-based page number>.png
            img_filename = f"{pdf_name}_page_{page_num + 1}.png"
            img_output_path = os.path.join(image_output_folder, img_filename)

            # Existing files are overwritten; tell the user before doing so
            if os.path.exists(img_output_path):
                print(f"File {img_output_path} already exists and will be replaced.")

            try:
                img.save(img_output_path, "PNG")
            except OSError as e:
                # A failed write is reported and the batch moves on to the next page
                print(f"Error saving {img_output_path}: {e}")
            else:
                print(f"Saved: {img_output_path}")

print("PDF to image conversion completed for all PDFs.")


Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-01_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-02_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-03_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-04_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-05_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-06_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-07_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-08_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-09_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-10_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-11_page_1.png
Saved: D:\smarterdevOCRScanner\templates\AlanDeMelImages\Alandemel-12_page_1.png
Saved: D:\smarterdevOCRScann