In [1]:
import fitz  # PyMuPDF
import cv2
import numpy as np
import os

In [2]:
def save_image(image, page_num, img_count, output_dir='extracted_images'):
    """Write one image to ``output_dir`` as a PNG file.

    The file is named ``page_<page_num>.<img_count>.png``.

    Args:
        image: Pixel array to hand to ``cv2.imwrite``. OpenCV interprets
            3-channel data as BGR when encoding.
        page_num: Page number used in the file name.
        img_count: Index of the image within its page, used in the file name.
        output_dir: Directory to write into; it is created if missing.

    Returns:
        The path of the file that was written.

    Raises:
        OSError: If OpenCV reports that the file could not be written.
    """
    # exist_ok=True avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    image_filename = f'page_{page_num}.{img_count}.png'
    image_filepath = os.path.join(output_dir, image_filename)

    # cv2.imwrite reports a failed write by returning False instead of
    # raising, so the result has to be checked to avoid silently losing output.
    if not cv2.imwrite(image_filepath, image):
        raise OSError(f'Could not write image to {image_filepath}')
    return image_filepath

def extract_and_crop_images_from_scanned_pdf(pdf_path, output_dir='extracted_images',
                                             threshold=150, min_area=1000):
    """Render each PDF page, find dark regions and save each one as a PNG crop.

    Every page is rasterised, converted to grayscale and inverse-thresholded;
    the bounding box of each sufficiently large external contour is cropped
    from the colour page and passed to ``save_image``.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        output_dir: Directory forwarded to ``save_image`` for the crops.
        threshold: Gray level (0-255) used for the inverse binary threshold;
            pixels at or below it become foreground. May need tuning per scan.
        min_area: Contours whose area is below this value are skipped.
    """
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Ask explicitly for 3-channel RGB without alpha so the reshape and
            # the colour conversions below always see the layout they expect.
            pix = page.get_pixmap(colorspace=fitz.csRGB, alpha=False)
            rgb = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)

            # The pixmap is RGB, so convert from RGB (not BGR) to get the
            # correct luminance weights for the grayscale image.
            gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

            # OpenCV writes 3-channel images as BGR; swap the channels once per
            # page so the saved crops keep their original colours.
            bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

            # Inverse threshold: dark content on a light page becomes white
            # foreground, which is what findContours traces.
            _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV)
            contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

            img_count = 1
            for contour in contours:
                # Skip small regions, which are unlikely to be images.
                if cv2.contourArea(contour) < min_area:
                    continue

                x, y, w, h = cv2.boundingRect(contour)
                cropped_image = bgr[y:y+h, x:x+w]
                # Page numbers in file names are 1-based.
                save_image(cropped_image, page_num + 1, img_count, output_dir)
                img_count += 1
    finally:
        # Release the document even if rendering or saving a page fails.
        doc.close()

In [3]:
# Example run: extract image regions from a scanned PDF.
# Crops are written to the function's default output directory.
pdf_file = "../input/test.pdf"  # Replace with your PDF file path
extract_and_crop_images_from_scanned_pdf(pdf_path=pdf_file)