In [14]:
import fitz
import os
import io
from PIL import Image
import hashlib
import base64


In [16]:
def base64_to_image(base64_string, output_path):
    image_data = base64.b64decode(base64_string)
    with open(output_path, "wb") as f:
        f.write(image_data)

def generate_id(data):
    return hashlib.md5(data.encode('utf-8') if isinstance(data, str) else data).hexdigest()[:10]

def extract_content(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    all_pages_data = []

    for page_num in range(len(doc)):
        page = doc[page_num]
        
        # 1. Generate unique ID for the page and extract text
        page_text = page.get_text()
        page_id = generate_id(f"{pdf_path}_page_{page_num}_{page_text[:100]}")
        
        # 2. Extract images
        image_list = page.get_images(full=True)
        images_data = []
        
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Generate unique ID for the image
            image_id = generate_id(image_bytes)
            
            # Get image size and position
            pix = fitz.Pixmap(doc, xref)
            img_rect = fitz.Rect(page.get_image_rects(img)[0])
            
            # Store image data
            images_data.append({
                "id": image_id,
                "filename": f"{image_id}.{base_image['ext']}",
                "position": {
                    "x0": img_rect.x0,
                    "y0": img_rect.y0,
                    "x1": img_rect.x1,
                    "y1": img_rect.y1
                },
                "size": {
                    "width": pix.width,
                    "height": pix.height
                },
                "imagedata": base64.b64encode(image_bytes).decode('utf-8')
            })
            
            # Save image file
            image_path = os.path.join(output_folder, f"{image_id}.{base_image['ext']}")
            with open(image_path, "wb") as image_file:
                image_file.write(image_bytes)
        
        # 3. Create JSON object for the page
        page_data = {
            "id": page_id,
            "page_number": page_num + 1,
            "text": page_text,
            "images": images_data,
            "metadata": {
                "title": doc.metadata.get("title", ""),
                "author": doc.metadata.get("author", ""),
                "subject": doc.metadata.get("subject", ""),
                "keywords": doc.metadata.get("keywords", ""),
                "page_size": {
                    "width": page.rect.width,
                    "height": page.rect.height
                }
            }
        }
        
        all_pages_data.append(page_data)
    
    doc.close()
    
    return all_pages_data
# Usage
pdf_path = "/Users/han/Desktop/Projects/elasticsearch-labs/supporting-blog-content/extracting-and-searching-diagrams/pdfs/Elastic Security overview _ Documentation.pdf"
output_folder = "/Users/han/Desktop/Projects/elasticsearch-labs/supporting-blog-content/extracting-and-searching-diagrams/images"
all_pages_data=extract_content(pdf_path, output_folder)

In [20]:
all_pages_data[1]['images'][0]['imagedata']

'iVBORw0KGgoAAAANSUhEUgAABaEAAANMCAIAAADKVvZiAAAACXBIWXMAAA7EAAAOxAGVKw4bAAPinElEQVR4nOzdB1hUZ9o//vef3WQT80/yJq9ZNauuLZogGhQiCMG+KJZVECsaUeyiYgcBxQJI771K770PQ5veGDrDwDAzDIhKs1dwfzccMyGauElEBvH+XPdyzZzznOc8ZyQ657vnPOd/HD19sbCwsLCwsLCwsLCwsLCwsN72+h/4338QQgghhBBCCCGE3mbPM45nz54peiQIIYQQQgghhBBCfx5mHAghhBBCCCGEEBoJ8F4VhBBCCCGEEEIIjQSYcSCEEEIIIYQQQmgkeJ5x9Pb2KnokCCGEEEIIIYQQQn8eXseBEEIIIYQQQgihkQDnHEUIIYQQQgghhNBIgNdxIIQQQgghhBBCaCTAjAMhhBBCCCGEEEIjAWYcCCGEEEIIIYQQGgkw40AIIYQQQgghhNBIgHOOIoQQQgghhBBCaCTA6zgQQgghhBBCCCE0EjzPOHp7exU9EoQQQgghhBBCCKE/D+9VQQghhBBCCCGE0EiAGQdCCCGEEEIIIYRGApyPAyGEEEIIIYQQQiMBZhwIIYQQQgghhBAaCTDjQAghhBBCCCGE0EiAGQdCCCGEEEIIIYRGAsw4EEIIIYQQQgghNBJgxoEQQgghhBBCCKGRADMOhBBCCCGEEEIIjQSYcSCE0DvhwYMH3d3dnb/U1dV19+7dZ8+e/c5Onjx50traWlVV9ejRoz86ANgLjKGioqK5ufn373EYunPnTn19vUQiGZ5HcffRI+HN6xypGGv4lKSr80lPj6J/NRBCCKF3wvOMY3h+UUMIITRYBAJBcnJybGxs3ABJSUk0Gq3nd5993bt3j0qlBgUF3bp1648OAP6huX37dkpKSllZ2Sv+0YHBdHV13bhx4/ePagg8ffoUhgRHDSNvb28vLCzkcrnD85/Oxo72S6Qsg1A/rOFTAUzarYcPFP2

In [23]:
base64_to_image(all_pages_data[1]['images'][0]['imagedata'], './images/test.jpg')