In [None]:
import fitz  # PyMuPDF
import io
import os
from PIL import Image

def _pick_largest_image(image_info_list):
    """Return the image-info dict with the largest on-page area, or None.

    Each entry must carry a 'bbox' of the form (x0, y0, x1, y1). Entries whose
    rendered area is zero or negative are never selected; on ties the first
    entry wins.
    """
    def rendered_area(info):
        x0, y0, x1, y1 = info['bbox']
        return (x1 - x0) * (y1 - y0)

    best = max(image_info_list, key=rendered_area, default=None)
    if best is None or rendered_area(best) <= 0:
        return None
    return best


def _closest_caption_below(img_bbox, text_blocks, max_distance):
    """Return the text of the nearest block under the image, or None.

    A block qualifies when its top edge lies strictly below the image's bottom
    edge by at most ``max_distance`` points, its horizontal extent overlaps
    the image's, and it contains non-whitespace text. Blocks are tuples laid
    out as (x0, y0, x1, y1, text, block_no, block_type).
    """
    img_x0, _, img_x1, img_bottom = img_bbox

    best_text = None
    best_distance = float('inf')

    for block in text_blocks:
        # When a block-type field is present, anything other than 0 is not a
        # text block and cannot be a caption.
        if len(block) > 6 and block[6] != 0:
            continue

        block_text = block[4].strip().replace("\n", " ")
        if not block_text:
            # An empty block must not shadow a real caption further down.
            continue

        block_x0, block_top, block_x1, _ = block[:4]
        distance = block_top - img_bottom
        overlaps_horizontally = max(img_x0, block_x0) < min(img_x1, block_x1)

        if overlaps_horizontally and 0 < distance <= max_distance and distance < best_distance:
            best_distance = distance
            best_text = block_text

    return best_text


def find_front_page_photo_and_caption(pdf_path, output_image_path="front_page_photo.png",
                                      max_caption_distance=50):
    """
    Extracts the largest image from the first page of a PDF and
    finds the closest text block underneath it, assuming it's the caption.

    Progress and errors are reported with print(); the function always
    returns None. The document is closed on every exit path, including when
    an unexpected exception propagates.

    Args:
        pdf_path (str): Path to the input PDF file.
        output_image_path (str): Path to save the extracted image.
        max_caption_distance (float): Largest vertical gap, in points
            (72 points = 1 inch), allowed between the bottom of the image
            and the top of the caption block.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error: Could not open file '{pdf_path}'. {e}")
        return

    try:
        # --- Part 1: Find and Extract the Largest Image ---

        if doc.page_count == 0:
            print("Error: PDF has no pages.")
            return

        # Load the first page (index 0)
        page = doc.load_page(0)

        # Get a list of all images on the page with their bounding boxes
        image_info_list = page.get_image_info(xrefs=True)
        if not image_info_list:
            print("No images found on the front page.")
            return

        # Pick the image that covers the most area as rendered on the page
        largest_image_info = _pick_largest_image(image_info_list)
        if largest_image_info is None:
            print("Could not determine largest image.")
            return

        img_bbox = largest_image_info['bbox']
        img_xref = largest_image_info['xref']

        # Extract the raw image bytes
        try:
            base_image = doc.extract_image(img_xref)
            image_bytes = base_image["image"]
        except Exception as e:
            print(f"Error extracting image (xref={img_xref}): {e}")
            return

        # Save the image using Pillow
        try:
            with Image.open(io.BytesIO(image_bytes)) as pil_image:
                # Convert CMYK images to RGB so they can be saved as PNG
                if pil_image.mode == "CMYK":
                    image_to_save = pil_image.convert("RGB")
                else:
                    image_to_save = pil_image
                image_to_save.save(output_image_path)
            print(f"✅ Successfully saved photo to: {output_image_path}")
        except Exception as e:
            print(f"Error saving image: {e}")
            return

        # --- Part 2: Find the Closest Caption ---

        # Get all text "blocks" (paragraphs) with their bounding boxes
        # Format: (x0, y0, x1, y1, "text...", block_no, block_type)
        text_blocks = page.get_text("blocks")

        closest_caption = _closest_caption_below(
            img_bbox, text_blocks, max_caption_distance
        )

        if closest_caption:
            print(f"✅ Found likely caption:\n---\n{closest_caption}\n---")
        else:
            print("⚠️ Could not find a likely caption text below the image.")
    finally:
        # Single cleanup point: runs on every return and on any exception.
        doc.close()

# --- Run the function ---
if __name__ == "__main__":
    # Extracted photos are collected in a local 'Photos' folder, which is
    # created up front so the save step has somewhere to write.
    os.makedirs('Photos', exist_ok=True)

    # --- Configuration ---
    # Source newspaper PDF; edit this path to point at your own file.
    PDF_FILE = r"C:\Users\dgara\Documents\Universidad\Florida Tech\Work\Bisk College of Business\WSJ - Dataset\Daily Newspapers\wallstreetjournal_20251105_TheWallStreetJournal.pdf" # <--- CHANGE THIS
    OUTPUT_IMAGE = os.path.join('Photos', 'front_page_photo.png')
    # ---------------------

    if os.path.exists(PDF_FILE):
        find_front_page_photo_and_caption(PDF_FILE, OUTPUT_IMAGE)
    else:
        # Fail with a hint instead of handing a bad path to the extractor.
        print(f"Error: File not found at '{PDF_FILE}'.")
        print("Please update the 'PDF_FILE' variable in the script.")

✅ Successfully saved photo to: Photos\front_page_photo.png
✅ Found likely caption:
---
Tuesday was a big night for Democrats as, from left, Abigail Spanberger was elected Virginia’s first female governor, Zohran Mamdani rallied young voters to become New York City’s next mayor, and Mikie Sherrill won the New Jersey governor’s race.
---
