# Image & Diagram Processing with Vision LLM

In [5]:
import base64
from io import BytesIO
from PIL import Image
import fitz
from typing import List, Dict
from IPython.display import display, Markdown

def extract_images_from_pdf(pdf_path: str) -> List[Dict]:
    """
    Extract images from PDF and prepare for vision LLM analysis.

    Every image referenced on every page is decoded into a PIL image and
    paired with the page text found in a horizontal band around the
    image's first placement on that page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One dict per image, in page order, with the keys ``page``
        (1-based page number), ``image_index`` (0-based position within
        the page's image list), ``image`` (PIL image), ``image_bytes``
        (raw extracted bytes), ``context`` (stripped surrounding text,
        ``""`` when no placement rectangle is available), ``format``,
        ``width`` and ``height`` (taken from the extracted image info).
    """
    images = []

    # The context manager closes the document even when extraction or
    # decoding raises part-way through, so the file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images()):
                xref = img[0]  # first entry of the image tuple is its xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]

                pil_image = Image.open(BytesIO(image_bytes))

                # Look the placement rectangles up once; only the first
                # placement on the page is used for the context window.
                rects = page.get_image_rects(xref)
                img_rect = rects[0] if rects else None

                context = ""
                if img_rect:
                    # Full-width band reaching 100 units above and below
                    # the image, clamped to the page's top and bottom.
                    context = page.get_text("text", clip=fitz.Rect(
                        0, max(0, img_rect.y0 - 100),
                        page.rect.width, min(page.rect.height, img_rect.y1 + 100)
                    ))

                images.append({
                    'page': page_num,
                    'image_index': img_index,
                    'image': pil_image,
                    'image_bytes': image_bytes,
                    'context': context.strip(),
                    'format': base_image['ext'],
                    'width': base_image['width'],
                    'height': base_image['height']
                })

    return images

def analyze_image_with_vision_llm(image_data: Dict) -> str:
    """
    Use GPT-4 Vision to describe images/diagrams for RAG.

    The image is re-encoded as PNG, embedded as a base64 data URL and sent
    together with a text prompt to the ``gpt-4o`` chat completions model.

    Args:
        image_data: Dict with at least the keys ``image`` (PIL image),
            ``page`` (page number quoted in the prompt) and ``context``
            (surrounding text; only its first 300 characters are sent).

    Returns:
        The model's textual description, or ``""`` when the response
        carries no text content.
    """
    from openai import OpenAI
    
    client = OpenAI()
    
    # PNG cannot store every PIL mode (e.g. CMYK, which is common for JPEGs
    # embedded in PDFs), and saving such an image raises. Convert those to
    # RGB first; convert() returns a new image, so the caller's is untouched.
    image = image_data['image']
    if image.mode not in ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"):
        image = image.convert("RGB")

    # Convert PIL image to base64
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_base64 = base64.b64encode(buffered.getvalue()).decode()
    
    # Create vision prompt with context
    prompt = f"""Analyze this image from a document (page {image_data['page']}).

Surrounding text context:
{image_data['context'][:300]}

Please provide:
1. A concise description of what the image shows
2. Key information or data points visible
3. How this image relates to the surrounding text
4. Any text visible in the image (if applicable)

Format your response as plain text suitable for RAG retrieval."""
    
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{img_base64}"
                        }
                    }
                ]
            }
        ],
        max_tokens=500
    )
    
    # The message content may be None; keep the declared str return type.
    return response.choices[0].message.content or ""

# Example usage for complete image processing pipeline
def process_pdf_with_images(pdf_path: str) -> List[Dict]:
    """
    Run the full image pipeline for one PDF.

    Images are extracted with their surrounding text, each one is described
    by the vision LLM, and the results are returned as a list of chunks.
    Every chunk is a dict with a ``text`` entry (page marker, context and
    image description) and a ``metadata`` entry (type, page, image index
    and a vision-analysis flag).
    """
    chunks = []

    for record in extract_images_from_pdf(pdf_path):
        # One vision-LLM call per extracted image.
        vision_summary = analyze_image_with_vision_llm(record)
        page_no = record['page']

        # Surrounding text and the generated description share one chunk.
        body = f"""
[Image on page {page_no}]

Context: {record['context']}

Image Description: {vision_summary}
"""

        metadata = {
            'type': 'image',
            'page': page_no,
            'image_index': record['image_index'],
            'has_vision_analysis': True
        }
        chunks.append({'text': body.strip(), 'metadata': metadata})

    return chunks

In [6]:
# Run the full extract-and-describe pipeline on the benchmark PDF; this makes
# one vision-LLM request per extracted image.
image_chunks = process_pdf_with_images("RAG_BENCHMARK.pdf")

In [7]:
# Render the text of the chunk at index 4 as Markdown in the cell output
# (assumes the PDF yielded at least five image chunks).
display(Markdown(image_chunks[4]["text"]))

[Image on page 6]

Context: ACME CORPORATION — INTERNAL USE ONLY
Certain dependencies introduced latency that could not be isolated to a single functional unit.
Operational throughput exhibited non-linear adjustments over the observation window, influenced by
regional scheduling constraints. Internal coordination benefited from informal escalation paths that were
not formally documented.

Image Description: 1. The image shows a step chart with several increases and horizontal segments.

2. Key information or data points:
   - Several discrete steps representing changes or increments.
   - Periods of stability indicated by horizontal lines.

3. The image likely illustrates non-linear adjustments in operational throughput as mentioned in the surrounding text. The steps could represent changes influenced by regional scheduling constraints or latency issues.

4. Visible text in the image:
   - "Figure 3" in the top left corner.