In [10]:
!pip install PyMuPDF pdfplumber pandas sentence_transformers chromadb google-generativeai

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting chromadb
  Downloading chromadb-1.0.7-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chrom

In [11]:
!pip list | grep -E "typing-extensions|typing-inspection|pydantic|chromadb|google-generativeai"

chromadb                                 1.0.7
google-generativeai                      0.8.4
pydantic                                 2.11.3
pydantic_core                            2.33.1
typing-inspection                        0.4.0


In [12]:
import fitz
import pdfplumber
import os
from PIL import Image
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import CLIPProcessor, CLIPModel
import google.generativeai as genai
import re
from collections import Counter

2025-05-03 03:27:13.951967: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746242834.135456      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746242834.197143      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [13]:
def extract_content(pdf_path, output_folder="extracted_content"):
    """
    Extract content sequentially (tables then figures on each page) with consistent numbering
    With improved cropping for specific figures and tables
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    doc = fitz.open(pdf_path)
    pdf_pages = pdfplumber.open(pdf_path)
    extracted_data = {"figures": {}, "tables": {}}
    
    # Configuration
    DPI = 300
    content_counter = 1
    
    print(f"Processing PDF with {len(doc)} pages...")

    # Expected content locations
    content_locations = [
        ("Table_1_1", 1, "table"),
        ("Figure_1-1", 2, "figure"),
        ("Figure_1-2", 3, "figure"),
        ("Figure_1-3", 4, "figure"),
        ("Table_5_1", 5, "table"),
        ("Figure_1-4", 5, "figure"),
        ("Table_7_1", 7, "table"),
        ("Figure_1-5", 8, "figure"),
        ("Figure_1-6", 10, "figure"),
        ("Table_11_1", 11, "table"),
        ("Figure_2-1", 20, "figure"),
        ("Figure_2-2", 21, "figure"),
        ("Figure_2-3", 24, "figure"),
        ("Figure_2-4", 27, "figure"),
        ("Figure_2-5", 29, "figure"),
        ("Figure_2-6", 30, "figure"),
        ("Figure_2-7", 32, "figure")
    ]

    caption_data = extract_captions(doc, content_locations)

    for content_id, page_num, content_type in content_locations:
        try:
            print(f"\nProcessing {content_type} {content_id} on page {page_num}...")
            output_path = f"{output_folder}/{content_counter}.png"
            
            if content_type == "table":
                plumber_page = pdf_pages.pages[page_num - 1]
                tables = plumber_page.find_tables()
                
                if tables:
                    largest_table = max(tables, key=lambda t: (t.bbox[2] - t.bbox[0]) * (t.bbox[3] - t.bbox[1]))
                    margin_left_right = 30
                    margin_top = 30
                    margin_bottom = 5
                    width, height = plumber_page.width, plumber_page.height
                    bbox = (
                        max(0, largest_table.bbox[0] - margin_left_right),
                        max(0, largest_table.bbox[1] - margin_top),
                        min(width, largest_table.bbox[2] + margin_left_right),
                        min(height, largest_table.bbox[3] + margin_bottom)
                    )
                    if bbox[0] < bbox[2] and bbox[1] < bbox[3]:
                        table_img = plumber_page.crop(bbox).to_image(resolution=DPI)
                        table_img.save(output_path)
                        extracted_data["tables"][content_id] = {
                            "page": page_num,
                            "path": output_path,
                            "type": "table",
                            "number": content_counter,
                            "caption": caption_data.get(content_id, ""),
                            "original_id": content_id
                        }
                    else:
                        print(f"Invalid bounding box for {content_id}, falling back to full page")
                        page = doc.load_page(page_num - 1)
                        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                        img.save(output_path)
                        extracted_data["tables"][content_id] = {
                            "page": page_num,
                            "path": output_path,
                            "type": "table",
                            "number": content_counter,
                            "caption": caption_data.get(content_id, ""),
                            "original_id": content_id
                        }
                else:
                    page = doc.load_page(page_num - 1)
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    img.save(output_path)
                    extracted_data["tables"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "table",
                        "number": content_counter,
                        "caption": caption_data.get(content_id, ""),
                        "original_id": content_id
                    }
                    
            else:
                page = doc.load_page(page_num - 1)
                width, height = page.rect.width, page.rect.height
                fig_id = content_id.replace("Figure_", "")
                
                if fig_id in ["1-1", "1-2", "1-3"]:
                    blocks = page.get_text("blocks")
                    end_y = height * 0.85
                    for block in blocks:
                        block_text = block[4].lower()
                        if "figure" in block_text and fig_id.lower() in block_text:
                            end_y = block[1] - 10
                            break
                    top_margin = 20
                    side_margin = 40
                    clip_rect = fitz.Rect(side_margin, top_margin, width - side_margin, end_y)
                    crop_type = f"Full figure from top for {fig_id}"
                    
                elif fig_id in ["2-5", "2-6", "2-7"]:
                    clip_rect = fitz.Rect(0, 0, width, height * 0.4)
                    crop_type = "top 40%"
                elif fig_id == "2-1":
                    clip_rect = fitz.Rect(0, height * 0.55, width, height)
                    crop_type = "center to bottom"
                elif fig_id == "2-2":
                    clip_rect = fitz.Rect(0, height * 0.52, width, height)
                    crop_type = "center to bottom"
                elif fig_id == "2-3":
                    clip_rect = fitz.Rect(0, 0, width, height * 0.45)
                    crop_type = "top 45%"
                elif fig_id == "2-4":
                    clip_rect = fitz.Rect(0, 0, width, height * 0.3)
                    crop_type = "top 30%"
                elif fig_id == "1-6":
                    clip_rect = fitz.Rect(0, height * 0.57, width, height * 0.95)
                    crop_type = "57% to 95%"
                elif fig_id == "1-4":
                    clip_rect = fitz.Rect(0, height * 0.5, width, height * 0.95)
                    crop_type = "50% to 95%"
                elif fig_id == "1-5":
                    margin = 20
                    clip_rect = fitz.Rect(margin, margin, width - margin, height - margin)
                    crop_type = "full page with margins"
                else:
                    blocks = page.get_text("blocks")
                    caption_block = None
                    for block in blocks:
                        block_text = block[4]
                        if f"Figure {fig_id}" in block_text or f"Figure {fig_id.replace('-', '.')}" in block_text:
                            caption_block = block
                            break
                    if caption_block:
                        top_margin = 50
                        clip_rect = fitz.Rect(40, top_margin, width - 40, caption_block[1] - 10)
                        crop_type = "above caption"
                    else:
                        margin = 20
                        clip_rect = fitz.Rect(margin, margin, width - margin, height - margin)
                        crop_type = "full page fallback"
                
                pix = page.get_pixmap(matrix=fitz.Matrix(2.5, 2.5), clip=clip_rect)
                fig_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                fig_img.save(output_path)
                
                extracted_data["figures"][content_id] = {
                    "page": page_num,
                    "path": output_path,
                    "type": "figure",
                    "number": content_counter,
                    "crop_type": crop_type,
                    "caption": caption_data.get(content_id, ""),
                    "original_id": content_id
                }
                
            print(f"  Extracted as {content_counter} ({crop_type if content_type=='figure' else 'table'})")
            content_counter += 1
                
        except Exception as e:
            print(f"  Error extracting {content_id}: {str(e)}")
            try:
                page = doc.load_page(page_num - 1)
                pix = page.get_pixmap(matrix=fitz.Matrix(2.5, 2.5))
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                output_path = f"{output_folder}/{content_counter}.png"
                img.save(output_path)
                
                if content_type == "table":
                    extracted_data["tables"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "table",
                        "number": content_counter,
                        "caption": caption_data.get(content_id, ""),
                        "original_id": content_id
                    }
                else:
                    extracted_data["figures"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "figure",
                        "number": content_counter,
                        "caption": caption_data.get(content_id, ""),
                        "original_id": content_id
                    }
                print(f"  Extracted full page as emergency fallback ({content_counter})")
                content_counter += 1
            except Exception as e2:
                print(f"  Complete failure extracting {content_id}: {str(e2)}")

    reference_mapping = {}
    for content_type in ["figures", "tables"]:
        for content_id, data in extracted_data[content_type].items():
            reference_mapping[content_id] = data["number"]
    
    pdf_pages.close()
    doc.close()
    return extracted_data, reference_mapping

In [14]:
def extract_captions(doc, content_locations):
    """Extract captions for each figure and table"""
    caption_data = {}
    for content_id, page_num, content_type in content_locations:
        try:
            page = doc.load_page(page_num - 1)
            text_blocks = page.get_text("blocks")
            
            if content_type == "figure":
                fig_id = content_id.replace("Figure_", "")
                search_terms = [
                    f"Figure {fig_id}",
                    f"Figure {fig_id.replace('-', '.')}",
                    f"Fig. {fig_id}",
                    f"Fig. {fig_id.replace('-', '.')}"
                ]
            else:
                table_id = content_id.replace("Table_", "")
                search_terms = [
                    f"Table {table_id}",
                    f"Table {table_id.replace('_', '.')}"
                ]
                
            for block in text_blocks:
                block_text = block[4]
                if any(term in block_text for term in search_terms):
                    caption_data[content_id] = block_text.strip()
                    break
                    
        except Exception as e:
            print(f"Error extracting caption for {content_id}: {str(e)}")
            
    return caption_data

In [15]:
def extract_and_chunk_text(pdf_path, chunk_size=100, overlap=20):
    """Extract and chunk text from PDF with page context"""
    doc = fitz.open(pdf_path)
    all_chunks = []
    
    for i, page in enumerate(doc):
        page_num = i + 1
        page_text = page.get_text()
        
        paragraphs = re.split(r'\n\s*\n', page_text)
        
        for para in paragraphs:
            para = para.strip()
            if len(para.split()) > chunk_size:
                chunks = sliding_window_chunks(para, chunk_size, overlap)
                for chunk in chunks:
                    all_chunks.append({
                        "page": page_num,
                        "text": chunk,
                        "type": "text"
                    })
            elif len(para.split()) > 5:
                all_chunks.append({
                    "page": page_num,
                    "text": para,
                    "type": "text"
                })
    
    doc.close()
    return all_chunks

In [16]:
def sliding_window_chunks(text, chunk_size=100, overlap=20):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        if len(chunk.split()) > 5:
            chunks.append(chunk)
    return chunks

In [17]:
def setup_text_and_image_db(pdf_path, extracted_data, reference_mapping):
    regular_chunks = extract_and_chunk_text(pdf_path)
    
    caption_chunks = []
    content_to_image_mapping = {}
    
    for content_type in ["figures", "tables"]:
        for content_id, data in extracted_data[content_type].items():
            if "caption" in data and data["caption"]:
                content_to_image_mapping[content_id] = data["number"]
                caption_chunks.append({
                    "page": data["page"],
                    "text": data["caption"],
                    "type": "caption",
                    "image_number": data["number"],
                    "content_type": content_type[:-1],
                    "original_id": data["original_id"]
                })
    
    all_chunks = regular_chunks + caption_chunks
    chunk_df = pd.DataFrame(all_chunks)
    
    semantic_mapping = {}
    for content_id, image_num in content_to_image_mapping.items():
        if "Figure" in content_id:
            fig_id = content_id.replace("Figure_", "")
            patterns = [
                f"figure {fig_id}",
                f"figure {fig_id.replace('-', '.')}",
                f"fig. {fig_id}",
                f"fig. {fig_id.replace('-', '.')}"
            ]
            for pattern in patterns:
                semantic_mapping[pattern] = image_num
        else:
            table_id = content_id.replace("Table_", "")
            patterns = [
                f"table {table_id}",
                f"table {table_id.replace('_', '.')}"
            ]
            for pattern in patterns:
                semantic_mapping[pattern] = image_num
    
    text_model = SentenceTransformer('intfloat/e5-large')
    chunk_df['embedding'] = chunk_df['text'].apply(lambda x: text_model.encode(x, normalize_embeddings=True))

    chroma_client = chromadb.PersistentClient(path="./chroma_db_13")
    text_collection = chroma_client.get_or_create_collection(name="text_embeddings", metadata={"hnsw:space": "cosine"})
    
    text_ids = [f"text_{i}" for i in range(len(chunk_df))]
    text_embeddings = [emb.tolist() for emb in chunk_df['embedding']]
    
    text_metadatas = []
    for _, row in chunk_df.iterrows():
        metadata = {
            "page": int(row['page']),
            "text": row['text'],
            "type": row['type'],
            "image_number": int(row.get('image_number', 0)) if row['type'] == "caption" else 0,
            "content_type": row.get('content_type', ''),
            "original_id": row.get('original_id', '')
        }
        text_metadatas.append(metadata)
    
    text_collection.upsert(ids=text_ids, embeddings=text_embeddings, metadatas=text_metadatas)

    clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    
    image_folder = "extracted_content"
    image_paths = sorted([os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith(('png', 'jpg', 'jpeg'))])
    
    image_embeddings = []
    image_names = []
    image_metadatas = []
    
    for path in image_paths:
        try:
            image = Image.open(path).convert("RGB")
            inputs = clip_processor(images=image, return_tensors="pt")
            outputs = clip_model.get_image_features(**inputs)
            
            image_embeddings.append(outputs[0].detach().numpy())
            image_name = os.path.basename(path).split('.')[0]
            image_names.append(image_name)
            
            matching_content_id = None
            content_type_str = ""
            for content_type in ["figures", "tables"]:
                for content_id, data in extracted_data[content_type].items():
                    if str(data.get("number")) == image_name:
                        matching_content_id = content_id
                        content_type_str = "figure" if "Figure" in str(matching_content_id) else "table"
                        break
                if matching_content_id:
                    break
            
            metadata = {
                "image_number": image_name,
                "path": path,
                "content_id": matching_content_id,
                "content_type": content_type_str,
                "caption": extracted_data.get("figures" if "Figure" in str(matching_content_id) else "tables", {}).get(matching_content_id, {}).get("caption", ""),
                "original_id": matching_content_id
            }
            image_metadatas.append(metadata)
            
        except Exception as e:
            print(f"Error processing image {path}: {str(e)}")
    
    if image_embeddings:
        image_embeddings = np.vstack(image_embeddings)
        image_collection = chroma_client.get_or_create_collection(name="image_embeddings", metadata={"hnsw:space": "cosine"})
        image_ids = [f"image_{i}" for i in range(len(image_paths))]
        image_collection.upsert(ids=image_ids, embeddings=image_embeddings.tolist(), metadatas=image_metadatas)
    else:
        image_collection = None
    
    return text_model, clip_model, clip_processor, text_collection, image_collection, semantic_mapping, reference_mapping

In [18]:
def retrieve_relevant_content(question, text_emb, clip_emb, text_collection, image_collection, semantic_mapping, reference_mapping, k=18):
    """Optimized content retrieval with balanced image selection"""
    
    text_results = text_collection.query(
        query_embeddings=[text_emb.tolist()],
        n_results=k,
        include=["metadatas", "distances"]
    )
    
    retrieved_items = list(zip(text_results['metadatas'][0], text_results['distances'][0]))
    
    # Weight context chunks by similarity
    weighted_items = []
    for item, distance in retrieved_items:
        if item['type'] == "text":
            weight = 1.0 / (distance + 1e-6)  # Inverse distance as weight
            weighted_items.append((item, weight))
    
    # Sort by weight and select top chunks
    weighted_items.sort(key=lambda x: x[1], reverse=True)
    top_text_items = weighted_items[:10]  # Limit to top 10 for context
    combined_text = " ".join([item[0]['text'] for item in top_text_items])
    
    # Caption-based matching
    caption_items = [item for item in retrieved_items if item[0]['type'] == "caption" and item[1] < 0.3]
    
    if caption_items:
        sorted_captions = sorted(caption_items, key=lambda x: x[1])
        best_caption = sorted_captions[0]
        if len(sorted_captions) == 1 or best_caption[1] < sorted_captions[1][1] * 0.65:
            selected_image = best_caption[0]['image_number']
            original_id = best_caption[0].get('original_id', '')
            print(f"Selected Image {selected_image} for caption match, distance: {best_caption[1]:.3f}, caption: {best_caption[0]['text'][:50]}...")
            return combined_text, int(selected_image), "caption_match", original_id
    
    # Explicit figure/table references
    question_lower = question.lower()
    combined_lower = combined_text.lower()
    
    for pattern, image_num in semantic_mapping.items():
        if pattern in question_lower:
            original_id = next((content_id for content_id, num in reference_mapping.items() if num == image_num), "")
            print(f"Selected Image {image_num} for question mention: {pattern}")
            return combined_text, int(image_num), "question_mention", original_id
    
    mentioned_images = []
    for pattern, image_num in semantic_mapping.items():
        if pattern in combined_lower:
            original_id = next((content_id for content_id, num in reference_mapping.items() if num == image_num), "")
            mentioned_images.append((pattern, image_num, original_id))
    
    if mentioned_images:
        pattern_counts = Counter([pattern for pattern, _, _ in mentioned_images])
        most_common_pattern = pattern_counts.most_common(1)[0][0]
        for pattern, image_num, original_id in mentioned_images:
            if pattern == most_common_pattern:
                print(f"Selected Image {image_num} for context mention: {pattern}")
                return combined_text, int(image_num), "context_mention", original_id
    
    # CLIP-based matching
    if image_collection:
        visual_cues = [
            "growth rate", "unemployment rate", "recession", "economic trend",
            "crisis impact", "gdp change", "price level", "inflation rate",
            "comparison", "decline", "increase", "chart", "graph", "data",
            "show", "visual", "illustrate"
        ]
        
        if any(cue in question_lower for cue in visual_cues):
            image_results = image_collection.query(
                query_embeddings=[clip_emb.tolist()],
                n_results=10,
                include=["metadatas", "distances"]
            )
            
            distances = image_results['distances'][0]
            if distances[0] < 0.4 and (len(distances) == 1 or distances[0] < distances[1] * 0.7):
                selected_image = int(image_results['metadatas'][0][0]['image_number'])
                original_id = image_results['metadatas'][0][0].get('original_id', '')
                if str(selected_image) in combined_lower or any(cue in combined_lower for cue in visual_cues):
                    top_images = [(m['image_number'], d) for m, d in zip(image_results['metadatas'][0][:3], distances[:3])]
                    print(f"Selected Image {selected_image} for CLIP match, distance: {distances[0]:.3f}, original_id: {original_id}, top-3: {top_images}")
                    return combined_text, selected_image, "clip_match", original_id
    
    print("Selected Image 0: no match")
    return combined_text, 0, "no_match", ""

In [19]:
def generate_answer(question, context, image, match_reason="", original_id="", extracted_data=None):
    """Generate a concise answer using Gemini API, based on context and image caption."""
    # Prepare image caption from extracted_data if image is selected
    image_context = ""
    if image > 0 and extracted_data:
        for content_type in ["figures", "tables"]:
            for content_id, data in extracted_data[content_type].items():
                if str(data.get("number")) == str(image):
                    image_context = f"Relevant image caption: {data.get('caption', '')}"
                    break
            if image_context:
                break

    # Prompt inspired by DA266, adapted for economics and Gemini
    prompt = f"""
You are an economics expert answering questions about a document.
Based on the provided context, provide a concise answer in complete sentences (max 65 words).
Do not reference any figures, tables, or visuals in the answer, even if relevant.
If you don't have enough information, say "I don't have enough information."

Question: {question}

Context:
{context}

{image_context}
"""

    try:
        # Configure Gemini model
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content(
            prompt,
            generation_config={
                "max_output_tokens": 100,
                "temperature": 0.3
            }
        )
        answer = response.text.strip()
        # Clean up answer
        answer = re.sub(r'\s+', ' ', answer)
        answer = re.sub(r'\(see [Ff]igure \d+\)', '', answer)
        answer = re.sub(r'\(see [Tt]able \d+\)', '', answer)
        return answer[:65]
    except Exception as e:
        print(f"Error generating answer with Gemini: {e}")
        return "I don't have enough information."

In [20]:
def process_question(question, question_id, text_model, clip_model, clip_processor, text_collection, image_collection, semantic_mapping, reference_mapping, extracted_data, k=18):
    """Process question and generate answer with optimized image selection"""
    
    question_text_emb = text_model.encode(question, normalize_embeddings=True)
    
    inputs = clip_processor(text=[question[:480]], return_tensors="pt", padding=True, truncation=True, max_length=77)
    question_clip_emb = clip_model.get_text_features(**inputs)[0].detach().numpy()
    
    context, image, match_reason, original_id = retrieve_relevant_content(
        question, question_text_emb, question_clip_emb, text_collection,
        image_collection, semantic_mapping, reference_mapping, k
    )
    
    answer = generate_answer(question, context, image, match_reason, original_id, extracted_data)
    
    print(f"Q{question_id} | Image {image} | Reason: {match_reason} | Answer: {answer[:50]}...")
    
    return {'ID': question_id, 'Text': answer, 'Image': image}

In [21]:
def main():
    pdf_path = "/kaggle/input/lab2-p1/document.pdf"
    questions_path = "/kaggle/input/lab2-p1/Lab_2_Part_1_Questions.csv"

    # Step 1: Extract visual and textual content from PDF
    extracted_data, reference_mapping = extract_content(pdf_path)

    # Step 2: Build embedding DBs
    text_model, clip_model, clip_processor, text_collection, image_collection, semantic_mapping, reference_mapping = setup_text_and_image_db(
        pdf_path, extracted_data, reference_mapping
    )

    # Step 3: Configure Gemini with provided key
    genai.configure(api_key="AIzaSyCiRPegQJIRStEUUTYAPtg6Mhaibnw4EGg")

    # Step 4: Load questions and generate answers
    questions_df = pd.read_csv(questions_path)
    results = [
        process_question(
            row['Question'], row['ID'], text_model, clip_model, clip_processor,
            text_collection, image_collection, semantic_mapping, reference_mapping, extracted_data
        )
        for _, row in questions_df.iterrows()
    ]

    # Step 5: Save submission
    final_submission = pd.DataFrame(results)[['ID', 'Text', 'Image']]
    final_submission.to_csv("optimized_submission.csv", index=False)

    print(f"✅ Processed {len(final_submission)} questions")
    print(f"📊 With images: {sum(final_submission['Image'] > 0)}, Without: {sum(final_submission['Image'] == 0)}")
    print("🖼️ Image distribution:")
    print(final_submission['Image'].value_counts().sort_index())

if __name__ == "__main__":
    main()

Processing PDF with 37 pages...

Processing table Table_1_1 on page 1...
  Extracted as 1 (table)

Processing figure Figure_1-1 on page 2...
  Extracted as 2 (Full figure from top for 1-1)

Processing figure Figure_1-2 on page 3...
  Extracted as 3 (Full figure from top for 1-2)

Processing figure Figure_1-3 on page 4...
  Extracted as 4 (Full figure from top for 1-3)

Processing table Table_5_1 on page 5...
  Extracted as 5 (table)

Processing figure Figure_1-4 on page 5...
  Extracted as 6 (50% to 95%)

Processing table Table_7_1 on page 7...
  Extracted as 7 (table)

Processing figure Figure_1-5 on page 8...
  Extracted as 8 (full page with margins)

Processing figure Figure_1-6 on page 10...
  Extracted as 9 (57% to 95%)

Processing table Table_11_1 on page 11...
  Extracted as 10 (table)

Processing figure Figure_2-1 on page 20...
  Extracted as 11 (center to bottom)

Processing figure Figure_2-2 on page 21...
  Extracted as 12 (center to bottom)

Processing figure Figure_2-3 on p

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 2 for context mention: figure 1-1
Q1 | Image 2 | Reason: context_mention | Answer: The 2008 global economic crisis stemmed from a col...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 13 for caption match, distance: 0.140, caption: unemployment rate would equal zero. This would mak...
Q2 | Image 13 | Reason: caption_match | Answer: Rising unemployment is concerning because it signi...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 12 for caption match, distance: 0.159, caption: We have focused so far on the level of real GDP. T...
Q3 | Image 12 | Reason: caption_match | Answer: Economists use real GDP, which adjusts for price c...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 2 for context mention: figure 1-1
Q4 | Image 2 | Reason: context_mention | Answer: The 2009 recession severely impacted the global ec...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 13 for context mention: figure 2-3
Q5 | Image 13 | Reason: context_mention | Answer: Following the 2008 crisis, U.S. unemployment rose ...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 9 for context mention: figure 1-6
Q6 | Image 9 | Reason: context_mention | Answer: Before the crisis, China's economy grew at approxi...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 2 for context mention: figure 1-1
Q7 | Image 2 | Reason: context_mention | Answer: The 2008 crisis caused stock market declines globa...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 15 for context mention: figure 2-5
Q8 | Image 15 | Reason: context_mention | Answer: No, fast economic growth in the U.S. doesn't alway...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 14 for caption match, distance: 0.155, caption: represent the consumption basket of a typical urba...
Q9 | Image 14 | Reason: caption_match | Answer: No, consumer prices (CPI) and overall economic pri...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 0: no match
Q10 | Image 0 | Reason: no_match | Answer: Even before the 2007 crisis, Europe had high unemp...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Selected Image 3 for caption match, distance: 0.192, caption: growth in both advanced countries and in emerging ...
Q11 | Image 3 | Reason: caption_match | Answer: China countered the 2008 crisis's negative demand ...
✅ Processed 11 questions
📊 With images: 10, Without: 1
🖼️ Image distribution:
Image
0     1
2     3
3     1
9     1
12    1
13    2
14    1
15    1
Name: count, dtype: int64
