In [1]:
!pip install PyMuPDF pdfplumber pandas sentence-transformers faiss-cpu pillow transformers torch torchvision google-generativeai

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cubl

In [2]:
import fitz  # PyMuPDF
import pdfplumber
import os
from PIL import Image
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import CLIPProcessor, CLIPModel
import google.generativeai as genai
import time
import re

2025-04-30 22:15:25.963171: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746051326.429399      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746051326.548706      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
GEMINI_API_KEY = "AIzaSyCGXFlbdqFFAUDwX7mzK7s8K0atPAxtJlM"  # Provided Gemini API key
genai.configure(api_key=GEMINI_API_KEY)
TEAM_NAME = "Pair_Programming_Team_28"

In [4]:

# Cell 4: Define image extraction function
def extract_content(pdf_path, output_folder="/kaggle/working/extracted_content"):
    """
    Extract content sequentially (tables then figures on each page) with output numbered 1 to 17
    With improved cropping for specific figures and tables
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    doc = fitz.open(pdf_path)
    pdf_pages = pdfplumber.open(pdf_path)
    extracted_data = {"figures": {}, "tables": {}}
    
    DPI = 200  # Reduced for Kaggle memory constraints
    content_counter = 1
    
    print(f"Processing PDF with {len(doc)} pages...")

    content_locations = [
        ("Table_1_1", 1, "table"),
        ("Figure_1-1", 2, "figure"),
        ("Figure_1-2", 3, "figure"),
        ("Figure_1-3", 4, "figure"),
        ("Table_5_1", 5, "table"),
        ("Figure_1-4", 5, "figure"),
        ("Table_7_1", 7, "table"),
        ("Figure_1-5", 8, "figure"),
        ("Figure_1-6", 10, "figure"),
        ("Table_11_1", 11, "table"),
        ("Figure_2-1", 20, "figure"),
        ("Figure_2-2", 21, "figure"),
        ("Figure_2-3", 24, "figure"),
        ("Figure_2-4", 27, "figure"),
        ("Figure_2-5", 29, "figure"),
        ("Figure_2-6", 30, "figure"),
        ("Figure_2-7", 32, "figure")
    ]

    for content_id, page_num, content_type in content_locations:
        try:
            print(f"\nProcessing {content_type} {content_id} on page {page_num}...")
            output_path = f"{output_folder}/{content_counter}.png"
            
            if content_type == "table":
                plumber_page = pdf_pages.pages[page_num - 1]
                tables = plumber_page.find_tables()
                
                if tables:
                    largest_table = max(tables, key=lambda t: (t.bbox[2] - t.bbox[0]) * (t.bbox[3] - t.bbox[1]))
                    margin_left_right = 30
                    margin_top = 30
                    margin_bottom = 5
                    width, height = doc[page_num - 1].rect.width, doc[page_num - 1].rect.height
                    bbox = (
                        max(0, largest_table.bbox[0] - margin_left_right),
                        max(0, largest_table.bbox[1] - margin_top),
                        min(width, largest_table.bbox[2] + margin_left_right),
                        min(height, largest_table.bbox[3] + margin_bottom)
                    )
                    table_img = plumber_page.crop(bbox).to_image(resolution=DPI)
                    table_img.save(output_path)
                    extracted_data["tables"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "table",
                        "number": content_counter
                    }
                else:
                    page = doc.load_page(page_num - 1)
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    img.save(output_path)
                    extracted_data["tables"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "table",
                        "number": content_counter
                    }
                    
            else:
                page = doc.load_page(page_num - 1)
                width, height = page.rect.width, page.rect.height
                fig_id = content_id.replace("Figure_", "")
                
                if fig_id in ["1-1", "1-2", "1-3"]:
                    blocks = page.get_text("blocks")
                    end_y = height * 0.85
                    for block in blocks:
                        block_text = block[4].lower()
                        if "figure" in block_text and fig_id.lower() in block_text:
                            end_y = block[1] - 10
                            break
                    top_margin = 20
                    side_margin = 40
                    clip_rect = fitz.Rect(side_margin, top_margin, width - side_margin, end_y)
                    crop_type = f"Full figure from top for {fig_id}"
                elif fig_id in ["2-5", "2-6", "2-7"]:
                    clip_rect = fitz.Rect(0, 0, width, height * 0.4)
                    crop_type = "top 40%"
                elif fig_id == "2-1":
                    clip_rect = fitz.Rect(0, height * 0.55, width, height)
                    crop_type = "center to bottom"
                elif fig_id == "2-2":
                    clip_rect = fitz.Rect(0, height * 0.52, width, height)
                    crop_type = "center to bottom"
                elif fig_id == "2-3":
                    clip_rect = fitz.Rect(0, 0, width, height * 0.45)
                    crop_type = "top 45%"
                elif fig_id == "2-4":
                    clip_rect = fitz.Rect(0, 0, width, height * 0.3)
                    crop_type = "top 30%"
                elif fig_id == "1-6":
                    clip_rect = fitz.Rect(0, height * 0.57, width, height * 0.95)
                    crop_type = "57% to 95%"
                elif fig_id == "1-4":
                    clip_rect = fitz.Rect(0, height * 0.5, width, height * 0.95)
                    crop_type = "50% to 95%"
                elif fig_id == "1-5":
                    margin = 20
                    clip_rect = fitz.Rect(margin, margin, width - margin, height - margin)
                    crop_type = "full page with margins"
                else:
                    blocks = page.get_text("blocks")
                    caption_block = None
                    for block in blocks:
                        block_text = block[4]
                        if f"Figure {fig_id}" in block_text or f"Figure {fig_id.replace('-', '.')}" in block_text:
                            caption_block = block
                            break
                    if caption_block:
                        top_margin = 50
                        clip_rect = fitz.Rect(40, top_margin, width - 40, caption_block[1] - 10)
                        crop_type = "above caption"
                    else:
                        margin = 20
                        clip_rect = fitz.Rect(margin, margin, width - margin, height - margin)
                        crop_type = "full page fallback"
                
                pix = page.get_pixmap(matrix=fitz.Matrix(2.5, 2.5), clip=clip_rect)
                fig_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                fig_img.save(output_path)
                extracted_data["figures"][content_id] = {
                    "page": page_num,
                    "path": output_path,
                    "type": "figure",
                    "number": content_counter,
                    "crop_type": crop_type
                }
                
            print(f"  Extracted as {content_counter} ({crop_type if content_type=='figure' else 'table'})")
            content_counter += 1
                
        except Exception as e:
            print(f"  Error extracting {content_id}: {str(e)}")
            try:
                page = doc.load_page(page_num - 1)
                pix = page.get_pixmap(matrix=fitz.Matrix(2.5, 2.5))
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                output_path = f"{output_folder}/{content_counter}.png"
                img.save(output_path)
                if content_type == "table":
                    extracted_data["tables"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "table",
                        "number": content_counter
                    }
                else:
                    extracted_data["figures"][content_id] = {
                        "page": page_num,
                        "path": output_path,
                        "type": "figure",
                        "number": content_counter
                    }
                print(f"  Extracted full page as emergency fallback ({content_counter})")
                content_counter += 1
            except Exception as e2:
                print(f"  Complete failure extracting {content_id}: {str(e2)}")

    pdf_pages.close()
    doc.close()
    return extracted_data


In [5]:
# Cell 5: Execute image extraction
pdf_path = "/kaggle/input/p1-lab2/document.pdf"  # Update with your dataset path
extracted_data = extract_content(pdf_path, output_folder="/kaggle/working/extracted_content")
print("\n=== Extraction Results ===")
print(f"Extracted {len(extracted_data['tables'])} tables and {len(extracted_data['figures'])} figures (total {len(extracted_data['tables']) + len(extracted_data['figures'])} items)")

Processing PDF with 37 pages...

Processing table Table_1_1 on page 1...
  Extracted as 1 (table)

Processing figure Figure_1-1 on page 2...
  Extracted as 2 (Full figure from top for 1-1)

Processing figure Figure_1-2 on page 3...
  Extracted as 3 (Full figure from top for 1-2)

Processing figure Figure_1-3 on page 4...
  Extracted as 4 (Full figure from top for 1-3)

Processing table Table_5_1 on page 5...
  Extracted as 5 (table)

Processing figure Figure_1-4 on page 5...
  Extracted as 6 (50% to 95%)

Processing table Table_7_1 on page 7...
  Extracted as 7 (table)

Processing figure Figure_1-5 on page 8...
  Extracted as 8 (full page with margins)

Processing figure Figure_1-6 on page 10...
  Extracted as 9 (57% to 95%)

Processing table Table_11_1 on page 11...
  Extracted as 10 (table)

Processing figure Figure_2-1 on page 20...
  Extracted as 11 (center to bottom)

Processing figure Figure_2-2 on page 21...
  Extracted as 12 (center to bottom)

Processing figure Figure_2-3 on p

In [6]:
# Cell 6: Text chunking and embedding
def sliding_window_chunks(text, chunk_size=200, overlap=40):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size - overlap)]

doc = fitz.open(pdf_path)
all_chunks = []
for i, page in enumerate(doc):
    page_text = page.get_text()
    chunks = sliding_window_chunks(page_text, chunk_size=200, overlap=40)
    for chunk in chunks:
        all_chunks.append({"page": i + 1, "text": chunk})
chunk_df = pd.DataFrame(all_chunks)
doc.close()

text_model = SentenceTransformer('intfloat/e5-large')
chunk_df['embedding'] = chunk_df['text'].apply(lambda x: text_model.encode(x, normalize_embeddings=True))
print(f"Generated embeddings for {len(chunk_df)} text chunks")

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Generated embeddings for 156 text chunks


In [7]:
# Cell 7: Store text embeddings in FAISS
dimension = chunk_df['embedding'].iloc[0].shape[0]
text_index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity
text_embeddings = np.vstack(chunk_df['embedding'].values).astype('float32')
# Normalize text embeddings
text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True)
text_index.add(text_embeddings)
text_metadatas = [{"page": int(row['page']), "text": row['text']} for _, row in chunk_df.iterrows()]
print(f"Stored {text_index.ntotal} text embeddings in FAISS")

Stored 156 text embeddings in FAISS


In [13]:
# Cell 8: Generate and store image embeddings
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
image_folder = "/kaggle/working/extracted_content"
image_paths = sorted([os.path.join(image_folder, f) for f in os.listdir(image_folder) if f.endswith(('png', 'jpg', 'jpeg'))])
print(f"DEBUG: Found {len(image_paths)} images in {image_folder}: {image_paths}")
image_embeddings = []
image_names = []
for path in image_paths:
    try:
        image = Image.open(path).convert("RGB")
        inputs = clip_processor(images=image, return_tensors="pt")
        outputs = clip_model.get_image_features(**inputs)
        emb = outputs[0].detach().numpy()
        emb = emb / np.linalg.norm(emb)  # Normalize
        image_embeddings.append(emb)
        image_names.append(os.path.basename(path).split('.')[0])
    except Exception as e:
        print(f"Error processing image {path}: {e}")
if len(image_embeddings) != 17:
    print(f"WARNING: Expected 17 images, got {len(image_embeddings)}")
image_embeddings = np.vstack(image_embeddings).astype('float32')
print(f"Image embedding norms: {[np.linalg.norm(emb) for emb in image_embeddings]}")
image_index = faiss.IndexFlatIP(image_embeddings.shape[1])  # Inner Product for cosine similarity
image_index.add(image_embeddings)
image_metadatas = [{"image_number": name, "path": path} for name, path in zip(image_names, image_paths)]
print(f"Stored {image_index.ntotal} image embeddings in FAISS")

DEBUG: Found 17 images in /kaggle/working/extracted_content: ['/kaggle/working/extracted_content/1.png', '/kaggle/working/extracted_content/10.png', '/kaggle/working/extracted_content/11.png', '/kaggle/working/extracted_content/12.png', '/kaggle/working/extracted_content/13.png', '/kaggle/working/extracted_content/14.png', '/kaggle/working/extracted_content/15.png', '/kaggle/working/extracted_content/16.png', '/kaggle/working/extracted_content/17.png', '/kaggle/working/extracted_content/2.png', '/kaggle/working/extracted_content/3.png', '/kaggle/working/extracted_content/4.png', '/kaggle/working/extracted_content/5.png', '/kaggle/working/extracted_content/6.png', '/kaggle/working/extracted_content/7.png', '/kaggle/working/extracted_content/8.png', '/kaggle/working/extracted_content/9.png']
Image embedding norms: [0.99999994, 0.99999994, 0.99999994, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.9999999, 0.99999994, 1.0, 0.99999994, 1.0, 1.0, 0.99999994, 0.99999994]
Stored 17 image embeddings in FAISS

In [14]:
# Cell 9: Load and embed questions
questions_path = "/kaggle/input/p1-lab2/Lab_2_Part_1_Questions.csv"  # Update with your dataset path
questions_df = pd.read_csv(questions_path)
questions_df['text_embedding'] = questions_df['Question'].apply(lambda x: text_model.encode(x, normalize_embeddings=True))

def truncate_text(text, max_chars=480):
    return text[:max_chars] if len(text) > max_chars else text

questions_df['clip_embedding'] = questions_df['Question'].apply(
    lambda x: clip_processor(text=[truncate_text(x)], return_tensors="pt", padding=True, truncation=True, max_length=77)
)
questions_df['clip_embedding'] = questions_df['clip_embedding'].apply(
    lambda x: clip_model.get_text_features(**x)[0].detach().numpy()
)
print(f"Embedded {len(questions_df)} questions")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embedded 11 questions


In [15]:
# Cell 10: Retrieve text and images (hybrid RAG)
# Mapping of figure/table IDs to image numbers
FIGURE_TABLE_MAPPING = {
    "Table_1_1": 1, "Table_5_1": 5, "Table_7_1": 7, "Table_11_1": 10,
    "Figure_1-1": 2, "Figure_1-2": 3, "Figure_1-3": 4, "Figure_1-4": 6,
    "Figure_1-5": 8, "Figure_1-6": 9, "Figure_2-1": 11, "Figure_2-2": 12,
    "Figure_2-3": 13, "Figure_2-4": 14, "Figure_2-5": 15, "Figure_2-6": 16,
    "Figure_2-7": 17
}

def retrieve_top_k_texts_and_image(question_text, text_emb, clip_emb, k=3, similarity_threshold=0.2):
    text_emb = text_emb.astype('float32').reshape(1, -1)
    clip_emb = clip_emb.astype('float32')
    clip_emb = clip_emb / np.linalg.norm(clip_emb)
    clip_emb = clip_emb.reshape(1, -1)
    
    # Retrieve top-k texts
    distances, indices = text_index.search(text_emb, k)
    retrieved_texts = [text_metadatas[idx]['text'] for idx in indices[0]]
    combined_text = " ".join(retrieved_texts)
    
    # Retrieve top-2 images (or fewer if index is smaller)
    n_images = min(2, image_index.ntotal)
    distances, indices = image_index.search(clip_emb, n_images)
    print(f"DEBUG: FAISS returned {len(indices[0])} images for question: {question_text}")
    image_scores = []
    for i, idx in enumerate(indices[0]):
        similarity = distances[0][i] if i < len(distances[0]) else 0
        image_num = int(image_metadatas[idx]['image_number'])
        score = similarity
        image_scores.append((image_num, score, similarity))
    
    # Keyword-based scoring
    figure_match = re.search(r'(?:Figure|Fig\.|Table)\s*(\d+[-_]\d+)', question_text, re.IGNORECASE)
    keyword_image = 0
    keyword_score = 0
    if figure_match:
        fig_id = figure_match.group(1).replace('_', '-')
        fig_key = f"{'Table' if 'Table' in question_text else 'Figure'}_{fig_id}"
        if fig_key in FIGURE_TABLE_MAPPING:
            keyword_image = FIGURE_TABLE_MAPPING[fig_key]
            keyword_score = 0.9
            print(f"DEBUG: Keyword match found: {fig_key} -> Image {keyword_image}")
    
    # Combine scores
    final_scores = []
    for image_num, clip_score, similarity in image_scores:
        score = clip_score
        if image_num == keyword_image:
            score += keyword_score * 2
        final_scores.append((image_num, score, similarity))
    
    # Select top image
    if final_scores:
        top_image_num, top_score, top_similarity = max(final_scores, key=lambda x: x[1])
        image_number = top_image_num if top_score >= similarity_threshold else 0
    else:
        image_number = keyword_image if keyword_score > 0 else 0
        top_similarity = keyword_score if keyword_score > 0 else 0
    
    # Debug logging
    print(f"DEBUG: Question: {question_text}")
    print(f"  Top images: {[(num, sim) for num, _, sim in image_scores]}")
    print(f"  Keyword image: {keyword_image}, Score: {keyword_score}")
    print(f"  Final: Image {image_number}, Score: {top_score:.4f}, CLIP Similarity: {top_similarity:.4f}")
    
    return combined_text, image_number

questions_df[['retrieved_text', 'image']] = questions_df.apply(
    lambda row: pd.Series(retrieve_top_k_texts_and_image(row['Question'], row['text_embedding'], row['clip_embedding'], k=3)),
    axis=1
)
print(f"Retrieved text and images for {len(questions_df)} questions")

DEBUG: FAISS returned 2 images for question: What sparked the global economic crisis around 2008?
DEBUG: Question: What sparked the global economic crisis around 2008?
  Top images: [(1, 0.2721006), (12, 0.2543431)]
  Keyword image: 0, Score: 0
  Final: Image 1, Score: 0.2721, CLIP Similarity: 0.2721
DEBUG: FAISS returned 2 images for question: Why should we worry about unemployment rates going up?
DEBUG: Question: Why should we worry about unemployment rates going up?
  Top images: [(16, 0.25068176), (5, 0.2491242)]
  Keyword image: 0, Score: 0
  Final: Image 16, Score: 0.2507, CLIP Similarity: 0.2507
DEBUG: FAISS returned 2 images for question: How do economists measure economic growth without price changes messing it up?
DEBUG: Question: How do economists measure economic growth without price changes messing it up?
  Top images: [(6, 0.24711134), (15, 0.23862179)]
  Keyword image: 0, Score: 0
  Final: Image 6, Score: 0.2471, CLIP Similarity: 0.2471
DEBUG: FAISS returned 2 images for

In [16]:
# Cell 11: Generate answers using Gemini
def generate_gemini_answer(question, context, image, retries=3, delay=5):
    prompt = f"""As an expert in economics, answer the following question using only the provided context. Be concise and accurate. Reference the figure or table number if relevant.

Question: {question}
Context: {context}
{"Relevant Figure/Table: " + str(image) if image != 0 else ""}

Answer:"""
    for attempt in range(retries):
        try:
            model = genai.GenerativeModel('gemini-1.5-flash')
            response = model.generate_content(prompt, generation_config={"max_output_tokens": 100})
            print(f"\nPROMPT:\n  Response: {response.text.strip()}\n")
            return response.text.strip()
        except Exception as e:
            print(f"Error generating response for question '{question}' (attempt {attempt+1}/{retries}): {e}")
            if attempt < retries - 1:
                time.sleep(delay)
            else:
                return "Error generating response"
    
questions_df['answer'] = questions_df.apply(
    lambda row: generate_gemini_answer(row['Question'], row['retrieved_text'], row['image']),
    axis=1
)


PROMPT:
  Response: The 2008 global economic crisis was sparked by a decline in housing prices, exacerbated by the complexity and opacity of mortgage-backed securities.  This led to banks' inability to assess the value of their assets, reluctance to lend, and ultimately, the bankruptcy of Lehman Brothers, triggering a wider financial and economic crisis.


PROMPT:
  Response: Rising unemployment harms the welfare of the unemployed and signals inefficient resource use in the economy.  High unemployment indicates the country is not utilizing its human resources efficiently.


PROMPT:
  Response: The provided text does not explain how economists measure economic growth without price changes affecting the measurement.  It mentions the GDP deflator and CPI as measures of price level, and notes that a new method used by the U.S. Bureau of Economic Analysis since 1995 avoids this problem, but doesn't describe the method itself.


PROMPT:
  Response: In 2009, advanced economies experienced -3

In [18]:
# Cell 12: Prepare submission
final_df = questions_df[['ID', 'answer', 'image']]
final_df = final_df.rename(columns={'answer': 'Text', 'image': 'Image'})
final_df[['ID', 'Text', 'Image']].to_csv("/kaggle/working/final_submission.csv", index=False)
print(f"✅ Submission CSV saved as '/kaggle/working/submission.csv'")

# Cell 13: Prepare Canvas ZIP
import zipfile
zip_path = f"/kaggle/working/{TEAM_NAME}.zip"
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for path in image_paths:
        zipf.write(path, os.path.basename(path))
    zipf.write("/kaggle/working/final_submission.csv", "final_submission.csv")
    with open("/kaggle/working/retrieved_texts.txt", "w") as f:
        for idx, row in questions_df.iterrows():
            f.write(f"Question {row['ID']}:\n{row['retrieved_text']}\n\n")
    zipf.write("/kaggle/working/retrieved_texts.txt", "retrieved_texts.txt")
print(f"✅ Canvas ZIP saved as '{zip_path}'")

✅ Submission CSV saved as '/kaggle/working/submission.csv'
✅ Canvas ZIP saved as '/kaggle/working/Pair_Programming_Team_28.zip'
