In [6]:
import os
import uuid
import base64
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.messages import HumanMessage
from langchain_core.documents import Document
from langchain_core.stores import InMemoryStore

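# MultiVectorRetriever is imported from langchain_classic (active import below);
# the langchain_community path is kept commented out for reference only.
# from langchain_community.retrievers import MultiVectorRetriever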
from langchain_classic.retrievers import MultiVectorRetriever

# Chroma is imported from its dedicated integration package, langchain_chroma
from langchain_chroma import Chroma
from unstructured.partition.pdf import partition_pdf

# 1. Extract Elements from PDF
# The 'hi_res' strategy is what enables image/table extraction. Extracted image
# blocks are written to the "extracted_images" directory, which the later
# processing step reads back from disk.
elements = partition_pdf(
    filename="../docs/samplegraphs.pdf",
    # Layout analysis / extraction options
    strategy="hi_res",
    infer_table_structure=True,
    extract_images_in_pdf=True,
    extract_image_block_output_dir="extracted_images",
    # Chunking options: group by title, start a new chunk after 3800
    # characters, with 4000 as the upper limit passed to the chunker.
    chunking_strategy="by_title",
    new_after_n_chars=3800,
    max_characters=4000,
)


# 2. Encoding Helper and Summarizer
def encode_image(image_path):
    """Read a file in binary mode and return its contents as base64 text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str`` so the
        result can be embedded in a data URL for GPT-4o.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def summarize_image(image_base64):
    """Ask GPT-4o for a detailed description of a base64-encoded chart image.

    Args:
        image_base64: Base64 text of the image. It is embedded in a
            ``data:image/jpeg`` URL, so the payload is declared as JPEG.

    Returns:
        The ``content`` attribute of the model's reply message.
    """
    # A fresh client is built on every call; replies are capped at 1024 tokens.
    llm = ChatOpenAI(model="gpt-4o", max_tokens=1024)

    instruction_part = {
        "type": "text",
        "text": "Describe this chart in detail. Extract data points and trends.",
    }
    picture_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
    }

    # A single human turn carrying both the instruction and the image.
    prompt = HumanMessage(content=[instruction_part, picture_part])
    reply = llm.invoke([prompt])
    return reply.content


# 3. Setup Chroma and Multi-Vector Retriever
# Search runs over the summary embeddings held in Chroma. The docstore holds
# the parent payload, looked up through the id stored under `id_key` in each
# summary document's metadata.
id_key = "doc_id"
store = InMemoryStore()
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    collection_name="chart_rag",
)

retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=vectorstore,
)

# 4. Process and Add to Stores
# os.listdir returns entries in arbitrary order and also lists sub-directories
# (for example a notebook checkpoint folder), which cannot be opened as image
# files. Keep regular files only and sort them so reruns are deterministic.
image_dir = "extracted_images"
image_paths = sorted(
    path
    for path in (os.path.join(image_dir, name) for name in os.listdir(image_dir))
    if os.path.isfile(path)
)

# One id per image; the same id links a summary (vector store) to its image
# path (docstore).
img_ids = [str(uuid.uuid4()) for _ in image_paths]

# One model call per image; summaries stay index-aligned with image_paths.
image_summaries = []
for img_path in image_paths:
    b64_img = encode_image(img_path)
    image_summaries.append(summarize_image(b64_img))

# Add Summaries to Vector Store (Child)
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(img_ids, image_summaries)
]

if summary_docs:
    retriever.vectorstore.add_documents(summary_docs)

    # Add image file paths to Docstore (Parent)
    # .mset maps each id to the path of the original image on disk; the
    # docstore holds paths, not the image bytes themselves.
    retriever.docstore.mset(list(zip(img_ids, image_paths)))
else:
    # Nothing was extracted: skip the store writes instead of sending an empty
    # batch to the embedding backend.
    print(f"No image files found in {image_dir!r}; nothing was indexed.")



preprocessor_config.json:   0%|          | 0.00/274 [00:00<?, ?B/s]

Loading weights:   0%|          | 0/367 [00:00<?, ?it/s]