In [1]:
%pip install --upgrade --quiet pymupdf langchain langchain-core langchain-google-genai langchain-pinecone langchain-community sentence-transformers

[0m

In [2]:
import fitz
import re
from langchain.document_loaders import PyMuPDFLoader
from langchain_core.documents import Document

class EnhancedPyMuPDFLoader(PyMuPDFLoader):
    """PyMuPDF loader that annotates loaded documents with the images on their page.

    Text extraction is delegated to ``PyMuPDFLoader``. This subclass re-opens the
    same file with ``fitz`` and records one metadata entry per embedded image
    under ``metadata["images"]``.
    """

    def load(self):
        """Load the documents and attach per-page image metadata.

        Returns:
            list[Document]: The parent loader's documents, in the parent
            loader's order. A document whose ``metadata["page"]`` refers to a
            page with embedded images is replaced by a copy whose metadata has
            an extra ``"images"`` dict mapping ``image_id`` (``img_<page>_<n>``,
            both numbers 1-based) to that image's details. Every other document
            is returned unchanged, including documents whose ``page`` value
            matches no page of the PDF (these are no longer silently dropped).

        NOTE(review): ``page_content`` is not modified here - no
        ``[IMAGE: <image_id>]`` marker is written into the text. The splitter
        and ImageAgent in this notebook search the text for exactly that
        marker, so their lookups cannot match unless the PDF text already
        contains it. Image positions are not tracked in this method, so where
        such markers should go still needs to be decided.
        """
        docs = super().load()

        # Only inspect pages that at least one loaded document refers to.
        wanted_pages = {doc.metadata.get("page") for doc in docs}
        images_by_page = {}

        pdf_document = fitz.open(self.file_path)
        try:
            for page_index, page in enumerate(pdf_document):
                if page_index not in wanted_pages:
                    continue
                # Image details depend only on the page, so compute them once
                # per page instead of once per document on that page.
                page_images = self._collect_page_images(pdf_document, page, page_index)
                if page_images:
                    images_by_page[page_index] = page_images
        finally:
            # Release the file handle even if image extraction raises.
            pdf_document.close()

        enhanced_docs = []
        for doc in docs:
            page_images = images_by_page.get(doc.metadata.get("page"))
            if not page_images:
                enhanced_docs.append(doc)
                continue

            new_metadata = doc.metadata.copy()
            # Give each document its own dicts so later in-place edits of one
            # document's metadata cannot leak into another's.
            new_metadata["images"] = {
                image_id: dict(details) for image_id, details in page_images.items()
            }
            enhanced_docs.append(
                Document(page_content=doc.page_content, metadata=new_metadata)
            )

        return enhanced_docs

    def _collect_page_images(self, pdf_document, page, page_index):
        """Return ``{image_id: details}`` for every image listed on ``page``."""
        page_images = {}
        for img_idx, img_info in enumerate(page.get_images(full=True)):
            xref = img_info[0]  # first field of the image entry is used as the xref
            base_image = pdf_document.extract_image(xref)

            image_id = f"img_{page_index + 1}_{img_idx + 1}"
            page_images[image_id] = {
                "image_id": image_id,
                "page_num": page_index + 1,
                "position": img_idx + 1,
                "source_file": self.file_path,
                # .get() like width/height: a missing key yields None, not KeyError.
                "image_format": base_image.get("ext"),
                "width": base_image.get("width"),
                "height": base_image.get("height"),
            }
        return page_images

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF into documents annotated with per-page image metadata.
# NOTE(review): the path is hard-coded to a file under /content (the notebook
# imports google.colab elsewhere, so this is presumably the Colab workspace);
# the file must be uploaded there before this cell runs.
loader = EnhancedPyMuPDFLoader("/content/GPU.pdf")
documents = loader.load()

class ImageAwareTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive splitter that narrows ``metadata["images"]`` per chunk."""

    def split_documents(self, documents):
        """Split ``documents`` and narrow each chunk's image metadata.

        For every chunk, the parent document is the first input document with
        the same ``metadata["page"]``. If that parent has an ``"images"`` dict
        and the chunk text contains one or more ``[IMAGE: <image_id>]``
        markers, the chunk's ``"images"`` is replaced by just the matching
        entries. Otherwise the chunk's metadata is left exactly as the base
        splitter produced it.

        NOTE(review): the base splitter presumably copies the parent's
        metadata onto each chunk, so a chunk with no matching marker still
        carries the parent's full ``"images"`` dict - confirm this is intended.

        Args:
            documents: Documents to split; each needs ``metadata`` and
                ``page_content``.

        Returns:
            The list of chunks returned by the base splitter, updated in place.
        """
        splits = super().split_documents(documents)

        # Index parent metadata by page once instead of rescanning every
        # document for every chunk. setdefault keeps the FIRST document seen
        # for a page, which matches the previous first-match lookup.
        metadata_by_page = {}
        for doc in documents:
            metadata_by_page.setdefault(doc.metadata.get('page'), doc.metadata)

        for split in splits:
            parent_metadata = metadata_by_page.get(split.metadata.get('page'), {})
            parent_images = parent_metadata.get('images', {})
            if not parent_images:
                continue

            content = split.page_content
            chunk_images = {
                img_id: img_data
                for img_id, img_data in parent_images.items()
                if f"[IMAGE: {img_id}]" in content
            }

            # Only overwrite when a marker was found in this chunk.
            if chunk_images:
                split.metadata["images"] = chunk_images

        return splits


# chunk_size / chunk_overlap are measured by the splitter's length function
# (presumably characters, the default - confirm if a custom one is configured).
splitter = ImageAwareTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(documents)

# image_chunks holds the SAME Document objects as `chunks` (a filtered view),
# so in-place metadata edits on one are visible through the other.
image_chunks = [chunk for chunk in chunks if "images" in chunk.metadata]

# Summary line matching the output recorded for this cell, so that a fresh
# Restart & Run All reproduces it.
print(f"Created {len(chunks)} chunks, {len(image_chunks)} contain images")

Created 82 chunks, 52 contain images


In [4]:
import json

# Flatten each chunk's nested image metadata into a JSON string, in place.
# Presumably needed because the vector store's metadata cannot hold nested
# dicts - confirm. The isinstance guard makes re-running this cell a no-op
# for chunks that were already serialised.
for chunk in image_chunks:
    images_meta = chunk.metadata.get("images")
    if isinstance(images_meta, dict):
        chunk.metadata["images"] = json.dumps(images_meta)

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-transformers model used to embed both the stored chunks and the
# queries; pinned to CPU via model_kwargs.
# NOTE(review): the Pinecone index in this notebook is created with
# dimension=384, which presumably is this model's output size - keep the two
# in sync if the model is changed.
# NOTE(review): the recorded output echoes this constructor call, which looks
# like a warning (possibly a deprecation notice) - confirm and migrate if so.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2",
    model_kwargs={'device': 'cpu'}
)

  embedding_model = HuggingFaceEmbeddings(


In [7]:
# Create Pinecone vector store
import os
from pinecone import Pinecone, ServerlessSpec
from google.colab import userdata

import hashlib

# The API key is read from Colab's secret store rather than hard-coded.
pc = Pinecone(api_key=userdata.get("PINECONE_API_KEY"))

index_name = "multiagent-rag"

# Create the serverless index only when it is missing, so the cell can be re-run.
if index_name not in [index['name'] for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        # Must equal the embedding size; presumably 384 for the
        # all-MiniLM-L12-v2 model configured above - verify if the model changes.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    print(f"Index '{index_name}' created.")
else:
    print(f"Index '{index_name}' already exists and has been initialized")

# The handle is obtained the same way whether or not the index was just created.
pinecone_index = pc.Index(index_name)

from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=pinecone_index,embedding=embedding_model)

# Deterministic IDs make the upload idempotent: re-running this cell overwrites
# the same vectors instead of inserting duplicates under fresh random IDs
# (duplicates would crowd the top-k results). The position is part of the ID so
# that two chunks with identical text still get distinct IDs.
# NOTE: vectors inserted by earlier runs with auto-generated IDs are not removed
# by this change; clear the index once if it was already populated that way.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source')}|{chunk.metadata.get('page')}|{position}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(chunks)
]

# image_chunks are a subset of `chunks` (the same objects), so a single upload
# of `chunks` covers them; they are not added separately.
vector_store.add_documents(chunks, ids=chunk_ids)
print(f"Added {len(chunks)} chunks and {len(image_chunks)} image chunks to the vector store")

Index 'multiagent-rag' already exists and has been initialized
Added 82 chunks and 52 image chunks to the vector store


In [8]:
class RetrievalAgent:
    """Thin wrapper around the notebook's vector store for similarity search."""

    def __init__(self):
        # Bind the module-level objects created in earlier cells; they must
        # exist before an agent is instantiated.
        self.embeddings = embedding_model
        self.vector_store = vector_store

    def retrieve(self, query, k=3):
        """Return the store's ``similarity_search_with_score`` result for ``query``.

        Args:
            query: Text to search for.
            k: Number of results to request (default 3).

        Returns:
            Whatever the vector store returns for the query. Downstream code in
            this notebook unpacks it as ``(document, score)`` pairs.
        """
        return self.vector_store.similarity_search_with_score(query, k=k)

In [13]:
import json

class ImageAgent:
    """Collects image metadata attached to retrieved documents."""

    def __init__(self):
        pass

    def identify_images(self, retrieved_docs_with_scores, context_window=100):
        """Identify images referenced by the retrieved documents.

        Args:
            retrieved_docs_with_scores: Iterable of ``(document, score)`` pairs.
                Each document needs ``metadata`` (a dict) and ``page_content``.
                ``metadata["images"]`` may be a dict or its JSON-string form.
            context_window: Number of characters of surrounding text to keep on
                each side of an image placeholder (default 100).

        Returns:
            list[dict]: One dict per image entry. Each is a copy of the stored
            image metadata plus:
              - ``"document_context"``: text around the ``[IMAGE: <id>]``
                placeholder, or ``"Context not available"`` when the
                placeholder does not occur in the document text;
              - ``"relevance_score"``: the score paired with the document;
              - ``"placeholder"``: the placeholder string for that image.
            Documents with missing or undecodable image metadata, and image
            entries that are not dicts, are skipped.
        """
        images_info = []

        for doc, score in retrieved_docs_with_scores:
            doc_images = self._load_image_metadata(doc)
            if not doc_images:
                continue

            content = doc.page_content
            for img_id, img_data in doc_images.items():
                # Skip a malformed entry rather than crash on .copy(), in line
                # with how undecodable metadata is skipped for a whole document.
                if not isinstance(img_data, dict):
                    continue

                image_info = img_data.copy()
                placeholder = f"[IMAGE: {img_id}]"

                # Single search: find() returns -1 when the placeholder is absent.
                placeholder_pos = content.find(placeholder)
                if placeholder_pos != -1:
                    start_pos = max(0, placeholder_pos - context_window)
                    end_pos = min(
                        len(content),
                        placeholder_pos + len(placeholder) + context_window,
                    )
                    image_info["document_context"] = content[start_pos:end_pos]
                else:
                    image_info["document_context"] = "Context not available"

                image_info["relevance_score"] = score
                image_info["placeholder"] = placeholder

                images_info.append(image_info)

        return images_info

    def _load_image_metadata(self, doc):
        """Return the document's image metadata as a dict, or None if unusable."""
        doc_images_raw = doc.metadata.get("images", {})

        if isinstance(doc_images_raw, str):
            # The metadata may have been flattened to a JSON string before storage.
            try:
                doc_images = json.loads(doc_images_raw)
            except json.JSONDecodeError:
                print(f"Warning: Could not decode image metadata for document on page {doc.metadata.get('page')}. Skipping images for this document.")
                return None
        else:
            doc_images = doc_images_raw

        # Valid JSON that is not an object (list, number, ...) is unusable too.
        return doc_images if isinstance(doc_images, dict) else None

In [14]:
class SimpleOrchestrator:
    """Runs retrieval followed by image identification for a query."""

    def __init__(self, retrieval_agent, image_agent):
        """Store the two collaborating agents."""
        self.retrieval_agent, self.image_agent = retrieval_agent, image_agent

    def process_query(self, query, k=3):
        """Retrieve documents for ``query`` and collect their image information.

        Args:
            query: The user's question.
            k: Number of documents to request from the retrieval agent.

        Returns:
            dict with keys ``"query"``, ``"retrieved_docs_with_scores"`` (the
            retrieval agent's result) and ``"image_info"`` (the image agent's
            result for those documents).
        """
        hits = self.retrieval_agent.retrieve(query, k=k)
        return {
            "query": query,
            "retrieved_docs_with_scores": hits,
            "image_info": self.image_agent.identify_images(hits),
        }

In [15]:
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from google.colab import userdata

class EnhancedResponseGenerator:
    """Builds an image-aware prompt from orchestrator results and queries the LLM."""

    def __init__(self, orchestrator):
        """Keep the orchestrator and set up the chat model and prompt template."""
        self.orchestrator = orchestrator

        # The API key comes from Colab's secret store.
        self.llm = ChatGoogleGenerativeAI(
            model="models/gemini-2.5-flash-preview-04-17",
            api_key=userdata.get("GOOGLE_API_KEY_1"),
        )

        # Prompt with three slots: retrieved context, image summary, question.
        self.prompt_template = PromptTemplate.from_template(
            """You are a helpful assistant that answers questions based on the provided context.

            Context:
            {context}

            Images in the context:
            {image_info}

            Question: {query}

            Please provide a detailed and accurate answer based only on the information in the context.
            When there are images mentioned in the text, acknowledge them like "[There is an image here showing X]"
            based on the surrounding context.
            If the context doesn't contain relevant information to answer the question, say so.
            """
        )

    def generate_response(self, query):
        """Answer ``query`` using retrieved documents and their image info.

        Runs the orchestrator with its default settings, formats the retrieved
        documents and any image entries into the prompt, invokes the LLM, and
        returns the ``content`` attribute of the LLM's response.
        """
        results = self.orchestrator.process_query(query)

        # One section per retrieved document, separated by blank lines.
        doc_sections = []
        for doc, score in results["retrieved_docs_with_scores"]:
            doc_sections.append(f"Document (relevance: {score:.2f}):\n{doc.page_content}")
        context = "\n\n".join(doc_sections)

        found_images = results["image_info"]
        if not found_images:
            image_info = "No images found in the retrieved documents."
        else:
            # Two lines per image, numbered from 1, each entry ending in a blank line.
            image_info = "".join(
                f"Image {number}: {img['placeholder']} (on page {img.get('page_num', 'unknown')})\n"
                f"Context around image: {img.get('document_context', 'Not available')}\n\n"
                for number, img in enumerate(found_images, start=1)
            )

        prompt = self.prompt_template.format(
            query=query,
            context=context,
            image_info=image_info,
        )
        return self.llm.invoke(prompt).content

In [17]:
# Wire the pipeline: retrieval -> image identification -> orchestration -> LLM answer.
# RetrievalAgent() reads the module-level vector_store / embedding_model, so the
# earlier cells must have run first.
retrieval_agent = RetrievalAgent()
image_agent = ImageAgent()
orchestrator = SimpleOrchestrator(retrieval_agent, image_agent)
generator = EnhancedResponseGenerator(orchestrator)

# Ask one question end to end and print the model's answer.
question = "Explain me GPGPU architecture and memory"
response = generator.generate_response(question)
print(response)

# generate_response() returns only the answer text, so the query is processed a
# second time here to inspect which images were attached to the retrieved chunks.
results = orchestrator.process_query(question)
print(f"Found {len(results['image_info'])} images in the retrieved documents")
for img in results['image_info']:
    print(f"Image ID: {img['image_id']}")
    # Show at most the first 100 characters of the surrounding text.
    print(f"Context: {img['document_context'][:100]}...")

Based on the provided context:

A General Purpose Graphics Processing Unit (GPGPU) is a modified GPU designed for graphics rendering but adapted to perform applications traditionally handled by CPUs, such as floating-point calculations. Essentially, all modern GPUs are considered GPGPUs. They can be programmed to deploy their processing power for scientific computing needs.

Regarding architecture and memory, the context states:
*   GPGPUs are connected to a CPU. The CPU offloads compute-heavy jobs to the GPU in terms of concurrent instruction streams.
*   GPUs have more transistors dedicated to data processing than to control or caches.
*   The L1 cache in GPUs is small, and in earlier versions, it was absent.
*   The context mentions that a more detailed discussion on GPU memories will be presented later while discussing CUDA.

[There is an image here, but the context around it is not available to describe what it shows.]
Found 1 images in the retrieved documents
Image ID: img_8_1
Co