<h1 style="background: linear-gradient(to right, #ff6b6b, #4ecdc4); 
           color: white; 
           padding: 20px; 
           border-radius: 10px; 
           text-align: center; 
           font-family: Arial, sans-serif; 
           text-shadow: 2px 2px 4px rgba(0,0,0,0.5);">
  Multimodal Agentic RAG with Document Retrieval (ColPali), Vision Language Model (ColQwen2), Amazon Nova and CrewAI
</h1>

<h1 style="background: linear-gradient(to right, #ff6b6b, #4ecdc4); 
           color: white; 
           padding: 20px; 
           border-radius: 10px; 
           text-align: center; 
           font-family: Arial, sans-serif; 
           text-shadow: 2px 2px 4px rgba(0,0,0,0.5);">
    Loading the PDF files (Dataset)
</h1>

In [1]:
import requests
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Download pdfs from different sources 
def download_pdf(pdfs, output_dir, timeout=30):
    """
    Download each PDF in `pdfs` and save it as `<name>.pdf` inside `output_dir`.

    :param pdfs: Mapping of document name -> URL of the PDF.
    :param output_dir: Directory to write the files to; created if it does not exist.
    :param timeout: Seconds to wait on each request before giving up.
    :raises requests.HTTPError: If a URL responds with an error status.
    """
    # Make the function usable on its own, without a prior os.makedirs call
    os.makedirs(output_dir, exist_ok=True)

    for name, url in pdfs.items():
        response = requests.get(url, timeout=timeout)
        # Fail loudly instead of saving an HTML error page under a .pdf name
        response.raise_for_status()

        pdf_path = os.path.join(output_dir, f"{name}.pdf")
        with open(pdf_path, "wb") as f:
            f.write(response.content)

        print(f"Downloaded {name} to {pdf_path}")

In [2]:
# Local folder that receives the downloaded papers
input_path = "./data"
os.makedirs(input_path, exist_ok=True)

# Paper name (used as the file name) -> arXiv PDF URL
pdfs = dict(
    Transformers="https://arxiv.org/pdf/1706.03762.pdf",
    DSPy="https://arxiv.org/pdf/2310.03714.pdf",
    ColPali="https://arxiv.org/pdf/2407.01449.pdf",
)

# Fetch every paper into the data folder
download_pdf(pdfs, input_path)


Downloaded Transformers to ./data/Transformers.pdf
Downloaded DSPy to ./data/DSPy.pdf
Downloaded ColPali to ./data/ColPali.pdf


<h1 style="background: linear-gradient(to right, #ff6b6b, #4ecdc4); 
           color: white; 
           padding: 20px; 
           border-radius: 10px; 
           text-align: center; 
           font-family: Arial, sans-serif; 
           text-shadow: 2px 2px 4px rgba(0,0,0,0.5);">
  Build the Retrieval Model 
</h1>


In [6]:
import os
from byaldi import RAGMultiModalModel
from PIL import Image
from pdf2image import convert_from_path
import torch

class ImageRetriever:
    """
    Page-level retriever over a folder of PDFs, backed by a RAGMultiModalModel.

    Typical use: call `index_documents()` once to render the PDFs to page
    images and build the index, then call `retrieve_images()` for each query
    to get the matching pages saved as PNG files under `output_dir`.
    """

    def __init__(self, model_name, device, output_dir="matched_images"):
        """
        Initialize the RAG model for multimodal retrieval.

        :param model_name: Name of the pretrained model.
        :param device: Device to run the model on (e.g. 'cpu' or 'cuda').
        :param output_dir: Directory where matched images will be saved.
        """
        self.myRAG = RAGMultiModalModel.from_pretrained(model_name, device=device)
        self.output_dir = output_dir
        self.indexed = False  # Flag to check if indexing is already done
        self.all_images = {}  # doc_id -> list of page images for that PDF

    def convert_pdfs_to_images(self, pdf_folder):
        """
        Convert all PDFs in the given folder into images and store them.

        The result replaces `self.all_images`; each PDF is keyed by its
        position in the filtered directory listing.

        :param pdf_folder: Path to the folder containing PDFs.
        """
        # NOTE(review): doc_ids are positions in os.listdir() order after
        # filtering to "*.pdf". get_matched_images() assumes they equal the
        # doc_ids assigned by the index — confirm, especially when the folder
        # also contains non-PDF files.
        pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
        all_images = {}

        if not pdf_files:
            print("No PDF files found in the given folder.")

        for doc_id, pdf_file in enumerate(pdf_files):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            images = convert_from_path(pdf_path)
            all_images[doc_id] = images  # Map doc_id to the images of this PDF

        self.all_images = all_images  # Store in class for retrieval
        print(f"Converted {len(pdf_files)} PDFs to images.")

    def index_documents(self, input_path, index_name="research_papers"):
        """
        Convert PDFs to images and index the documents.

        Runs at most once per instance: after a successful call, later calls
        print a notice and return without doing any work.

        :param input_path: Directory where the documents are stored.
        :param index_name: Name of the index.
        """
        if self.indexed:
            print("Documents are already indexed. Skipping indexing...")
            return

        # Convert PDFs to images before indexing
        self.convert_pdfs_to_images(input_path)

        # Build the index; overwrite=True replaces any existing index that
        # has the same name.
        self.myRAG.index(input_path=input_path,
                         index_name=index_name,
                         store_collection_with_index=False,
                         overwrite=True)

        self.indexed = True  # Set flag to True to prevent re-indexing
        print("Indexing completed successfully.")

    def get_matched_images(self, results):
        """
        Retrieve images that match the search results.

        Results whose doc_id is unknown, or whose page_num falls outside
        1..number_of_pages, are skipped with a printed warning.

        :param results: List of search results containing doc_id and page_num
                        (page_num is 1-based).
        :return: List of PIL images.
        """
        matched_images = []
        for result in results:
            doc_id = result["doc_id"]
            page_num = result["page_num"]

            pages = self.all_images.get(doc_id)
            # The lower bound matters: page_num <= 0 would turn into a negative
            # index and silently return a page counted from the end.
            if pages is not None and 1 <= page_num <= len(pages):
                matched_images.append(pages[page_num - 1])
            else:
                print(f"Warning: Image for doc_id {doc_id} and page_num {page_num} not found.")

        return matched_images

    def save_images_as_png(self, image_list):
        """
        Convert and save PIL images as PNG format.

        Files are named image_1.png, image_2.png, ... in list order, so files
        with the same names from an earlier call are overwritten.

        :param image_list: List of PIL image objects.
        :return: List of file paths where images are saved.
        """
        os.makedirs(self.output_dir, exist_ok=True)  # Ensure output directory exists
        file_paths = []  # List to store saved file paths

        for idx, img in enumerate(image_list):
            file_path = os.path.join(self.output_dir, f"image_{idx + 1}.png")
            img.save(file_path, format="PNG")
            file_paths.append(file_path)

        return file_paths

    def retrieve_images(self, text_query, k=5):
        """
        Perform a search and retrieve matched image paths.

        :param text_query: Query string to search for.
        :param k: Number of results to return.
        :return: List of file paths to matched images (empty if none matched).
        :raises ValueError: If `index_documents()` has not been called yet.
        """
        if not self.indexed:
            raise ValueError("Documents are not indexed. Please call `index_documents()` first.")

        # Perform search using the query
        results = self.myRAG.search(text_query, k=k)

        # Retrieve matched images from the results
        matched_images = self.get_matched_images(results)

        if not matched_images:
            print("No matching images found.")
            return []

        # Save and return paths to the images
        return self.save_images_as_png(matched_images)


In [7]:
# Usage example: pick the best available backend (CUDA GPU, then Apple MPS, then CPU)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

model_name = "vidore/colpali-v1.2"
input_path = "./data"  # Folder containing the PDFs

# Load the retrieval model onto the selected device
image_retriever = ImageRetriever(model_name=model_name, device=device)

Verbosity is set to 1 (active). Pass verbose=0 to make quieter.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Index documents (automatically converts PDFs to images before indexing).
# Calling this again on the same retriever is skipped once indexing has succeeded.
image_retriever.index_documents(input_path)


Converted 3 PDFs to images.
overwrite is on. Deleting existing index research_papers to build a new one.
Indexing file: data/DSPy.pdf
Added page 1 of document 0 to index.
Added page 2 of document 0 to index.
Added page 3 of document 0 to index.
Added page 4 of document 0 to index.
Added page 5 of document 0 to index.
Added page 6 of document 0 to index.
Added page 7 of document 0 to index.
Added page 8 of document 0 to index.
Added page 9 of document 0 to index.
Added page 10 of document 0 to index.
Added page 11 of document 0 to index.
Added page 12 of document 0 to index.
Added page 13 of document 0 to index.
Added page 14 of document 0 to index.
Added page 15 of document 0 to index.
Added page 16 of document 0 to index.
Added page 17 of document 0 to index.
Added page 18 of document 0 to index.
Added page 19 of document 0 to index.
Added page 20 of document 0 to index.
Added page 21 of document 0 to index.
Added page 22 of document 0 to index.
Added page 23 of document 0 to index.
A

In [9]:
# Question to answer from the indexed papers
text_query = "What is proposed in the DSPy programming model?"

# Search the existing index (no re-indexing needed); the matching pages are
# saved as PNG files and their paths are returned
matched_image_paths = image_retriever.retrieve_images(text_query)

# Print the matched image paths
print(matched_image_paths)

['matched_images/image_1.png', 'matched_images/image_2.png', 'matched_images/image_3.png', 'matched_images/image_4.png', 'matched_images/image_5.png']


<h1 style="background: linear-gradient(to right, #ff6b6b, #4ecdc4); 
           color: white; 
           padding: 20px; 
           border-radius: 10px; 
           text-align: center; 
           font-family: Arial, sans-serif; 
           text-shadow: 2px 2px 4px rgba(0,0,0,0.5);">
  Generation with Bedrock Nova
</h1>

In [10]:
import boto3
import base64

def read_and_encode_image(image_path: str) -> dict:
    """
    Build an image content block for a Converse message from a file on disk.

    The file's raw bytes are placed in the block unchanged; no base64 string
    is produced here.

    :param image_path: Path to the image file.
    :return: Dict of the form
             {"image": {"format": <lowercase format>, "source": {"bytes": <raw bytes>}}}.
    """
    with open(image_path, 'rb') as image_file:
        image_bytes = image_file.read()

    # Determine the image format from the file itself. The context manager
    # closes the PIL file handle as soon as the format has been read.
    # NOTE(review): the model is expected to accept png/jpeg/gif/webp — confirm
    # against the Converse API docs before sending other formats.
    with Image.open(image_path) as img:
        image_format = img.format.lower()

    return {
        "image": {
            "format": image_format,
            "source": {"bytes": image_bytes},
        }
    }


def send_images_to_model_using_converse(matched_items: list, query: str, model_id: str,
                                        region_name: str = 'us-east-1') -> str:
    """
    Ask a Bedrock model to answer `query` using the given images as context.

    A single user message is sent: one image block per path in
    `matched_items`, followed by the query text.

    :param matched_items: List of image file paths to attach.
    :param query: The question to answer.
    :param model_id: Bedrock model identifier passed to `converse`.
    :param region_name: AWS region for the bedrock-runtime client.
    :return: Text of the first content block in the model's reply.
    """
    system_prompt = 'You are a helpful assistant for question answering. Given the context, answer the question in details, and if needed format the code in markdown.'

    # One image block per retrieved page, then the question itself
    content_list = [read_and_encode_image(image_path) for image_path in matched_items]
    content_list.append({"text": query})

    system = [{"text": system_prompt}]

    # Define a "user" message including both the images and a text prompt.
    messages = [
        {
            "role": "user",
            "content": content_list,
        }
    ]

    # Configure the inference parameters.
    inf_params = {"temperature": .3, "topP": 0.1}

    # Initialize the Bedrock client
    client = boto3.client('bedrock-runtime', region_name=region_name)

    response = client.converse(
        modelId=model_id,
        messages=messages,
        system=system,
        inferenceConfig=inf_params
    )

    # Extract the reply text from the first content block of the response
    return response["output"]["message"]["content"][0]["text"]

In [11]:
# Bedrock model IDs for the three Amazon Nova variants
PRO_MODEL_ID = "amazon.nova-pro-v1:0"
LITE_MODEL_ID = "amazon.nova-lite-v1:0"
MICRO_MODEL_ID = "amazon.nova-micro-v1:0"

# Answer the query with Nova Pro, attaching the retrieved page images as context
response = send_images_to_model_using_converse(matched_items=matched_image_paths, query=text_query, model_id=PRO_MODEL_ID)
print(response)

The DSPy programming model proposes a systematic approach to designing AI pipelines by translating hand-based prompting techniques into declarative modules that carry natural-language typed signatures. These modules are task-adaptive components that can learn any particular text transformation, like answering a question or summarizing a paper. The model then parameterizes each module so that it can learn its desired behavior by iteratively bootstrapping useful demonstrations within the pipeline.


<h1 style="background: linear-gradient(to right, #ff6b6b, #4ecdc4); 
           color: white; 
           padding: 20px; 
           border-radius: 10px; 
           text-align: center; 
           font-family: Arial, sans-serif; 
           text-shadow: 2px 2px 4px rgba(0,0,0,0.5);">
  Building an Agentic RAG System with CrewAI
</h1>

In [23]:
from crewai import Agent, Task, Crew, LLM
from crewai_tools import tool
from langchain_community.tools import DuckDuckGoSearchRun

# Model IDs used for the CrewAI section.
# NOTE(review): these reassign the constants set in the Bedrock Nova section, now
# with a "us." prefix — presumably cross-region inference profile IDs; confirm.
# Re-running the earlier cell afterwards will reset them to the unprefixed values.
PRO_MODEL_ID = "us.amazon.nova-pro-v1:0"
LITE_MODEL_ID = "us.amazon.nova-lite-v1:0"
MICRO_MODEL_ID = "us.amazon.nova-micro-v1:0"

In [25]:
# Define a web search tool, registered under the name 'DuckDuckGoSearch'.
# NOTE(review): the docstring below is presumably used by the @tool decorator as
# the tool description shown to the LLM — treat it as runtime text and confirm
# before rewording it.
@tool('DuckDuckGoSearch')
def search(search_query: str):
    """Search the web for information on a given topic"""
    # A new DuckDuckGoSearchRun instance is created on every call
    return DuckDuckGoSearchRun().run(search_query)

In [26]:
# Configure the LLM used by the agent below (the Nova Pro model ID)
llm = LLM(model=PRO_MODEL_ID)

In [30]:
# Define an agent to search the web who is an expert in research papers and applied machine learning.
# It runs on the configured LLM, may not delegate to other agents, and has the
# DuckDuckGo search tool as its only tool.
web_search_agent = Agent(
    role="Web Search Agent",
    goal="An expert in research papers and applied machine learning",
    backstory="You are an expert in research papers and applied machine learning",
    verbose=True,
    allow_delegation=False,
    llm=llm,
    tools=[search]
)
# Define a task to search the web.
# The {query} placeholder is filled from the `inputs` dict passed to crew.kickoff().
web_search_task = Task(
    description="Search {query} on the web for information on the given topic",
    expected_output="A list of web search results",
    agent=web_search_agent
)

In [33]:
# Create a crew with the web search agent and task
crew = Crew(
    agents=[web_search_agent],
    tasks=[web_search_task],
    verbose=True
)
# Run the crew; `query` is substituted into the task description's {query} placeholder
query = "What is the latest research in applied machine learning?"
result = crew.kickoff(inputs={"query": query})



[1m[95m# Agent:[00m [1m[92mWeb Search Agent[00m
[95m## Task:[00m [92mSearch What is the latest research in applied machine learning? on the web for information on the given topic[00m


[1m[95m# Agent:[00m [1m[92mWeb Search Agent[00m
[95m## Thought:[00m [92mThought: I need to search the web for the latest research in applied machine learning.[00m
[95m## Using tool:[00m [92mDuckDuckGoSearch[00m
[95m## Tool Input:[00m [92m
"{\"search_query\": \"latest research in applied machine learning\"}"[00m
[95m## Tool Output:[00m [92m
ASAP: Aligning Simulation and Real-World Physics for Learning Agile Humanoid Whole-Body Skills. lecar-lab/asap • • 3 Feb 2025. In the second stage, we deploy the policies in the real world and collect real-world data to train a delta (residual) action model that compensates for the dynamics mismatch. Lev Craig covers AI and machine learning as site editor for TechTarget's Enterprise AI site. Craig graduated from Harvard University with a 