In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [3]:
### initialize the Clip Model for unified embeddings
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|██████████| 1/1 [00:00<?, ?it/s]


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [28]:
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI

# Load environment variables
load_dotenv()

# Create LLM with explicit API key
llm = GoogleGenerativeAI(
    model="gemini-2.5-pro",  # Use correct model name
    google_api_key=os.getenv('GOOGLE_API_KEY')  # Explicit API key
)



In [4]:
# creating the embedding fucntions

def embed_image(image_data):
    """embedding the image using clip"""

    if isinstance(image_data,str):
        image = Image.open(image_data).convert("RGB")
    else:    #if PIL image
         image = image_data

    inputs = clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        #normalizing embeddings to unit vector
        features = features/features.norm(dim=1,keepdim=True)
        return features.squeeze().numpy()
    

def embed_text(text):
    """Embed the text using clip"""
    inputs = clip_processor(text=text,return_tensors="pt",padding = True,truncation = True,max_length = 77)# 77 is clips max token length
    with torch.no_grad():
        features =clip_model.get_text_features(**inputs)
        features = features/features.norm(dim=1,keepdim=True)
        return features.squeeze().numpy()

In [5]:
# processing the PDF fucntion

## Process PDF
pdf_path="G:\AGENTIC_FRAMEWORK\data\Flowers.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [6]:
doc

Document('G:\AGENTIC_FRAMEWORK\data\Flowers.pdf')

In [7]:
#processing the text and images

for i , page in enumerate(doc):

    ## process the text
    text = page.get_text()
    if text.strip():
        ## create tem documents for spliting
        temp_doc = Document(page_content=text,metadata={"page":i,"type":"text"})
        text_chunks = splitter.split_documents([temp_doc])

        #embed each chunk using clip
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)

    ##process the images

    ##Convert PDF image to PIL format
    ##Store as base64 for GPT-4V (which needs base64 images)
    ##Create CLIP embedding for retrieval
    
    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            
            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            
            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"
            
            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64
            
            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)
            
            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)
            
        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()
    

In [10]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Morphology of flower'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_1'}, page_content='[Image: page_0_img_1]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_2'}, page_content='[Image: page_0_img_2]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_3'}, page_content='[Image: page_0_img_3]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_4'}, page_content='[Image: page_0_img_4]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_5'}, page_content='[Image: page_0_img_5]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_6'}, page_content='[Image: page_0_img_6]'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_7'}, page_content='[Image: pag

In [12]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[-0.04384758, -0.00486706,  0.02732429, ...,  0.02509783,
        -0.0129518 , -0.03269687],
       [ 0.03706964,  0.03470185, -0.00674659, ...,  0.03387407,
         0.01132378,  0.009261  ],
       [ 0.01944484,  0.02597684,  0.00277657, ...,  0.04933508,
         0.04852957,  0.02182267],
       ...,
       [ 0.01664223,  0.01754193,  0.00626485, ...,  0.06931599,
         0.02405416,  0.02622443],
       [-0.01750883,  0.00059912,  0.03032437, ...,  0.06875303,
        -0.01199512,  0.03460951],
       [ 0.03061348,  0.0165642 ,  0.01175014, ...,  0.05474591,
         0.02658661,  0.00393654]], shape=(9182, 512), dtype=float32)

In [14]:


# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store



`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


<langchain_community.vectorstores.faiss.FAISS at 0x15a729d4d90>

In [16]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)
    
    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )
    
    return results

In [17]:


def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for GPT-4V."""
    content = []
    
    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })
    
    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]
    
    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })
    
    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })
    
    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })
    
    return HumanMessage(content=content)



In [31]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)
    
    # Create multimodal message
    message = create_multimodal_message(query, context_docs)
    
    # Get response from GPT-4V
    response = llm.invoke([message])
    
    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")
    
    return response

In [33]:
if __name__ == "__main__":
    # Example queries
    queries = [
        
        "Tell me the name of the flower in red?"
        
    ]
    
    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: Tell me the name of the flower in red?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 1: • Floral characteristics
are the most
commonly  used
f
t
t
id
tif
Flowers
features to identify
plant...
  - Text from page 9: Flower Structure Variation
imperfect
perfect
imperfect
  - Text from page 0: Morphology of flower
  - Text from page 38: could also be upward
  - Text from page 30: inserted
exserted


Answer: Based on the provided text and images, the name of the red flower is not mentioned. The images are general diagrams used to explain the "Morphology of flower" and its parts, such as petals and sepals, rather than identifying a specific species.
