<a href="https://colab.research.google.com/github/chaitykundu/Simple_Rag_Project/blob/main/Multimodal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install PyMuPDF langchain_community transformers torch numpy faiss-cpu langchain_google_genai



In [2]:
import fitz  # PyMuPDF
from langchain.schema import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS  # official LangChain FAISS


In [3]:
###Clip Model
import os
from dotenv import load_dotenv
from google.colab import userdata

load_dotenv()

## set up the environment
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")

### initialize the Clip Model for unified embeddings
clip_model=CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor=CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
### Embedding functions
def embed_image(image_data):
    """Embed image using CLIP"""
    if isinstance(image_data, str):  # If path
        image = Image.open(image_data).convert("RGB")
    else:  # If PIL Image
        image = image_data

    inputs=clip_processor(images=image,return_tensors="pt")
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
        # Normalize embeddings to unit vector
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

def embed_text(text):
    """Embed text using CLIP."""
    inputs = clip_processor(
        text=text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=77  # CLIP's max token length
    )
    with torch.no_grad():
        features = clip_model.get_text_features(**inputs)
        # Normalize embeddings
        features = features / features.norm(dim=-1, keepdim=True)
        return features.squeeze().numpy()

In [6]:
## Process PDF
pdf_path="Rag.pdf"
doc=fitz.open(pdf_path)
# Storage for all documents and embeddings
all_docs = []
all_embeddings = []
image_data_store = {}  # Store actual image data for LLM

# Text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [7]:
doc


Document('Rag.pdf')

In [8]:
for i,page in enumerate(doc):
    ## process text
    text=page.get_text()
    if text.strip():
        ##create temporary document for splitting
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        text_chunks = splitter.split_documents([temp_doc])

        #Embed each chunk using CLIP
        for chunk in text_chunks:
            embedding = embed_text(chunk.page_content)
            all_embeddings.append(embedding)
            all_docs.append(chunk)



    ## process images
    ##Three Important Actions:

    ##Convert PDF image to PIL format
    ##Store as base64 for GPT-4V (which needs base64 images)
    ##Create CLIP embedding for retrieval

    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Store image as base64 for later use with GPT-4V
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()
            image_data_store[image_id] = img_base64

            # Embed image using CLIP
            embedding = embed_image(pil_image)
            all_embeddings.append(embedding)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
            all_docs.append(image_doc)

        except Exception as e:
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

doc.close()

In [9]:
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='ScienceDirect\nAvailable online at www.sciencedirect.com\nProcedia Computer Science 246 (2024) 3781–3790\n1877-0509 © 2024 The Authors. Published by Elsevier B.V.\nThis is an open access article under the CC BY-NC-ND license (https://creativecommons.org/licenses/by-nc-nd/4.0)\nPeer-review under responsibility of the scientific committee of the 28th International Conference on Knowledge \nBased and Intelligent information and Engineering Systems\n10.1016/j.procs.2024.09.178'),
 Document(metadata={'page': 0, 'type': 'text'}, page_content='Based and Intelligent information and Engineering Systems\n10.1016/j.procs.2024.09.178\nKeywords: Large Language Models (LLMs); Natural Language Processing (NLP); Retrieval-Augmented Generation (RAG); Text generation; \nDigital transformation. \n1. Introduction \nDigital transformation signifies the incorporation of digital technology across different facets of a business, \nreshaping its ope

In [10]:
# Create unified FAISS vector store with CLIP embeddings
embeddings_array = np.array(all_embeddings)
embeddings_array

array([[ 0.03966911,  0.02233979,  0.03567739, ..., -0.02313417,
        -0.04350961, -0.0356045 ],
       [-0.00800639, -0.01203629, -0.00161855, ...,  0.00660626,
         0.00440606, -0.06144243],
       [ 0.00099998,  0.01867807, -0.02668937, ..., -0.09358583,
        -0.03183691,  0.03085243],
       ...,
       [ 0.07239746, -0.04516119, -0.02439865, ..., -0.04424017,
        -0.04348588, -0.01945706],
       [-0.00251454, -0.04437987, -0.01670402, ..., -0.03156574,
        -0.08298665, -0.03596834],
       [-0.0059841 , -0.00655963, -0.03402936, ..., -0.03022855,
        -0.0441472 , -0.02386659]], dtype=float32)

In [11]:
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='ScienceDirect\nAvailable online at www.sciencedirect.com\nProcedia Computer Science 246 (2024) 3781–3790\n1877-0509 © 2024 The Authors. Published by Elsevier B.V.\nThis is an open access article under the CC BY-NC-ND license (https://creativecommons.org/licenses/by-nc-nd/4.0)\nPeer-review under responsibility of the scientific committee of the 28th International Conference on Knowledge \nBased and Intelligent information and Engineering Systems\n10.1016/j.procs.2024.09.178'),
  Document(metadata={'page': 0, 'type': 'text'}, page_content='Based and Intelligent information and Engineering Systems\n10.1016/j.procs.2024.09.178\nKeywords: Large Language Models (LLMs); Natural Language Processing (NLP); Retrieval-Augmented Generation (RAG); Text generation; \nDigital transformation. \n1. Introduction \nDigital transformation signifies the incorporation of digital technology across different facets of a business, \nreshaping its o

In [12]:
# Create custom FAISS index since we have precomputed embeddings
vector_store = FAISS.from_embeddings(
    text_embeddings=[(doc.page_content, emb) for doc, emb in zip(all_docs, embeddings_array)],
    embedding=None,  # We're using precomputed embeddings
    metadatas=[doc.metadata for doc in all_docs]
)
vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x7df2f1b39d10>

In [13]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Initialize Gemini model
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
llm

ChatGoogleGenerativeAI(model='models/gemini-1.5-flash-latest', google_api_key=SecretStr('**********'), client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x7df2ee14e050>, default_metadata=(), model_kwargs={})

In [14]:
def retrieve_multimodal(query, k=5):
    """Unified retrieval using CLIP embeddings for both text and images."""
    # Embed query using CLIP
    query_embedding = embed_text(query)

    # Search in unified vector store
    results = vector_store.similarity_search_by_vector(
        embedding=query_embedding,
        k=k
    )

    return results

In [15]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for GPT-4V."""
    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })

    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)

In [16]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)

    # Create multimodal message
    message = create_multimodal_message(query, context_docs)

    # Get response from GPT-4V
    response = llm.invoke([message])

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response.content

In [19]:
if __name__ == "__main__":
    # Example queries
    queries = [
        #"What does the chart on page 1 show about revenue trends?",
        #"Summarize the main findings from the document",
        "What visual elements are present in the document?",
        "What is the abstract ot the paper?",
        "QWhat are those components of the paper?"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What visual elements are present in the document?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 7: 3) Educational 
- Educational decision making [10] 
- Textbook QA [11] 
4) Technology and Software D...
  - Text from page 1: articles, the system can respond to current business events, presenting opportunities for business i...
  - Text from page 7: 1) Medical / Biomedical 
- Biomedical QA [1] 
- Medical QA [3] 
- Medical text summarization [4] 
- ...
  - Text from page 6: 4) Text Analysis and Processing 
- Sentiments classification [13] 
- Text error correction [35] 
- T...
  - Text from page 6: Text Generation and Summarization: (6) 
Information Retrieval and Extraction: (6) 
Text Analysis and...


Answer: Based on the provided text, the only visual element explicitly mentioned is a figure:  "Fig. 1. (a) A generic RAG architecture, where users’ queries, potentially in different modalities (e.g., text, code, image, etc.), are inpu