In [None]:
import os
from google.colab import userdata

# Copy the GOOGLE_API_KEY Colab secret into this process's environment variables.
os.environ.update(GOOGLE_API_KEY=userdata.get("GOOGLE_API_KEY"))


In [None]:
!pip install langchain langchain-core langchain-community langchain-google-genai langchain-text-splitters faiss-cpu transformers pillow scikit-learn pymupdf requests




In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.tools import tool
import requests
import fitz  # PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [None]:

# One constant for the checkpoint so the model and its processor cannot drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
# Switch to evaluation mode; the trailing semicolon keeps the long module repr
# out of the cell output.
clip_model.eval();


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [None]:
!pip install torch



In [None]:
### Embedding functions
def embed_image(image_data):
    """Return the CLIP image embedding of ``image_data`` as a normalized NumPy array.

    ``image_data`` is either a string path, which is opened and converted to
    RGB, or any other object, which is handed to the CLIP processor unchanged
    (the page-processing loop in this notebook passes a PIL image).
    """
    picture = (
        Image.open(image_data).convert("RGB")
        if isinstance(image_data, str)
        else image_data
    )
    model_inputs = clip_processor(images=picture, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        raw_features = clip_model.get_image_features(**model_inputs)
        # Divide by the L2 norm along the last axis to get unit-length vectors.
        unit_features = raw_features / raw_features.norm(dim=-1, keepdim=True)

    return unit_features.squeeze().numpy()

def embed_text(text):
    """Return the CLIP text embedding of ``text`` as a normalized NumPy array.

    The input is padded and truncated to at most 77 tokens before it is passed
    to the CLIP text encoder.
    """
    tokenizer_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": 77,  # CLIP's max token length
    }
    encoded = clip_processor(text=text, **tokenizer_options)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        raw_features = clip_model.get_text_features(**encoded)
        # Divide by the L2 norm along the last axis to get unit-length vectors.
        unit_features = raw_features / raw_features.norm(dim=-1, keepdim=True)

    return unit_features.squeeze().numpy()

In [None]:
from google.colab import files
# Ask the user to upload the PDF through Colab; the call's result is kept in `uploaded`.
uploaded = files.upload()


Saving Growth_of_AI_Report.pdf to Growth_of_AI_Report (1).pdf


In [None]:
# NOTE(review): the upload cell's output reports the file was saved as
# "Growth_of_AI_Report (1).pdf", so this fixed name may open an older copy of
# the report rather than the file just uploaded — confirm.
pdf_path = "Growth_of_AI_Report.pdf"
doc = fitz.open(pdf_path)
# Displayed as the cell result; the later loop iterates `doc` page by page.
len(doc)

8

In [None]:
# Containers filled by the page-processing loop: documents, their embeddings
# (appended in step with the documents), and base64 image payloads keyed by image id.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
all_docs, all_embeddings = [], []
image_data_store = dict()

In [None]:
# Show the repr of the opened PDF handle as a quick sanity check.
doc

Document('Growth_of_AI_Report.pdf')

In [None]:
import torch

# Collect into fresh containers and publish them at the end, so re-running this
# cell replaces the previous results instead of appending a second copy of every
# chunk and image (duplicates would crowd the top-k retrieval results).
fresh_docs = []
fresh_embeddings = []
fresh_images = {}

for i, page in enumerate(doc):
    # ---- Process text ----
    text = page.get_text()
    if text.strip():
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})

        for chunk in splitter.split_documents([temp_doc]):
            fresh_embeddings.append(embed_text(chunk.page_content))
            fresh_docs.append(chunk)

    # ---- Process images ----
    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Unique identifier built from the page index and the image's
            # position on that page.
            image_id = f"page_{i}_img_{img_index}"

            # Re-encode as PNG and keep it as base64 text so it can be embedded
            # in a data URL later.
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            image_embedding = embed_image(pil_image)

            # Placeholder document standing in for the image in the vector store.
            image_doc = Document(
                page_content=f"[Image : {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )
        except Exception as e:
            # Best effort: report the image that failed and move on to the next one.
            print(f"Error processing image {img_index} on page {i}: {e}")
            continue

        # Record all three pieces only after every step succeeded, so documents,
        # embeddings and stored payloads stay aligned with each other.
        fresh_images[image_id] = img_base64
        fresh_embeddings.append(image_embedding)
        fresh_docs.append(image_doc)

doc.close()

# Publish the results under the names the later cells read.
all_docs = fresh_docs
all_embeddings = fresh_embeddings
image_data_store = fresh_images


In [None]:
# Inspect the collected documents (text chunks and image placeholders).
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Growth of Artificial Intelligence\nA Comprehensive Analysis with Visuals'),
 Document(metadata={'page': 1, 'type': 'text'}, page_content='Introduction\nArtificial Intelligence (AI) is one of the most transformative technologies of the 21st century. From\nhealthcare to finance, education to entertainment, AI has penetrated nearly every sector. This\nreport explores the growth of AI with supporting graphs, charts, and statistics.'),
 Document(metadata={'page': 2, 'type': 'text'}, page_content='AI Market Growth Over the Years'),
 Document(metadata={'page': 2, 'type': 'image', 'image_id': 'page_2_img_0'}, page_content='[Image : page_2_img_0]'),
 Document(metadata={'page': 3, 'type': 'text'}, page_content='Distribution of AI Applications'),
 Document(metadata={'page': 3, 'type': 'image', 'image_id': 'page_3_img_0'}, page_content='[Image : page_3_img_0]'),
 Document(metadata={'page': 4, 'type': 'text'}, page_content='AI Research P

In [None]:
# Stack the collected CLIP vectors into a single array, one row per entry of
# all_embeddings; a later cell feeds it to the FAISS vector store.
import numpy as np
embeddings_array = np.asarray(all_embeddings)
embeddings_array

array([[ 0.00092127,  0.02736879, -0.03505028, ...,  0.02691303,
        -0.00488226,  0.01114095],
       [ 0.00615251,  0.02297794, -0.00556711, ..., -0.0919802 ,
        -0.00385127,  0.0151478 ],
       [-0.02410325,  0.01422895, -0.01279633, ..., -0.01356603,
        -0.0208317 ,  0.02439968],
       ...,
       [ 0.01711361, -0.06173725, -0.02766114, ...,  0.08771243,
         0.03944656,  0.00121575],
       [-0.00483633, -0.00846883, -0.01400868, ..., -0.01663832,
        -0.02785842, -0.01317464],
       [-0.00231355,  0.02196206, -0.00754419, ..., -0.01937577,
         0.00074463,  0.00721601]], dtype=float32)

In [None]:
pip install chromadb




In [None]:
pip install -qU langchain_community faiss-cpu

In [None]:
from langchain_community.vectorstores import FAISS
# Pair each document's text with its precomputed CLIP vector, in all_docs order.
text_vector_pairs = [
    (item.page_content, vector)
    for item, vector in zip(all_docs, embeddings_array)
]
doc_metadatas = [item.metadata for item in all_docs]

vector_store = FAISS.from_embeddings(
    text_embeddings=text_vector_pairs,
    embedding=None,  # vectors are supplied directly, so no embedding model is passed
    metadatas=doc_metadatas,
)
vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x798b27ad7530>

In [None]:
# ChatGoogleGenerativeAI is a class inside the langchain_google_genai package,
# not a module of its own; importing it as a module raised ModuleNotFoundError.
from langchain_google_genai import ChatGoogleGenerativeAI

# The constructor needs a model id. The value below is an assumption —
# TODO confirm which Gemini model this project should use (it must accept images).
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")
llm

ModuleNotFoundError: No module named 'ChatGoogleGenerativeAI'

In [None]:
def retrieve_multimodal(query, k=5):
    """Return the ``k`` stored documents closest to ``query`` in the vector store.

    The query string is embedded with ``embed_text`` and the resulting vector is
    used for the similarity search, so hits can be text chunks or image
    placeholders alike.
    """
    return vector_store.similarity_search_by_vector(
        embedding=embed_text(query),
        k=k,
    )



In [None]:
def create_multimodal_message(query, retrieved_docs):
    """Build a single HumanMessage carrying the question plus retrieved context.

    Args:
        query: The user's question.
        retrieved_docs: Documents whose ``metadata["type"]`` is ``"text"`` or
            ``"image"``. Image documents are resolved through
            ``metadata["image_id"]`` against ``image_data_store``.

    Returns:
        A HumanMessage whose content is a list of parts, in this order: the
        question, the text excerpts (if any), a caption and a base64 PNG data
        URL per available image, and one closing instruction.
    """
    content = [{
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    }]

    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    if text_docs:
        text_context = "\n\n".join(
            f"[Page {doc.metadata['page']}]:{doc.page_content}"
            for doc in text_docs
        )
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })

    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        # Skip image hits whose payload is not in the store.
        if not image_id or image_id not in image_data_store:
            continue

        # The caption is a text part: it carries a "text" payload, so its type
        # must be "text" rather than "image".
        content.append({
            "type": "text",
            "text": f"\n[Image from page {doc.metadata['page']}]:\n"
        })
        content.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{image_data_store[image_id]}"
            }
        })

    # Added exactly once, after all context parts, whether or not any image was
    # retrieved.
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)






In [None]:
def multimodal_pdf_rag_pipeline(query):
    """Answer ``query`` from the indexed PDF and return the model's reply content.

    Steps: retrieve the five closest documents, wrap them with the question in
    one multimodal message, send it to ``llm``, then print a short listing of
    what was retrieved.
    """
    context_docs = retrieve_multimodal(query, k=5)
    message = create_multimodal_message(query, context_docs)
    response = llm.invoke([message])

    # Summarize the retrieved context for the reader of the notebook output.
    print(f"\nRetrieved {len(context_docs)} documents:")
    for hit in context_docs:
        page = hit.metadata.get("page", "?")
        if hit.metadata.get("type", "unknown") != "text":
            print(f"  - Image from page {page}")
            continue

        body = hit.page_content
        # Text longer than 100 characters is cut and marked with an ellipsis.
        preview = body if len(body) <= 100 else body[:100] + "..."
        print(f"  - Text from page {page}: {preview}")
    print("\n")

    return response.content

In [None]:
if __name__ == "__main__":
    # Demo run: send a few sample questions through the pipeline and print each answer.
    queries = [
        "What does the chart on page 1 show about AI trends?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?",
    ]
    thin_rule = "-" * 50
    thick_rule = "=" * 70

    for query in queries:
        print(f"\nQuery: {query}")
        print(thin_rule)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print(thick_rule)


Query: What does the chart on page 1 show about AI trends?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 0: Growth of Artificial Intelligence
A Comprehensive Analysis with Visuals
  - Text from page 4: AI Research Publications
  - Text from page 1: Introduction
Artificial Intelligence (AI) is one of the most transformative technologies of the 21st...
  - Text from page 3: Distribution of AI Applications
  - Text from page 5: AI Investment by Sector


Answer: Based on the provided context, page 1 introduces the report and mentions that it explores the growth of AI with supporting graphs, charts, and statistics. While it doesn't explicitly say what the chart on page 1 shows, it implies that the chart will likely depict some aspect of AI's growth or trends.

To know the specific trend displayed by the chart on page 1, you would need to refer to the actual chart.

Query: Summarize the main findings from the document
-------------------------