In [1]:
import os
from typing import List
from pydantic import BaseModel
from langchain_core.documents import Document
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langgraph.graph import StateGraph, END

In [2]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai.chat_models import ChatGoogleGenerativeAI
import os
# Gemini chat model used for plain text generation in this notebook.
model = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_output_tokens=1000,
    google_api_key=os.getenv("GOOGLE_API_KEY"),  # os.getenv yields None when the variable is unset
)

# Smoke test: the reply is the cell's last expression, so it is rendered as the output.
model.invoke("Hello")

AIMessage(content='Hello! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash', 'safety_ratings': [], 'grounding_metadata': {}, 'model_provider': 'google_genai'}, id='lc_run--c07d7da8-eba0-4cc7-9fa2-02efd28f6fe3-0', usage_metadata={'input_tokens': 1, 'output_tokens': 10, 'total_tokens': 11, 'input_token_details': {'cache_read': 0}})

In [3]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
# Embedding client; the API key is looked up in the environment.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=os.getenv("GOOGLE_API_KEY"),
    model="models/text-embedding-004",
)

# NOTE: a bare name that is not the last expression of a cell is evaluated but not displayed.
embeddings

# Embed one sample sentence; the resulting vector is the cell's output.
text = "Hello, I am learning about embeddings!"
em = embeddings.embed_query(text)
em

[0.0057954853400588036,
 -0.020777961239218712,
 -0.073767751455307,
 0.005520607344806194,
 0.03836417198181152,
 0.008309532888233662,
 0.05843047797679901,
 0.0198830459266901,
 -0.018817421048879623,
 0.0014214477268978953,
 -0.003038973780348897,
 0.05139459669589996,
 0.0849136933684349,
 -0.017072148621082306,
 0.005389293190091848,
 -0.040984898805618286,
 0.03339604660868645,
 0.023471752181649208,
 -0.10320930182933807,
 0.04719139635562897,
 0.01596192456781864,
 -0.019129928201436996,
 0.011023784056305885,
 -0.025355495512485504,
 -0.03703681007027626,
 -0.008627268485724926,
 0.006926806177943945,
 -0.029250923544168472,
 0.015356908552348614,
 -0.01128516998142004,
 0.04638025164604187,
 0.08751466125249863,
 0.02563812956213951,
 -0.04505224898457527,
 -0.0009145495132543147,
 0.007491415832191706,
 -0.005396759137511253,
 0.015194687992334366,
 0.07375132292509079,
 -0.05054553970694542,
 -0.0762038454413414,
 0.021329674869775772,
 -0.01231074333190918,
 0.06367276608

In [4]:
import fitz
# Open the sample PDF via fitz; the handle stays open for use by later cells.
pdf_path = "multimodal_sample.pdf"
doc = fitz.open(pdf_path)

# Accumulators for all documents and their embeddings.
all_docs, all_embeddings = [], []
image_data_store = dict()  # Store actual image data for LLM

# Splitter producing 500-character chunks that overlap by 100 characters.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)

In [9]:
from langchain_community.document_loaders import PyPDFLoader

# Read the PDF into LangChain documents.
loader = PyPDFLoader("multimodal_sample.pdf")
docs = loader.load()

# Break the loaded documents into 1000-character chunks overlapping by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
text_chunks = text_splitter.split_documents(docs)

In [10]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import HumanMessage
from PIL import Image
import io

def get_image_summary(image_path: str, llm: ChatGoogleGenerativeAI) -> str:
    """Ask a multimodal chat model for a retrieval-oriented description of an image.

    The image is re-encoded as PNG and attached to the message as a base64
    data URL. Message content parts are expected to be strings or dicts, so a
    raw ``PIL.Image`` object cannot be placed in the content list directly.

    Args:
        image_path: Path of the image file to describe.
        llm: Chat model that accepts ``image_url`` content parts.

    Returns:
        The ``content`` attribute of the model's response.

    Raises:
        Any exception raised by ``Image.open`` (e.g. missing or unreadable
        file) or by ``llm.invoke`` is propagated unchanged.
    """
    import base64  # local import so the function does not rely on another cell

    # Load the image and serialise it to PNG bytes in memory; the context
    # manager closes the underlying file once the bytes have been written.
    with Image.open(image_path) as img:
        # PNG cannot store every PIL mode (e.g. CMYK); convert those to RGB first.
        png_ready = img if img.mode in ("1", "L", "LA", "P", "RGB", "RGBA") else img.convert("RGB")
        buffer = io.BytesIO()
        png_ready.save(buffer, format="PNG")

    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    data_url = f"data:image/png;base64,{encoded}"

    # Create the multimodal message: one text part and one image part.
    message = HumanMessage(
        content=[
            {
                "type": "text",
                "text": "Describe this image in detail for a retrieval system. Include all key subjects, text in the image, and any data points (like in a graph or table).",
            },
            {
                "type": "image_url",
                "image_url": {"url": data_url},
            },
        ]
    )

    # Invoke the Gemini model to get the description
    response = llm.invoke([message])
    return response.content

# Multimodal Gemini chat model, intended to be passed as `llm` to get_image_summary.
gemini_llm_vision = ChatGoogleGenerativeAI(
    temperature=0.0,
    model="gemini-2.5-flash",
)

# Usage sketch (left disabled):
#   image_summary = get_image_summary("your_image.jpg", gemini_llm_vision)
# The summary would then be stored together with a reference to the image file.

In [11]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import FAISS # Example Vector Store

# Embedding model used to index the corpus. This rebinds the name `embeddings`.
embeddings = GoogleGenerativeAIEmbeddings(
    model="gemini-embedding-001",
)

# Documents to index: currently just the text chunks (image-summary documents
# would be appended here). This is the same list object as text_chunks, not a copy.
all_docs_to_embed = text_chunks

# Build the FAISS index from the documents, then expose it as a retriever.
vectorstore = FAISS.from_documents(
    all_docs_to_embed,
    embeddings,
)
retriever = vectorstore.as_retriever()

In [13]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Step 1 - prompt: answer only from the supplied context, admit when the answer is unknown.
template = """
You are an expert Q&A system. Use the following context to answer the question.
If you can't find the answer, just state that you don't know, don't try to make up an answer.
Include references to image file names if the context includes an image description.

Context:
{context}

Question: {question}
"""
rag_prompt = ChatPromptTemplate.from_template(template)

# Step 2 - answering model.
final_llm = ChatGoogleGenerativeAI(
    temperature=0.2,
    model="gemini-2.5-flash",
)

# Step 3 - chain: the incoming question is sent to the retriever to fill {context}
# and forwarded unchanged as {question}; the rendered prompt goes to the model and
# the reply is parsed into a plain string.
# NOTE(review): the retriever output is inserted into the prompt as-is, with no
# formatting step in between - confirm this is intended.
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | rag_prompt | final_llm | StrOutputParser()

# Step 4 - run a sample question and print the answer.
response = rag_chain.invoke("What does the chart on page 1 show about revenue trends?")
print(response)

The chart on page 1 shows that revenue grew steadily, with the highest growth recorded in Q3.
