# Adaptive RAG

This notebook combines query analysis (routing each question to the right data source) with active, self-corrective RAG.

In [None]:
%pip install -U langchain_community tiktoken langchain-google-genai langchain-huggingface langchainhub chromadb langchain langgraph tavily-python sentence-transformers

In [2]:
import getpass
import os

def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = input(f"{var}: ")

# Make sure both API keys are available; _set_env only prompts for the ones
# that are not already present in the environment.
for _required_key in ("GEMINI_API_KEY", "TAVILY_API_KEY"):
    _set_env(_required_key)


### Create Index

Setting up a vector database using **HuggingFace** for embeddings (free; the model is cached on your machine) and the **Chroma vector database**. Data is retrieved directly from the URLs specified.


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Setting up embedding model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Docs to index
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Load the documents: each loader call returns a list, so `docs` is a list of lists
docs = [WebBaseLoader(url).load() for url in urls]
# Flatten [[doc1], [doc2], [doc3]] into [doc1, doc2, doc3].
# The comprehension is equivalent to:
#     docs_list = []
#     for sublist in docs:
#         for item in sublist:
#             docs_list.append(item)
docs_list = [item for sublist in docs for item in sublist]

# Splitting the documents into manageable chunks (sizes are measured in tiktoken tokens)
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=0
)
doc_splits = text_splitter.split_documents(docs_list)

# Deterministic ids (source + position in doc_splits). Because the collection is
# persisted to ./chroma_db, inserting without ids would add a fresh copy of every
# chunk each time this cell is re-run; with stable ids a re-run overwrites the
# existing entries instead. The running index keeps ids unique even when two
# chunks share the same source.
# NOTE: if ./chroma_db was already populated by an earlier run that did not pass
# ids, delete that directory once to drop the old duplicate entries.
doc_split_ids = [
    f"{split.metadata.get('source', 'unknown')}::chunk-{idx}"
    for idx, split in enumerate(doc_splits)
]

# Create vectorstore
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    ids=doc_split_ids,
    collection_name="rag-chroma",
    embedding=embeddings,
    persist_directory="./chroma_db",
)

retriever = vectorstore.as_retriever()

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


#### **Query Analysis via a Router**

In the prompt we need to define which topics should be routed to the vectorstore (RAG).

This step is manual for now. It could be automated by letting the LLM summarize the indexed documents and write the router prompt itself,
but that becomes very expensive for large documents. Since we're only learning and experimenting here,
I've kept it manual.

In [4]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

from pydantic import BaseModel, Field
import os

class RouteQuery(BaseModel):
    """Route a user query to the most relevant datasource."""

    # NOTE: only the first string literal in a class body is its docstring, and
    # pydantic uses that docstring as the description of the generated schema,
    # so it should describe the task rather than be a note to the reader.

    # Literal restricts the value to exactly these two routing targets.
    datasource: Literal["vectorstore", "web_search"] = Field(
        ...,
        description="Given a user query choose to route it to a web search or a vector store",
    )

# Chat model used by the router; temperature is fixed at 0.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=os.getenv("GEMINI_API_KEY"),
    temperature=0,
)
# Optional sanity check that the model is reachable (uncomment to run):
#   print(llm.invoke("Hi How are you").content)

# Same model, but with its reply parsed into a RouteQuery object.
structured_llm_router = llm.with_structured_output(RouteQuery)

# System prompt: tells the model which topics live in the vectorstore.
system = """You are an expert at routing a user question to a vectorstore or web search.
The vectorstore contains documents related to agents, prompt engineering, and adversarial attacks.
Use the vectorstore for questions on these topics. Otherwise, use web-search."""
route_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# Chain: the input dict fills route_prompt, and the rendered prompt is passed
# to the structured-output model.
question_router = route_prompt | structured_llm_router

# Try one question per route.
for sample_question in (
    "What are the types of agent memory?",
    "Who won the FIFA worldcup in 2022?",
):
    print(question_router.invoke({"question": sample_question}))

datasource='vectorstore'
datasource='web_search'


#### **Retrieval Grader**

After performing the retrieval, we'll evaluate the results. This is a second check: even though the router chose RAG based
on the query, we still make sure that the retrieved document content is sufficiently relevant to the query.

Again we'll let the LLM decide; its output will be a binary 'yes' or 'no'.


In [5]:
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Literal (as in RouteQuery) restricts the score to exactly 'yes' or 'no',
    # so case variants such as 'Yes' cannot slip through as valid values.
    binary_score: Literal["yes", "no"] = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Structured-output wrapper for the document grader. It gets its own name so
# that `structured_llm_router` keeps referring to the RouteQuery router instead
# of being silently rebound to a grader.
structured_llm_doc_grader = llm.with_structured_output(GradeDocuments)

# System prompt 
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

grade_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Retrieved document: \n\n {document} \n\n User question: {question}")
])

# Chain: rendered grading prompt -> model reply parsed into GradeDocuments
retrieval_grader = grade_prompt | structured_llm_doc_grader

# Testing the retrieval grader with one on-topic and one off-topic query.
# For each query the second returned document (index 1) is graded.
for question in ("agent memory", "Mercedes benz"):
    docs = retriever.invoke(question)
    doc_content = docs[1].page_content
    print(question)
    print(retrieval_grader.invoke({"question": question, "document": doc_content}))

agent memory
binary_score='yes'
Mercedes benz
binary_score='no'


In [6]:
# Generate

from langchain_core.output_parsers import StrOutputParser
from IPython.display import Markdown, display

# System prompt for answer generation. The dash below was previously stored as
# mis-decoded bytes ("â€”"), which were sent to the model verbatim.
system = """
You are a helpful assistant for question-answering tasks.
Use the following retrieved context to answer the user's question.

If you don't find the answer in the context, say you don't know — do not make up an answer.
"""

# Human message template: the retrieved text and the question are filled in at
# invoke time.
human = """
Context:
{context}

Question:
{question}
"""

generate_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", human),
])

# Retrieve documents for a sample question
question = "agent memory"
docs = retriever.invoke(question)

# Join the page contents into one context string, separated by blank lines
docs_txt = "\n\n".join(doc.page_content for doc in docs)

# Chain: prompt -> chat model -> plain string
generate_rag_chain = generate_prompt | llm | StrOutputParser()

generation = generate_rag_chain.invoke({"context": docs_txt, "question": question})

# Render the answer as Markdown in the notebook output
display(Markdown(generation))

In a LLM-powered autonomous agent system, memory is a key component, and it is categorized into two main types:

1.  **Short-term memory:** This refers to the in-context learning capabilities of the model, often utilized through prompt engineering.
2.  **Long-term memory:** This allows the agent to retain and recall information over extended periods. It often leverages an external vector store and fast retrieval mechanisms to provide access to an "infinite" amount of information. Maximum Inner Product Search (MIPS) is mentioned as a method related to long-term memory.

### **Hallucination Grader**

This grader verifies whether the LLM's generated answer is grounded in the retrieved documents, i.e. whether the model hallucinated while producing the output.

In [7]:
class GradeHallucinations(BaseModel):
    """Binary score for whether a generated answer is grounded in the retrieved facts."""

    # Literal (as in RouteQuery) restricts the score to exactly 'yes' or 'no'.
    # The grader prompt itself spells the value as 'Yes', so a free-form str
    # field could come back in a different case than callers compare against.
    binary_score: Literal["yes", "no"] = Field(
        description="Grounded answer in the facts, 'yes' or 'no'"
    )

# Chat model wrapped so that its reply is parsed into a GradeHallucinations object.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Grader instructions (system message).
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
     Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts."""

# The human message carries the retrieved facts and the answer to be checked.
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
)

# Chain: rendered prompt -> structured grader output.
hallucination_grader = hallucination_prompt | structured_llm_grader

# Smoke test using `docs` and `generation` produced by the generation cell above.
grader_inputs = {"documents": docs, "generation": generation}
hallucination_grader.invoke(grader_inputs)

GradeHallucinations(binary_score='yes')