# Assignment: Create a retrieval-augmented chain that accepts search terms and returns results from your vector DB or in-memory dictionary

In [0]:
!pip install langchain==0.3.0 langchain-chroma==0.1.4 langchain-community==0.3.0 langchain-core==0.3.0 langchain-huggingface==0.1.0 langchain-text-splitters==0.3.0 pypdf

In [0]:
# Restart the notebook's Python process so that the packages installed in the
# previous cell are the versions picked up by the imports that follow.
dbutils.library.restartPython() 

In [0]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

## 1. Set Up models & Vector Store

In [0]:
import os
os.environ["HUGGINGFACEHUB_API_TOKEN "] = ""

In [0]:
# Create a Chroma vector store
embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-m3",
)

# Create the LLM
llm = HuggingFaceEndpoint(
    repo_id="microsoft/Phi-3-mini-4k-instruct",
    task="text-generation",
    huggingfacehub_api_token=os.environ.get("HUGGINGFACEHUB_API_TOKEN "),
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    temperature=0.01,
    do_sample=False,
    repetition_penalty=1.03,
)

chat_llm = ChatHuggingFace(llm=llm, verbose=True)

## 2. Adding document corpus to Vector Store

In [0]:
from langchain_community.document_loaders import PyPDFLoader

def get_docs(path):
    """Load a PDF file and return its content as LangChain documents.

    The documents are returned unsplit: chunking is done once, afterwards, by
    the notebook's own text splitter. ``load_and_split`` would pre-chunk the
    text with a default splitter first, so the corpus ended up being split
    twice with two different configurations.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The list of documents produced by ``PyPDFLoader(path).load()``.
    """
    loader = PyPDFLoader(path)
    return loader.load()

In [0]:
# Gather the documents of every PDF in the volume, leaving out any file whose
# path mentions "vision_transformers".
docs = []

for file_info in dbutils.fs.ls("/Volumes/dbx_genai_classroom/rag/retreival_docs/"):
    file_path = file_info.path
    # Require a real ".pdf" extension (any letter case) instead of accepting
    # every name that merely ends with the letters "pdf".
    if "vision_transformers" in file_path or not file_path.lower().endswith(".pdf"):
        continue
    # Remove only a leading "dbfs:" scheme so the loader receives a plain path;
    # an occurrence elsewhere in the path is left untouched.
    if file_path.startswith("dbfs:"):
        file_path = file_path[len("dbfs:"):]
    docs += get_docs(file_path)

In [0]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chunking parameters: chunks of size 1000 with an overlap of 200 between neighbours.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Cut every loaded document into chunks ready to be embedded.
splits = text_splitter.split_documents(docs)

In [0]:
# Chroma collection that will hold the embedded chunks; vectors are computed
# with the `embeddings` model. No persist directory is passed here.
collection_name = "ArunachalPradesh"
vector_store = Chroma(embedding_function=embeddings, collection_name=collection_name)

**Note**: Run this cell only once per session; it populates the vector store with the document chunks.

In [0]:
import hashlib


def _chunk_id(position, chunk):
    """Build a reproducible id from a chunk's position and a hash of its text."""
    digest = hashlib.sha256(chunk.page_content.encode("utf-8")).hexdigest()
    return f"{position}-{digest}"


# Fail early with a readable message instead of an error from deep inside the
# vector store when nothing was loaded.
if not splits:
    raise ValueError("No document chunks to index: check that the PDF folder is not empty.")

# Deterministic ids make this cell safe to re-run: the same chunk always maps
# to the same id, so (assuming the store upserts by id) a second run overwrites
# the existing entries rather than adding duplicates. The position prefix keeps
# ids unique even when two chunks have identical text.
chunk_ids = [_chunk_id(position, chunk) for position, chunk in enumerate(splits)]

vector_store.add_documents(splits, ids=chunk_ids)

## 3. Augmented Text Generation

In [0]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder

In [0]:
# --- Contextualize question ---
# Instruction for the step that condenses the chat history and the newest user
# message into a single standalone question.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate one standalone question which can be understood "
    "without the chat history. Ensure not to loose the semantic & syntactic meaning of the question."
)

# Message layout: system instruction, then the earlier turns, then the new input.
contextualize_messages = [
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
contextualize_q_prompt = ChatPromptTemplate.from_messages(contextualize_messages)

# Retriever over the vector store, using its default search settings.
retriever = vector_store.as_retriever()
print("Initialized Retriever")

# Retriever that is given the chat model and the contextualizing prompt along
# with the plain retriever.
history_aware_retriever = create_history_aware_retriever(
    llm=chat_llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

print("Created History aware Retriever")

# Answer question #
# System instruction for the answering step; the "{context}" placeholder is
# where the retrieved documents are inserted.
system_prompt = (
    "You are an expert on the state of Arunchal Pradesh adept at question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# qa_prompt is a chat prompt, so send it through `chat_llm` - the same
# chat-model wrapper the history-aware retriever uses - rather than the raw
# completion endpoint `llm`. Both steps then handle their messages the same way.
question_answer_chain = create_stuff_documents_chain(chat_llm, qa_prompt)

# Full pipeline: retrieve documents for the question, then answer from them.
rag_chain = create_retrieval_chain(
    history_aware_retriever, question_answer_chain
)

print("Created RAG Chain")

Initialized Retriever
Created History aware Retriever
Created RAG Chain


In [0]:

from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories are kept in the module-level ``store`` dict, so the same object
    is handed back on every later call with the same id.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history

In [0]:
# Names of the input/output fields that carry the user message, the prior
# turns and the generated answer.
history_keys = {
    "input_messages_key": "input",
    "history_messages_key": "chat_history",
    "output_messages_key": "answer",
}

# Wrap the RAG chain with per-session message history, looked up through
# get_session_history.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    **history_keys,
)

In [0]:
# The session id selects which stored chat history this call belongs to.
session_config = {"configurable": {"session_id": "uniqueid001"}}
user_question = {"input": "What are the primary animals found in Arunchal Pradesh?"}

# Run the conversational chain once for the question above.
generated_content = conversational_rag_chain.invoke(user_question, config=session_config)

In [None]:
# Show the value stored under the "answer" key of the chain's output.
print(generated_content["answer"])

In [None]:
# List each document stored under "context", with the file it came from,
# separated by a ruler line.
separator = "------------------------------------------------------------------------------------------"
for retrieved_doc in generated_content["context"]:
    source_path = retrieved_doc.metadata["source"]
    print(separator)
    print(f"Context taken from : {source_path}")
    print(retrieved_doc.page_content)