In [1]:
!pip install groq==0.9.0 chromadb==0.5.5 langchain-chroma==0.1.2 langchain==0.2.11 langchain-community==0.2.10 langchain-text-splitters==0.2.2 langchain-groq==0.1.6 transformers==4.43.2 sentence-transformers==3.0.1 unstructured==0.15.0 unstructured[pdf]==0.15.0



In [2]:
import os
from google.colab import userdata
from zipfile import ZipFile

os.environ['GROQ_API_KEY'] = userdata.get("GROQ_API_KEY")

with ZipFile('/content/txt.zip') as zip_ref:
  zip_ref.extractall()

In [7]:
from langchain.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain

In [4]:
# load the documents from the directory
loader = DirectoryLoader('/content/txt')

documents = loader.load()

In [5]:
# check the number of characters in the documents
token_count = [len(data.page_content) for data in documents]
print(token_count)

[409, 510, 450, 373]


In [6]:
# initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 200,
    chunk_overlap = 50
)

# split the documents into chunks
texts = text_splitter.split_documents(documents)

In [8]:
# initialize the embedding model
embedding = HuggingFaceEmbeddings()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [9]:
persist_directory = 'doc-db'

In [10]:
# initialize the vector DB
vector_db = Chroma.from_documents(
    documents = texts,
    embedding = embedding,
    persist_directory= persist_directory
)

In [11]:
# initialize the retriever
retriever = vector_db.as_retriever()

In [12]:
# initialize the llm
llm = ChatGroq(
    model = "llama-3.1-70b-versatile",
    temperature = 0
)

# **Add chat history**

In [13]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

contextualize_system_prompt = (
    "Using the chat history answer the latest questions, just reformulate if needed and otherwise return as it is"
)

contextualize_prompt = ChatPromptTemplate.from_messages(
    [
        ("system",contextualize_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human","{input}")
    ]
)

# create the history aware retriever
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_prompt
)

In [14]:
# creating the prompt template for QA chain and RAG chain
system_prompt = (
    "You are an intelligent chatbot. Answer the questions using the following context. If you don't know the answer just say that you don't know."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system",system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human","{input}")
    ]
)

In [15]:
# create the QA chain
qa_chain = create_stuff_documents_chain(llm, prompt)

# create the history aware RAG chain
rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)

# **Manage Chat session history**

In [17]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# initialize the store for message history
store = {}

# function to get the session history for a given session id
def get_session_history(session_id: str) -> BaseChatMessageHistory:
  if session_id not in store:
    store[session_id] = ChatMessageHistory()
  return store[session_id]

# conversational history aware rag chain
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key = "input",
    history_messages_key = "chat_history",
    output_messages_key = "answer"
)

In [18]:
# Invoke the conversational RAG chain using example questions
response = conversational_rag_chain.invoke(
    {"input":"What are new thing on AI?"},
    config = {"configurable":{"session_id":"101"}}
)

print(response['answer'])

Significant advancements in artificial intelligence have led to breakthroughs in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs.


In [19]:
response = conversational_rag_chain.invoke(
    {"input":"Who is the current captain of team India?"},
    config = {"configurable":{"session_id":"101"}}
)

print(response['answer'])

The current captain of the Indian cricket team is Rohit Sharma.


In [20]:
response = conversational_rag_chain.invoke(
    {"input":"provide me an update on the election."},
    config = {"configurable":{"session_id":"101"}}
)

print(response['answer'])

The presidential election is heating up with a highly competitive atmosphere. Recent polls indicate a tight race between incumbent President Jane Doe and her main rival, Senator John Smith. President Jane Doe is focusing on economic stability and healthcare reform, while Senator John Smith is emphasizing education and climate change.


In [21]:
response = conversational_rag_chain.invoke(
    {"input":"Provide me a summary of what we discussed so far."},
    config = {"configurable":{"session_id":"101"}}
)

print(response['answer'])

We discussed the following topics:

1. Artificial Intelligence (AI): I mentioned that there have been significant advancements in AI, particularly in healthcare, with AI-driven diagnostics improving patient outcomes and reducing costs.

2. Indian Cricket Team: I informed you that Rohit Sharma is the current captain of the Indian cricket team.

3. Election Update: I provided a general update that the election season is heating up with a highly competitive atmosphere, but I didn't have any specific information about the candidates or the election.
