<a href="https://colab.research.google.com/github/codeREXus/langchain-learnings/blob/main/mini_projs/RAG_QA_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **RAG QA Bot**


In [3]:
%%capture
!pip install "langchain_google_genai"
!pip install "langchain"
!pip install "transformers==4.41.2"
!pip install  "huggingface-hub==0.23.4"
!pip install  "sentence-transformers==2.5.1"
!pip install  "chromadb"
!pip install  "langchain-community"
!pip install  "wget==3.2"
!pip install --upgrade torch --index-url https://download.pytorch.org/whl/cpu

In [2]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain_google_genai import GoogleGenerativeAI, GoogleGenerativeAIEmbeddings
import wget

In [16]:
from google.colab import userdata


# **RAG Architecture**


## 1.   Indexing


*   Load data
*   Pre-process
*   Split into chunks
*   Embed as vectors
* Store in vector db


## 2.   Retrieval and generation
* Retrieval - search the vector store for the chunks whose vectors are most similar to the prompt
* Generation - the model produces an answer from the prompt together with the retrieved chunks



# Load data

In [82]:
import os

filename = 'companyPolicies.txt'
# Strip stray whitespace/newlines that tend to come along with a pasted link.
url = input('Put the link to your document here: ').strip()
if not url:
    raise ValueError('No document URL was provided.')

# wget.download() never overwrites an existing file: when `filename` already
# exists it saves under a suffixed name (e.g. "companyPolicies (1).txt") and
# returns the path it actually wrote. Move that file over `filename` so that
# re-running this cell always leaves the freshly downloaded document in place.
downloaded_path = wget.download(url, out=filename)
if downloaded_path != filename:
    os.replace(downloaded_path, filename)
print('file downloaded')

Put the link to your document here: https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/6JDbUb_L3egv_eOkouY71A.txt
file downloaded


In [7]:
# Sanity check on the download: read the whole document and report how many
# blank-line-separated paragraphs it contains.
with open(filename, 'r') as file:
    contents = file.read()

paragraphs = contents.split("\n\n")
print(len(paragraphs))

94


# Split into chunks

In [14]:
# Load the downloaded text file as LangChain documents.
loader = TextLoader(filename)
docs = loader.load()

# Split on newlines into chunks of at most 1000 characters with no overlap.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(docs)

# Report how many chunks were produced.
print(len(texts))

18


# Embedding
We will make a qualitative comparison of a Hugging Face embedder and a Gemini embedder.

In [18]:
# Deterministic ids, one per chunk, so that re-running this cell updates the
# existing entries instead of adding a second copy of every chunk.
chunk_ids = [f"chunk-{i}" for i in range(len(texts))]

# Each embedder gets its OWN Chroma collection. Without an explicit
# collection_name both from_documents() calls fall back to the same default
# collection, so the two "separate" indexes would end up sharing (and
# duplicating) their chunks, which defeats the side-by-side comparison.
hg_embeding = HuggingFaceEmbeddings()
hg_docsearch = Chroma.from_documents(
    texts,
    hg_embeding,
    ids=chunk_ids,
    collection_name="policies_huggingface",
)

# The Gemini embedder reads its API key from the Colab secret store.
g_embedding = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=userdata.get('google-api-key')
)
g_docsearch = Chroma.from_documents(
    texts,
    g_embedding,
    ids=chunk_ids,
    collection_name="policies_gemini",
)
print("ingested")

ingested


# Initialise an LLM


In [32]:
import pprint

In [22]:
# Gemini model used for answer generation in every chain below; the API key is
# read from the Colab secret store rather than being hardcoded.
llm = GoogleGenerativeAI(
    google_api_key=userdata.get('google-api-key'),
    model='gemini-1.5-flash-latest',
)

# run the chain/qa bot

In [37]:
# Question for the retrieval chain backed by the Hugging Face index.
query = "can you summarise the document for me?"

# "stuff" chain: the retrieved chunks are placed into a single prompt.
hf_retriever = hg_docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=hf_retriever,
    chain_type="stuff",
    return_source_documents=False,
)

# Show the raw response dict, then the length of the query string.
answer = qa.invoke(query)
pprint.pprint(answer)
print(len(query))

{'query': 'can you summarise the document for me?',
 'result': 'The provided text consists of two identical sections outlining a '
           "company's Code of Conduct and a Recruitment Policy.  The Code of "
           'Conduct emphasizes integrity, respect, accountability, safety, and '
           'environmental responsibility.  It details expectations for ethical '
           'behavior, diversity and inclusion, responsible reporting, and '
           'sustainable practices.  The Recruitment Policy is identical to the '
           "Code of Conduct, indicating that the company's recruitment "
           'practices are guided by the same ethical principles.'}
38


In [38]:
from IPython.display import display, Markdown

# 1. Invoke the chain (this is a fresh LLM call, so the wording may differ
#    from the answer printed by the previous cell).
response = qa.invoke(query)

# 2. Safely get the query and result
query_text = response.get('query', 'No query found.')
result_text = response.get('result', 'No result found.')

# 3. Prefix EVERY line with "> ". A single leading ">" only quotes the first
#    paragraph, so multi-paragraph or bulleted answers would otherwise spill
#    out of the blockquote.
quoted_query = "\n".join(
    f"> {line}".rstrip() for line in (str(query_text).splitlines() or [""])
)
quoted_answer = "\n".join(
    f"> {line}".rstrip() for line in (str(result_text).splitlines() or [""])
)

# 4. Create a formatted Markdown string
markdown_output = f"""
### 📝 Query
{quoted_query}

---

### 💡 Answer
{quoted_answer}
"""

# 5. Display the rendered Markdown
display(Markdown(markdown_output))


### 📝 Query
> can you summarise the document for me?

---

### 💡 Answer
> The provided text consists of two identical sections outlining a company's Code of Conduct and a Recruitment Policy (which is also identical to the Code of Conduct).  The Code of Conduct emphasizes integrity, respect, accountability, safety, and environmental responsibility.  It details expectations for ethical behavior, inclusivity, adherence to laws, and proactive safety measures.  The document stresses that the Code of Conduct is a foundational element of the company culture.


In [48]:
# Same chain as before, but retrieving from the Gemini-embedded index.
query = "Can  mobiles be used for gaming in the company?"

gemini_retriever = g_docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=gemini_retriever,
    chain_type="stuff",
    return_source_documents=False,
)

# Show the raw response dict, then the length of the query string.
answer = qa.invoke(query)
pprint.pprint(answer)
print(len(query))

{'query': 'Can  mobiles be used for gaming in the company?',
 'result': 'The provided text does not explicitly address the use of mobile '
           "devices for gaming.  Therefore, I don't know."}
47


In [49]:
from IPython.display import display, Markdown

# Ask again and render the response as Markdown (a fresh LLM call).
response = qa.invoke(query)

query_text = response.get('query', 'No query found.')
result_text = response.get('result', 'No result found.')

# Prefix EVERY line with "> " so multi-paragraph or bulleted answers stay
# inside the blockquote; a single leading ">" only quotes the first paragraph.
quoted_query = "\n".join(
    f"> {line}".rstrip() for line in (str(query_text).splitlines() or [""])
)
quoted_answer = "\n".join(
    f"> {line}".rstrip() for line in (str(result_text).splitlines() or [""])
)

markdown_output = f"""
### 📝 Query
{quoted_query}

---

### 💡 Answer
{quoted_answer}
"""

display(Markdown(markdown_output))


### 📝 Query
> Can  mobiles be used for gaming in the company?

---

### 💡 Answer
> The provided text does not explicitly address the use of mobile devices for gaming.  Therefore, I don't know.



# To prevent hallucination and incorrect text generation, we set up a prompt





*   Without your prompt: The LLM behaved like a helpful general assistant. It prioritized giving a useful answer, even if it had to use its own knowledge.
*   With your prompt: You forced the LLM to behave like a strict document analyst. Its only job is to report what's in the document and nothing else.


In [50]:
# Instruction text for the answering step. {context} is filled with the
# retrieved document chunks and {question} with the user's query.
prompt_template = """Use the information from the document to answer the question at the end. If you don't know the answer, just say that you don't know, definately do not try to make up an answer.

{context}

Question: {question}
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# `context` and `question` are the variable names RetrievalQA's "stuff" chain
# fills in, so the custom prompt is handed over under the "prompt" key.
chain_type_kwargs = dict(prompt=PROMPT)

In [54]:
# Rebuild the Hugging Face-backed chain, this time with the strict prompt
# supplied through chain_type_kwargs.
strict_retriever = hg_docsearch.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=strict_retriever,
    chain_type="stuff",
    return_source_documents=False,
    chain_type_kwargs=chain_type_kwargs,
)

# A question the document does not cover; the response dict is the cell output.
query = "Can I steal company vehicles?"
qa.invoke(query)

{'query': 'Can I steal company vehicles?',
 'result': "The provided text does not address the issue of stealing company vehicles.  Therefore, I don't know."}

## Let's set up memory as well. This helps connect follow-up questions to earlier prompts and answers (demo)

In [55]:
# Conversation memory exposed to the chain under the "chat_history" key.
# The original call passed `return_message=True`, a misspelling of
# `return_messages` that had no effect, so the memory has always returned the
# history as one formatted string. That string form is what the chain cell
# relies on (it passes the history through unchanged), so the misspelled
# argument is dropped rather than "corrected".
memory = ConversationBufferMemory(memory_key="chat_history")

In [56]:
# Conversational retrieval chain over the Hugging Face index. The chat history
# coming from `memory` is passed through unchanged by the identity function.
conversation_retriever = hg_docsearch.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=conversation_retriever,
    chain_type="stuff",
    memory=memory,
    return_source_documents=False,
    get_chat_history=lambda h: h,
)

In [66]:
# Manual transcript of (question, answer) pairs, kept for inspection only.
history = []
query = "What is smoking policy?"
# Only the question is passed. The chain's attached memory supplies
# `chat_history` itself; the second positional argument of invoke() is the
# runnable config, so a {"chat_history": ...} dict there never reached the chain.
result = qa.invoke({"question": query})
print(result["answer"])

Smoking is only permitted in designated outdoor areas marked by signage.  Smoking is prohibited inside all company buildings, offices, meeting rooms, enclosed spaces, and company vehicles.  Electronic cigarettes and vaping devices are also prohibited.  Proper disposal of smoking materials is required. Failure to comply may result in disciplinary action, including fines or termination of employment.


In [67]:
# Record the latest (question, answer) pair in the manual transcript.
history += [(query, result["answer"])]

In [68]:
# Follow-up question that only makes sense with the previous turn as context.
query = "List points in it?"
# Use invoke() with just the question: the chain's memory provides the chat
# history. In the legacy qa(...) call form the second positional argument is
# `return_only_outputs`, so the history dict passed there was never used as
# chat history.
result = qa.invoke({"question": query})
print(result["answer"])

The key points of the smoking policy are:

* **Designated Smoking Areas:** Smoking is only allowed in designated areas marked by signage.
* **Smoking Restrictions:** Smoking is prohibited inside buildings, offices, meeting rooms, and enclosed spaces.  This includes e-cigarettes and vaping devices.  It is also prohibited in company vehicles.
* **Compliance with Laws:** All must follow federal, state, and local smoking laws.
* **Waste Disposal:**  Proper disposal of smoking materials in designated receptacles is required. Littering is prohibited.
* **Enforcement:** Non-compliance can lead to disciplinary action, including fines or termination for employees.
* **Policy Review:** The policy is regularly reviewed to ensure it aligns with current laws and best practices.


# Let's finish it up

In [79]:
def qa():
    """Run an interactive question-answering loop over the indexed document.

    Builds a conversational retrieval chain (Hugging Face index, custom
    prompt, buffer memory) and then repeatedly reads a question from standard
    input, printing the answer and the first retrieved source document.
    Typing "quit", "exit" or "bye" ends the loop.

    Note: defining this function rebinds the notebook-level name ``qa`` that
    earlier cells used for chain objects; re-run those cells before calling
    ``qa.invoke`` again.
    """
    # output_key tells the memory which of the chain's two outputs (answer and
    # source_documents) to store. The misspelled, ineffective `return_message`
    # argument is dropped: the memory returns the history as a single string,
    # which the identity get_chat_history below passes through unchanged.
    memory = ConversationBufferMemory(memory_key="chat_history", output_key='answer')

    # The custom prompt must go through combine_docs_chain_kwargs;
    # `chain_type_kwargs` is a RetrievalQA option and is not accepted by
    # ConversationalRetrievalChain.from_llm.
    chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        chain_type="stuff",
        retriever=hg_docsearch.as_retriever(),
        memory=memory,
        get_chat_history=lambda h: h,
        combine_docs_chain_kwargs=chain_type_kwargs,
        return_source_documents=True,
    )

    while True:
        query = input("Question: ").strip()

        if query.lower() in ("quit", "exit", "bye"):
            print("Answer: Goodbye!")
            break

        # Ignore empty submissions instead of sending them to the model.
        if not query:
            continue

        # The memory attached to the chain supplies the chat history, so only
        # the question is passed in.
        result = chain.invoke({"question": query})

        print("Answer: ", result["answer"])

        # Guard against an empty retrieval result before indexing.
        sources = result.get('source_documents') or []
        if sources:
            print("source: ", sources[0])
        else:
            print("source: ", "no source documents returned")

In [76]:
# Start the interactive loop; type "quit", "exit" or "bye" to stop.
qa()

Question: can i use my personal phone
Answer:  The policy allows limited personal use of mobile devices, provided it doesn't disrupt work obligations.  It also specifies that personal phone usage should be kept separate from company accounts, and any personal charges on company-issued phones must be reimbursed.
source:  [Document(metadata={'source': 'companyPolicies.txt'}, page_content='The Mobile Phone Policy sets forth the standards and expectations governing the appropriate and responsible usage of mobile devices in the organization. The purpose of this policy is to ensure that employees utilize mobile phones in a manner consistent with company values and legal compliance.\nAcceptable Use: Mobile devices are primarily intended for work-related tasks. Limited personal usage is allowed, provided it does not disrupt work obligations.\nSecurity: Safeguard your mobile device and access credentials. Exercise caution when downloading apps or clicking links from unfamiliar sources. Promptly