# 0. Install dependencies (optional)

In [None]:
#! pip install pypdf      
# ! pip install chromadb
#!pip install lark
# !pip install -U langchain-huggingface

In [None]:
import os
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain_chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


# 1. Set the Environment

In [None]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""  #replace with your token

# 2. Document Loading

## 2.1 PDFs

 In this project we will be dealing with PDFs, however other types of Loaders can also be used

In [None]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("Module 1 .pdf")    # path to pdf file
pages = loader.load()            

In [None]:
len(pages)     #number of pages in the pdf

In [None]:
page = pages[1]     #the second page

In [None]:
print(page.page_content[0:500])

In [None]:
page.metadata

# 2. Document Splitting

We will be discussing two main kinds of Splitters - Recursive Character and Character TextSplitter

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [None]:
chunk_size =26
chunk_overlap = 4

In [None]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [None]:
text1 = 'abcdefghijklmnopqrstuvwxyz'

In [None]:
r_splitter.split_text(text1)

since the chunk size is 26 and the length of the text is also 26, the full text acts as the chunk

In [None]:
text2 = 'abcdefghijklmnopqrstuvwxyzabcdefg'

In [None]:
r_splitter.split_text(text2)

this time the text size was more so the text is split into two chunks with overlap of 4

In [None]:
text3 = "a b c d e f g h i j k l m n o p q r s t u v w x y z"

In [None]:
r_splitter.split_text(text3)

In [None]:
c_splitter.split_text(text3)

each alphabet is counted as one character and the spaces are omitted, this happens because the default separator for characterSplitter is newline 

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separator = ' '
)
c_splitter.split_text(text3)


here we define the separator as a space, so now it works like we would expect it to

## 2.1 Recurisve Splitting Details

`RecursiveCharacterTextSplitter` is recommended for generic text. 

In [None]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [None]:
len(some_text)

In [None]:
c_splitter = CharacterTextSplitter(
    chunk_size=450,
    chunk_overlap=0,
    separator = ' '
)
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=150,
    chunk_overlap=0,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)


the text is split recursively based on the priority of these separator till the desired chunk size is obtained


In [None]:
c_splitter.split_text(some_text)

In [None]:
r_splitter.split_text(some_text)

# 3. VectorStores & Embeddings

## 3.1 Preprocessing

Data Loading & Splitting

In [None]:
from langchain.document_loaders import PyPDFLoader

# Load PDF
loaders = [

    PyPDFLoader("Module 1 .pdf"),
    PyPDFLoader("Module 2.pdf"),
    PyPDFLoader("Module 3 .pdf")
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [None]:
len(docs)    #docs has all the pages of the all the pdfs

In [None]:
docs[100].metadata

In [None]:
docs[100].page_content

In [None]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap = 200,
    separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)

In [None]:
splits = text_splitter.split_documents(docs)

In [None]:
len(splits)

In [None]:
splits[94].page_content

In [None]:
splits[94].metadata

## 3.2 Embeddings

 We will use ChromaDB to store the vector embeddings

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

In [None]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [None]:
embedding1 = embedding.embed_query(sentence1)
embedding2 = embedding.embed_query(sentence2)
embedding3 = embedding.embed_query(sentence3)

In [None]:
import numpy as np

In [None]:
np.dot(embedding1, embedding2) #shows how similar the embeddings of sentence1 are to sentence2

In [None]:
np.dot(embedding1, embedding3) #shows how similar the embeddings of sentence1 are to sentence3

In [None]:
np.dot(embedding2, embedding3) #shows how similar the embeddings of sentence2 are to sentence3

## 3.3 Vectorstores

In [None]:
from langchain.vectorstores import Chroma

persist_directory = 'docs/test/'  #define directory which will store the ChromaDB

In [None]:
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

In [None]:
print(vectordb._collection.count())

documents are automatically persisted

### Similarity Search

In [None]:
question = "what is fitts law"

In [None]:
docs = vectordb.similarity_search(question,k=3)   #will search for top k results

In [None]:
len(docs)

In [None]:
docs[0].page_content

# 4. Retrieval

## 4.1 Similarity Search

In [None]:
from langchain.vectorstores import Chroma
persist_directory = 'docs/test/'

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

In [None]:
print(vectordb._collection.count())

In [None]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]

In [None]:
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [None]:
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [None]:
smalldb.similarity_search(question, k=2)

In [None]:
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

# 5. Question Answering

In [None]:
from langchain.vectorstores import Chroma
persist_directory = 'docs/test/'

In [None]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vectordb = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding
)

In [None]:
print(vectordb._collection.count())

## 5.1 RetrievalQA Chain

In [None]:
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline

# Initialize the question-answering pipeline with GPU support (device=0)
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", device=0)

# Use the updated HuggingFacePipeline from the new package
llm = HuggingFacePipeline(pipeline=qa_pipeline)

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type="stuff"  # Ensure that chain_type="stuff" is used for document retrieval
)

context = "Some relevant context extracted from the vector store or another source"
question = "Tell me about all-white mushrooms with large fruiting bodies"

# Now provide both question and context to the HuggingFace QA pipeline
result = qa_chain.invoke({"query": question, "context": context})

# Output the result
print(result["result"])



## 5.2 Prompt

In [None]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [None]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [None]:
question = "Is probability a class topic?"

In [None]:
result = qa_chain({"query": question})

In [None]:
result["result"]

In [None]:
result["source_documents"][0]

## 5.3 Working QA with pipeline

In [None]:
#Define the question you want to ask
question = input("enter question: ")

#Perform similarity search on the vector DB to get the most relevant context
results = vectordb.similarity_search(query=question, k=2)  # Retrieve the top 1 result

# Extract the relevant context from the search result
context = results[0].page_content  
doc_id = results[0].metadata

# Prepare the input for the QA pipeline
input_data = {
    "question": question,
    "context": context
}

# Initialize the question-answering pipeline
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", device = 0) #device=0 for GPU

#Use the pipeline to answer the question based on the retrieved context
response = qa_pipeline(input_data)

# Step 8: Print the answer
print(response['answer'])

# 6. Chatbot

## 6.1 Memory

In [None]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

## 6.2 Conversational Retrieval Chain

In [None]:
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [None]:
question = "Is probability a class topic?"
result = qa({"question": question})

In [None]:
result['answer']

# Actual working Chatbot

run only the below cells for the chatbot to work


In [1]:
import os
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain_chroma import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings


  from tqdm.autonotebook import tqdm, trange


In [2]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ""  #replace with your token

In [3]:
persist_directory = 'docs/test/'                  #directory to store the chroma db
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
print(vectordb._collection.count())

  embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


422


In [4]:
class ChatbotWithMemory:
    def __init__(self, vectordb, qa_pipeline):
        self.vectordb = vectordb  # The vector database for similarity search
        self.qa_pipeline = qa_pipeline  # The QA pipeline model
        self.memory = []  # Memory to store conversation history
    
    def add_to_memory(self, question, answer):
        """Store question and answer in memory."""
        self.memory.append({"question": question, "answer": answer})
    
    def get_memory_context(self):
        """Combine memory into a context string."""
        context = ""
        for entry in self.memory:
            context += f"Q: {entry['question']}\nA: {entry['answer']}\n"
        return context
    
    def ask_question(self, question, k=2):
        """Answer a question with memory and context retrieval."""
        
        # Perform similarity search on the vector DB to get the most relevant context
        results = self.vectordb.similarity_search(query=question, k=k)
        
        # Extract the relevant context from the search result
        if results:
            retrieved_context = results[0].page_content
            doc_id = results[0].metadata
        else:
            retrieved_context = ""
            doc_id = None

        # Combine memory and retrieved context
        memory_context = self.get_memory_context()
        full_context = memory_context + "\n" + retrieved_context
        
        # Prepare input for the QA pipeline
        input_data = {
            "question": question,
            "context": full_context
        }

        # Use the QA pipeline to generate an answer
        response = self.qa_pipeline(input_data)
        answer = response['answer']

        # Print the answer
        print(f"Answer: {answer}")
        
        # Add this interaction to memory
        self.add_to_memory(question, answer)
        
        return answer

qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad", device=0)

# Initialize chatbot with memory
chatbot = ChatbotWithMemory(vectordb, qa_pipeline)

# Chatbot interaction example
while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break
    chatbot.ask_question(user_input)

You:  what is fitts law


Answer: MT = a + b . log2(A/W + 1)


You:  who gave fitts law


Answer: psychologist Paul Fitts


You:  what are the design rules


Answer: psychologist Paul Fitts

Design Rules


You:  exit
