# Chatbot

## Parsing

In [1]:
# Importing necessary libraries
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [None]:
# Library website pages that are scraped to build the chatbot's knowledge base.
# Every page lives directly under https://library.iitgn.ac.in/ and ends in ".php".
main_urls = [
    f"https://library.iitgn.ac.in/{page}.php"
    for page in (
        "faqs",
        "libhours",
        "brrr",
        "librarypolicy",
        "staff",
        "minilibrary",
        "children_area",
        "slc",
        "giving",
        "printresources",
        "bibliographic_databases",
        "dds",
        "ill",
        "ris",
        "alumni_membership",
        "similarity",
        "grammarly",
        "planning_beginning",
        "uom_research",
        "vdi_research",
        "researchimpact_metrics",
        "citation_reference",
        "cwt_sessions",
        "research_publications",
    )
]

In [3]:
def extarct_text(url, header_lines=109, footer_lines=31, timeout=30):
    """Download a web page and return its main text as one space-separated string.

    The page's visible text is split into lines, the first ``header_lines`` and
    the last ``footer_lines`` lines are dropped (presumably the navigation and
    footer boilerplate shared by the library pages -- verify if the site layout
    changes), and the remaining lines are joined with spaces. Tab characters
    are replaced by spaces as well.

    Args:
        url: Address of the page to fetch.
        header_lines: Number of leading text lines to discard.
        footer_lines: Number of trailing text lines to discard.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The cleaned page text. Empty if the page has no more lines than the
        discarded header and footer.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # A timeout keeps one unresponsive page from hanging the whole notebook.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of indexing an HTTP error page as library content.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    lines = soup.get_text(separator='\n', strip=True).split("\n")

    # Keep only the lines between the discarded header and footer.
    end = max(len(lines) - footer_lines, header_lines)
    body_lines = lines[header_lines:end]

    # Joining with spaces removes the newlines; tabs are flattened the same way.
    return " ".join(body_lines).replace("\t", " ")

In [4]:
# Scrape every page once; data[i] holds the cleaned text of main_urls[i]
data = [extarct_text(url) for url in main_urls]

print(data)



## Making embeddings

In [78]:
!pip install langchain_community



In [79]:
!pip install faiss-cpu



In [80]:
!pip install langchain_groq



In [None]:
# Importing necessary libraries
import langchain
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

In [6]:
# Splitter settings chosen to give the best answers on this data:
# chunk_size=1000 with a 200 overlap between consecutive chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

In [8]:
# FAQs.csv holds the FAQs published on the website plus additional important
# FAQs that are not on the website.
# "utf-8-sig" drops a leading byte-order mark if the file has one.
loader = CSVLoader(file_path="FAQs.csv", encoding="utf-8-sig")
data_2 = loader.load()

In [9]:
# The embedder could not read the website's tables properly, so the borrowing
# rules are restated here as plain sentences (one group per member category).
extra_information = (
    "Faculty(Regular, Visiting, Adjunct, Guest) can borrow 30 books from general collection "
    "for one semester due date is 15th May and 15th December, "
    "can borrow 2 reference collection for 5 days, "
    "can borrow 2 Loose issue of Journals/Magazines for 5 days, "
    "can borrow 2 CD, DVDs etc for 5 days "
    "and can borrow 6 Children’s Collection for 30 days. "
    "Research Scholars(PhD & PDF) can borrow 12 books from general collection for 30 days, "
    "can borrow 2 Loose issue of Journals/Magazines for 5 days "
    "and can borrow 6 Children’s Collection for 30 days. "
    "PG Students (MTech, MSc, & MASC) can borrow 8 books from general collection for 15 days, "
    "can borrow 2 Loose issue of Journals/Magazines for 5 days "
    "and can borrow 6 Children’s Collection for 30 days. "
    "UG Students can borrow 6 books from general collection for 15 days, "
    "can borrow 2 Loose issue of Journals/Magazines for 5 days "
    "and can borrow 6 Children’s Collection for 30 days. "
    "Administrative & Project Staff including Trainees can borrow 6 books from general collection for 30 days, "
    "can borrow 2 Loose issue of Journals/Magazines for 5 days, "
    "can borrow 2 CD, DVDs etc for 5 days "
    "and can borrow 6 Children’s Collection for 30 days. "
    "The issue and return timings are 9:00 am to 8:00 pm right now."
)

In [10]:
# Combine all page contents into one string
all_content = "\n".join(doc.page_content for doc in data_2)

# main_urls and data are parallel lists (data[i] is the text for main_urls[i]).
# Stop early if they have drifted apart, e.g. after re-running only some cells.
if len(main_urls) != len(data):
    raise ValueError(
        f"main_urls has {len(main_urls)} entries but data has {len(data)}; "
        "re-run the notebook from the top."
    )

# Add the extra corpora next to the scraped pages. An existing entry is
# refreshed rather than appended again, so re-running this cell does not
# duplicate "FAQs" / "Extra Information" in the index.
for source_name, source_text in (
    ("FAQs", all_content),
    ("Extra Information", extra_information),
):
    if source_name in main_urls:
        data[main_urls.index(source_name)] = source_text
    else:
        main_urls.append(source_name)
        data.append(source_text)

In [11]:
# Wrap each text in a Document tagged with the page/source it came from
documents = [
    Document(page_content=page_text, metadata={"source": main_urls[k]})
    for k, page_text in enumerate(data)
]

# Cut the documents into chunks using the splitter configured above
docs = text_splitter.split_documents(documents)

print("Number of chunks::", len(docs))
print("Chunks::", docs)

Number of chunks:: 3300


In [12]:
# Embedding model for the chunks: "thenlper/gte-base", run on CPU, with
# normalized output vectors
embeddings = HuggingFaceEmbeddings(
    model_name="thenlper/gte-base",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Embed every chunk and build the FAISS vector index from the results
vector_index = FAISS.from_documents(docs, embeddings)

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Persist the FAISS index to the local "faiss_index" folder for later reuse
vector_index.save_local(folder_path="faiss_index")

In [89]:
# Reload the saved index instead of re-embedding every chunk.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading -- only load a "faiss_index" folder produced by this notebook.
vector_index = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

## Checking the bot's answer

In [14]:
# Setting up the Groq LLM.
# The API key is read from the GROQ_API_KEY environment variable so it is never
# stored in the notebook. NOTE: the key that was previously hard-coded here has
# been exposed and should be revoked in the Groq console.
import os

groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    raise RuntimeError(
        "Set the GROQ_API_KEY environment variable before running this cell."
    )

llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192")

In [15]:
# Question-answering pipeline: a similarity search fetches the 3 closest chunks
# from the vector index and the LLM answers from them, reporting its sources
chain = RetrievalQAWithSourcesChain.from_llm(
    llm=llm,
    retriever=vector_index.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 3},
    ),
)

In [16]:
# Sample query used to test the bot
query = "How many book can a PG student borrow?"
# Debug mode traces every chain/LLM step in the output; set to False for a
# quiet run.
langchain.debug = True

# Run the Q&A pipeline. `invoke` replaces the deprecated `chain({...})` call
# style and, like it, returns the inputs together with the outputs.
response = chain.invoke({"question": query})

# Extract outputs (either key may be missing, so default to an empty string)
answer = response.get("answer", "").strip()
sources = response.get("sources", "").strip()

# Final output
print("Answer:", answer)
print("Sources:", sources)

  response = chain({"question": query}, return_only_outputs=False)


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain] Entering Chain run with input:
[0m{
  "question": "How many book can a PG student borrow?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "input_list": [
    {
      "context": "Faculty(Regular, Visiting, Adjunct, Guest) can borrow 30 books from general collection for one semester due date is 15th May and 15th December, can borrow 2 reference collection for 5 days, can borrow 2 Loose issue of Journals/Magazines for 5 days, can borrow 2 CD, DVDs etc for 5 days and can borrow 6 Children’s Collection for 30 days. Research Scholars(PhD & PDF) can borrow 12 books from general collection for 30 days, can borrow 2 Loose issue of Journals/Magazines for 5 days and can

Token indices sequence length is longer than the specified maximum sequence length for this model (1692 > 1024). Running this sequence through the model will result in indexing errors


[32;1m[1;3m[chain/start][0m [1m[chain:RetrievalQAWithSourcesChain > chain:MapReduceDocumentsChain > chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "How many book can a PG student borrow?",
  "summaries": "Content: Relevant text:\nPG Students (MTech, MSc, & MASC) can borrow 8 books from general collection for 15 days...\nSource: Extra Information\n\nContent: There is no relevant text to answer the question \"How many book can a PG student borrow?\" The provided text explains the different types of book lending policies, such as Short Loan, Vacation Issues, and Personal Books, but it does not mention the number of books a PG student can borrow.\nSource: https://library.iitgn.ac.in/librarypolicy.php\n\nContent: There is no relevant text for this question. The text only mentions borrowing rules for \"Students (Faculty of Engg. only)\", \"days\", and \"Administrative & Project Staff including Trainees\", but not for PG students.\nSource: Extra Information"
}
[32;1m

## Finding most relevant question

In [93]:
# Loading the FAQs from the same relative path the CSVLoader cell uses, so the
# notebook also runs outside Colab's /content directory
import pandas as pd
faq_data = pd.read_csv("FAQs.csv", encoding="utf-8-sig")

In [94]:
# Setting up the sentence transformer to search for the most relevant related questions
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors
import numpy as np

model = SentenceTransformer("all-MiniLM-L12-v2")
questions = faq_data["Questions"].tolist()
# Kept under its own name: `embeddings` already refers to the
# HuggingFaceEmbeddings object that the FAISS load cell needs, and rebinding it
# to this array would break that cell on a re-run.
question_embeddings = model.encode(questions)

# Fit NearestNeighbors (cosine distance, 3 neighbours per query)
nn = NearestNeighbors(n_neighbors=3, metric='cosine').fit(question_embeddings)

In [95]:
# Getting the 3 most relevant questions
def get_related_questions(user_query):
    """Return the FAQ questions nearest to ``user_query``.

    The query is encoded with the module-level ``model``, looked up in the
    fitted ``nn`` index, and the matching entries of ``questions`` are
    returned in the order the index reports them.
    """
    encoded_query = model.encode([user_query])
    # kneighbors returns (distances, indices); only the indices are needed
    neighbor_indices = nn.kneighbors(encoded_query)[1]
    related = []
    for idx in neighbor_indices[0]:
        related.append(questions[idx])
    return related

In [96]:
# Show the FAQ questions closest to the sample query
related_questions = get_related_questions(query)
print(related_questions)

['How many SL books can I borrow at a time?', 'How long can I borrow a CR book?', 'Does the Library have copies of textbooks?']
