In [18]:
import os
import json
import fitz  # PyMuPDF for PDF loading
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain.schema import Document
from langchain.chains import RetrievalQA
import streamlit as st
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader


In [19]:
working_dir = os.getcwd()
# Load config file
config_path = os.path.join(working_dir, "config.json")
with open(config_path, "r") as file:
    config_data = json.load(file)
GROQ_API_KEY = config_data["GROQ_API_KEY"]
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [20]:
# Load embedding model
embedding = HuggingFaceEmbeddings()

# Load LLM from Groq
llm = ChatGroq(model="deepseek-r1-distill-llama-70b", temperature=0)

In [21]:
# Extract data from PDF file 
def load_pdf_file(data):
    loader=DirectoryLoader(data,
                           glob="*.pdf",
                           loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [22]:
extracted_data=load_pdf_file(data='Data/')


In [23]:
extracted_data

[Document(metadata={'producer': 'Adobe PDF Library 8.0', 'creator': 'Adobe InDesign CS3 (5.0)', 'creationdate': '2010-09-27T10:20:00+00:00', 'moddate': '2010-10-01T14:03:46+05:30', 'trapped': 'False', 'source': 'Data\\B4572 (1).pdf', 'total_pages': 181, 'page': 0, 'page_label': 'i'}, page_content="World Health House\nIndraprastha Estate,\nMahatma Gandhi Marg,\nNew Delhi-110002, India\nWebsite: www.searo.who.int 9 7 8 9 2 9 0 2 2 3 8 2 5\nISBN 978 92 9022 382 5\nHerbal medicines constitute the main component of traditional medicine, \nwhich have been used since thousands of years. They have made \nsignificant contribution to human health through their health promotive, \ncurative and rehabilitative properties and in the prevention of illnesses. \nIndeed, many herbal remedies used traditionally have become modern \nmedicines through drug development. Digoxin, morphine, colchicine, and \nartemisinin are some notable examples. Long tradition of use of many \nherbal remedies and experiences

In [24]:
def split_text_into_chunks(documents, chunk_size=500, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)



In [25]:
text_chunks = split_text_into_chunks(extracted_data)
print("Number of text chunks:", len(text_chunks))

Number of text chunks: 1652


In [26]:
text_chunks

[Document(metadata={'producer': 'Adobe PDF Library 8.0', 'creator': 'Adobe InDesign CS3 (5.0)', 'creationdate': '2010-09-27T10:20:00+00:00', 'moddate': '2010-10-01T14:03:46+05:30', 'trapped': 'False', 'source': 'Data\\B4572 (1).pdf', 'total_pages': 181, 'page': 0, 'page_label': 'i'}, page_content='World Health House\nIndraprastha Estate,\nMahatma Gandhi Marg,\nNew Delhi-110002, India\nWebsite: www.searo.who.int 9 7 8 9 2 9 0 2 2 3 8 2 5\nISBN 978 92 9022 382 5\nHerbal medicines constitute the main component of traditional medicine, \nwhich have been used since thousands of years. They have made \nsignificant contribution to human health through their health promotive, \ncurative and rehabilitative properties and in the prevention of illnesses.'),
 Document(metadata={'producer': 'Adobe PDF Library 8.0', 'creator': 'Adobe InDesign CS3 (5.0)', 'creationdate': '2010-09-27T10:20:00+00:00', 'moddate': '2010-10-01T14:03:46+05:30', 'trapped': 'False', 'source': 'Data\\B4572 (1).pdf', 'total_pa

In [27]:
def create_faiss_vector_db(text_chunks):
    """Generate FAISS vector store from text chunks."""
    print("🔍 Creating FAISS vector database...")
    vectordb = FAISS.from_documents(text_chunks, embedding)
    vectordb.save_local("faiss_index")
    print("✅ FAISS Database Created and Saved!")
    return vectordb

In [35]:
# Check if FAISS index exists, else create it
faiss_index_path = "faiss_index"
if os.path.exists(faiss_index_path):
    print("📂 Loading existing FAISS index...")
    # Set allow_dangerous_deserialization=True to bypass the pickle security check
    vectordb = FAISS.load_local(faiss_index_path, embedding, allow_dangerous_deserialization=True)
else:
    vectordb = create_faiss_vector_db(text_chunks)

retriever = vectordb.as_retriever()



📂 Loading existing FAISS index...


In [36]:
from langchain_core.prompts import ChatPromptTemplate
system_prompt=(
    'You are an assistent for question answering task.'
    'Use the following pices of retrived context to answer'
    'the question.If you do not know the answer say that you '
    'do not know . Use three sentences maximum '
    'And keep the answer concise.'
    "\n\n"
    "{context}"
)
prompt=ChatPromptTemplate.from_messages([
    ("system",system_prompt),
    ("human","{input}")
])

In [40]:
from langchain.chains import RetrievalQA
retriever = vectordb.as_retriever()

def answer_question(user_question):
    print("Generating answer...")
    qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    response = qa_chain.invoke({"query": user_question})

    return response["result"]


question = "What is herbal?"
answer = answer_question(question)
print(answer)


Generating answer...
<think>
Okay, so I need to figure out what "herbal" means based on the provided context. Let me read through the context again to understand it better.

The context starts by explaining that herbal medicine is a significant resource for treating illnesses worldwide. It mentions that while conventional medicine is common in Western countries, traditional medicine, especially herbal remedies, is just as popular in places like China and India. This tells me that "herbal" relates to medicine made from plants.

Next, the context talks about the process of making herbal medicines, from harvesting plants to processing them. It mentions wild-crafting, which is picking plants from the wild, and gives examples like elder flowers and berries. This indicates that "herbal" refers to products derived from plants used for medicinal purposes.

Putting it all together, "herbal" seems to describe something that's made from plants and used in medicine. So, herbal medicine uses plant-