# RAG Chatbot Pipeline – Preprocessing, Tuning, and Evaluation
Author: Rajesh Kumar Dogra

This notebook demonstrates document processing, embedding generation, RAG pipeline execution, and evaluation using LangChain, FAISS, and HuggingFace models.

In [None]:
# 📦 Install necessary libraries
!pip install -q langchain langchain-community sentence-transformers faiss-cpu transformers torch pypdf python-dotenv accelerate

## 1. Document Loading & Preprocessing

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Load the source PDF and cut its text into small overlapping chunks for embedding.
pdf_path = 'data/AI Training Document.pdf'
loader = PyPDFLoader(pdf_path)

# load() returns the PDF's pages as Documents without any pre-splitting.
# load_and_split() would first run the loader's default splitter (which uses
# overlapping chunks), so re-joining its output could repeat spans of text
# before our own splitter ever sees them.
pages = loader.load()
text = "\n".join(p.page_content.strip() for p in pages)

# 300-character chunks with a 50-character overlap between neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
chunks = splitter.split_text(text)

# Fail here with a clear message instead of letting the vector-store step
# fail later on an empty list (e.g. a scanned PDF with no text layer).
if not chunks:
    raise ValueError(f"No text could be extracted from {pdf_path!r}.")

print(f"✅ Processed {len(chunks)} chunks.")

## 2. Embedding & FAISS Vector Store

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embed every chunk, index the vectors with FAISS, and persist the index to disk.

# Model identifier handed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
# Folder that receives the saved FAISS index.
FAISS_INDEX_DIR = 'vectordb/faiss_index'

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)
vectorstore.save_local(folder_path=FAISS_INDEX_DIR)
print("✅ Vectorstore saved.")

## 3. Load LLM & Build RAG Chain

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

# Wire the FAISS retriever and a local FLAN-T5 pipeline into a RetrievalQA chain.

# Number of chunks fetched from the vector store for each question.
TOP_K = 3
# Upper bound on generated answer length. Without an explicit limit the
# pipeline falls back to the model's default generation length, which can
# cut answers off after only a few words.
MAX_NEW_TOKENS = 256

retriever = vectorstore.as_retriever(search_kwargs={"k": TOP_K})

model_name = 'google/flan-t5-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=MAX_NEW_TOKENS,
)
llm = HuggingFacePipeline(pipeline=pipe)

# The prompt restricts the model to the retrieved context and asks for an
# explicit "I don't know" when the context does not contain the answer.
prompt = PromptTemplate(
    template="""You are an assistant answering questions based only on the context below.\nIf the answer is not in the context, say 'I don't know'.\n\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer concisely:""",
    input_variables=["context", "question"]
)

# "stuff" places all retrieved chunks into a single prompt; the source
# documents are returned alongside the answer so they can be inspected.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True
)

## 4. Test Query & Evaluation

In [None]:
# Run one sample question through the chain and show the answer with its sources.
query = "What is eBay’s return policy?"

# invoke() replaces the deprecated direct call `qa_chain({...})`; the result
# dict still exposes 'result' and 'source_documents'.
result = qa_chain.invoke({"query": query})

print("Answer:", result['result'])
print("\nSources:")
for i, doc in enumerate(result['source_documents'], 1):
    # Show at most the first 300 characters of each retrieved chunk.
    print(f"Source {i}:", doc.page_content[:300], "...\n")