In [3]:
import kagglehub
import os
import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer 
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Step 1: Load your FAQ dataset

In [None]:
# Read the FAQ dataset (JSON file) into a DataFrame and preview the first rows.
faq_json_path = 'data/Ecommerce_FAQ_Chatbot_dataset.json'
df = pd.read_json(faq_json_path)
df.head()

Unnamed: 0,questions
0,"{'question': 'How can I create an account?', '..."
1,{'question': 'What payment methods do you acce...
2,"{'question': 'How can I track my order?', 'ans..."
3,"{'question': 'What is your return policy?', 'a..."
4,"{'question': 'Can I cancel my order?', 'answer..."


In [6]:
# Each entry of the 'questions' column is a dict with 'question' and 'answer'
# keys; wrap every entry in a Document whose text is "Q: ...\nA: ...".
docs = [
    Document(page_content=f"Q: {faq['question']}\nA: {faq['answer']}")
    for faq in df['questions']
]

## Step 2: Split into chunks

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: chunks of at most 500 characters with a 50-character
# overlap between neighbouring chunks.
chunking_options = {"chunk_size": 500, "chunk_overlap": 50}
splitter = RecursiveCharacterTextSplitter(**chunking_options)

# Split every FAQ document into chunks.
docs_split = splitter.split_documents(docs)

## Optional Step: Create Embeddings + Vector Store

- This is the manual way to embed the chunks and build a FAISS index.
- We don't need to do this if we use LangChain (see Step 3).

In [None]:
from sentence_transformers import SentenceTransformer 
import faiss
import numpy as np 

# Sentence-embedding model used to vectorise the chunks.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Collect the raw text of every chunk, then encode all of them in one call.
# NOTE: the name `embeddings` is rebound in Step 3 to a HuggingFaceEmbeddings
# object, so this array is only valid until that cell runs.
texts = []
for doc in docs_split:
    texts.append(doc.page_content)
embeddings = embedder.encode(texts, convert_to_numpy=True)

# FAISS = Facebook AI Similarity Search: a library for storing and searching
# sets of high-dimensional vectors.
# IndexFlatL2 is its exact (brute-force) index using L2 distance.

# The index must be created with the width of one embedding vector.
dimension = embeddings.shape[1]
print(f"Dimension of embeddings: {dimension}")

# Build the index and load every chunk embedding into it.
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

print(f"Stored {index.ntotal} documents in FAISS index.")

Dimension of embeddings: 384
Stored 79 documents in FAISS index.


## Step 3: Create Embeddings and Setup Retriever in LangChain

In [30]:
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
# from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline


# Embedding model wrapper that LangChain uses to vectorise docs_split.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Alternative (requires OpenAI access):
# embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Build the FAISS vector store from the chunks and the embedding model.
vectorstore = FAISS.from_documents(docs_split, embeddings)

# Create the retriever: ask for up to 10 hits and pass a score threshold of 0.9.
# NOTE(review): with search_type="similarity" the threshold is forwarded to the
# vector store's search; the recorded run returns fewer than k documents, so it
# does filter. Confirm whether 0.9 is meant as a distance or a relevance score.
retriever_search_kwargs = {"k": 10, "score_threshold": 0.9}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)

## Step 4: Initialize local Ollama LLM and prepare prompt template

In [None]:
from langchain_ollama import OllamaLLM
from langchain.llms import Ollama

# System instruction: answer only from the supplied context, otherwise reply
# with the fixed "I don’t know." sentence.
strict_system_prompt = (
    "You are a strict QA assistant. Use ONLY the provided context. "
    "If the answer is not fully supported by the context, reply exactly: I don’t know."
)

# Local Mistral model served by Ollama, with temperature set to 0.
llm = OllamaLLM(model="mistral", temperature=0, system=strict_system_prompt)

In [28]:
from langchain.prompts import PromptTemplate

# Prompt layout: retrieved context first, then the user's question, then an
# open "Answer:" slot for the model to complete. The template itself carries
# no instructions; the restriction to the context comes from the system
# message configured on the LLM.
prompt_template = (
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer:\n"
)

# Both placeholders must be supplied when the prompt is formatted.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

## Step 5: Setup function to invoke query

In [38]:
def get_answer(qa, query):
    """Answer ``query`` with ``qa``, skipping the LLM when nothing is retrieved.

    A retrieval pre-check runs first. If it returns no documents, a fixed
    apology is returned and ``qa`` is never invoked, so the model is not asked
    to answer without any context.

    Args:
        qa: Chain-like object exposing ``invoke(query)``. When it also exposes
            a ``retriever`` attribute, that retriever is used for the
            pre-check; otherwise the notebook-level ``retriever`` is used.
        query: The user's question.

    Returns:
        The value of ``qa.invoke(query)`` when at least one document is
        retrieved (callers read its ``'query'`` and ``'result'`` keys).
        Otherwise a dict with the same two keys whose ``'result'`` is the
        fixed apology message.
    """
    # Prefer the retriever that belongs to the chain being queried, so the
    # pre-check and the chain cannot drift apart; fall back to the global one
    # for chain objects that do not expose a retriever.
    doc_retriever = getattr(qa, "retriever", None)
    if doc_retriever is None:
        doc_retriever = retriever

    # `invoke` replaces the deprecated `get_relevant_documents`.
    candidates = doc_retriever.invoke(query)
    if not candidates:
        print("No relevant documents found.")
        return {'query': query, 'result': 'Sorry, I cannot find the answer to your query!'}

    print(f"Found {len(candidates)} documents.")
    return qa.invoke(query)

## Step 6: Run the queries

In [41]:
# Assemble the RetrievalQA chain. The "stuff" chain type is the simplest way to
# combine the retrieved documents with the custom prompt.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

# Sample questions to run through the chain.
queries = [
    "Where is Nepal",
    "What is Nuclear Physics",
    "How can I cancel my shipment?",
    "How much is the delivery charge?",
    "What is your return policy?",
    "What is the expensive product of yours",
    "Is there a way to order a product currently not available in store?",
]

# Ask each question and print the question/answer pair.
for query in queries:
    answer = get_answer(qa, query)
    print(f"Question = {answer['query']}")
    print(f"Answer = {answer['result']}\n")

No relevant documents found.
Question = Where is Nepal
Answer = Sorry, I cannot find the answer to your query!

No relevant documents found.
Question = What is Nuclear Physics
Answer = Sorry, I cannot find the answer to your query!

Found 2 documents.
Question = How can I cancel my shipment?
Answer =  To cancel your shipment, you should contact our customer support team immediately. Provide them with your order details, and they will guide you through the cancellation process, ensuring that your shipment is cancelled if it has not yet been shipped.

No relevant documents found.
Question = How much is the delivery charge?
Answer = Sorry, I cannot find the answer to your query!

Found 2 documents.
Question = What is your return policy?
Answer =  Our return policy allows you to return products within 30 days of purchase for a full refund, provided they are in their original condition and packaging. However, damage due to improper use may not be eligible for a return. For detailed instruct