In [2]:
# --- Create a RAG chatbot to answer questions based on Chanakya Neeti PDF ---
# important package installations
!pip install langchain openai pypdf faiss-cpu sentence-transformers
!pip install -U langchain-community pypdf
!pip install -qU langchain-openai

Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf, faiss-cpu
Successfully installed faiss-cpu-1.12.0 pypdf-6.0.0
Collecting langchain-community
  Downloading langchain_community-0.3.29-py3-none-any.whl.metadata (2.9 kB)
Collecting requests<3,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7,>=0.6.7 (from langchain-commun

In [33]:
# Make Google Drive available under /content/drive; the Chanakya Neeti PDF is read from there.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Get PDF document and number of pages in the document

In [7]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from sentence_transformers import SentenceTransformer

# --- Configuration ---
# Absolute path of the source PDF on the mounted Google Drive.
PDF_PATH = "/content/drive/MyDrive/Colab Notebooks/AI-ML(new)/ChanakyaNeeti_in_English.pdf"  # Replace with the actual path to your PDF
# Make sure your OpenAI API key is set as an environment variable, for example:
# os.environ["OPENAI_API_KEY"] = "sk-..."

# --- 1. Load the Document ---
# `documents` holds whatever the loader returns for the PDF; its length is
# reported below as the page count.
print("Loading PDF document...")
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()
print(f"Loaded {len(documents)} pages.")

Loading PDF document...
Loaded 14 pages.


## Set environment variables for the Hugging Face token and the OpenAI API key

In [None]:
# Environment Variables
import os
import getpass

# -------- Removed keys/tokens due to security reasons --------
# -------- uncomment and use your API key and tokens from Open AI and Hugging Face respectively --------

# key = ""
# if not os.environ.get("OPENAI_API_KEY"):
#   os.environ["OPENAI_API_KEY"] = key

# hf_token = ""
# if not os.environ.get("HF_TOKEN"):
#   os.environ["HF_TOKEN"] = hf_token

## Split the document into chunks using a prioritized list of separators (first priority: new paragraph; second: new line; third: word boundary; fourth: individual character)

In [9]:
# --- 2. Split the Text ---
print("Splitting document into chunks...")
# Separators are tried in the listed order: blank line, newline, space, and
# finally the empty string (split between any two characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)

# Flatten the per-page chunk lists into a single list of plain strings.
texts = [
    chunk
    for page in documents
    for chunk in text_splitter.split_text(page.page_content)
]

print(f"Split into {len(texts)} text chunks.")

Splitting document into chunks...
Split into 173 text chunks.


## Install and use ChromaDB for Vector Embedding Storage

In [11]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.20-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.3 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [

In [12]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
# from transformers import AutoTokenizer, AutoModel
# import torch

# # Load model & tokenizer
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

# #

model_name = "all-MiniLM-L6-v2"

# Sentence-transformers model loaded directly; embed_texts() relies on it to
# compute the vectors used for indexing and querying.
embedding_model = SentenceTransformer(model_name)

# Chroma embedding function built from the same model name; it is handed to the
# collection when the collection is created.
hf_embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=model_name
)

# Function to get embeddings
def embed_texts(texts):
    """Encode text with the module-level ``embedding_model``.

    Args:
        texts: Input passed unchanged to ``embedding_model.encode``.

    Returns:
        The encoder's output converted to plain Python lists via ``.tolist()``.
    """
    vectors = embedding_model.encode(texts)
    return vectors.tolist()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
from langchain.vectorstores import Chroma

# Relative directory intended for the on-disk Chroma database.
# NOTE(review): the cells that delete and open the database spell the path out
# as ./docs/chroma rather than reading this variable — confirm it is still needed.
persist_directory = 'docs/chroma/'

In [14]:
# Start from a clean state: delete the Chroma directory left behind by any earlier run.
!rm -rf ./docs/chroma  # remove old database files if any

In [15]:
# Init Chroma: open (or create) the on-disk database.
chroma_client = chromadb.PersistentClient(path="./docs/chroma")

collection = chroma_client.get_or_create_collection(
    name="my_collection",
    embedding_function=hf_embedder
)

# Use upsert rather than add so this cell is idempotent: because the collection
# is fetched with get_or_create and the ids are position-based, a re-run against
# an existing database must overwrite the stored chunks instead of leaving the
# old records under the same ids.
# Note: records whose ids lie beyond the current chunk count are not removed.
collection.upsert(
    ids=[f"id_{i}" for i in range(len(texts))],
    documents=texts,
    embeddings=embed_texts(texts),  # precomputed vectors, one per chunk
)


## Retrieve the document chunks whose embeddings are most similar to the embedding of the user's query

In [16]:
def retrieve_documents(query, collection, top_k=3):
    """Look up the stored chunks closest to ``query`` and return them with their ids.

    Args:
        query: The user's question. It is embedded with ``embed_texts`` and
            echoed to stdout.
        collection: Collection object to search; its ``query`` method is called.
        top_k: Number of results requested from the collection.

    Returns:
        A ``(docs, ids)`` tuple: the first entry of the result's ``documents``
        field and the first entry of its ``ids`` field.
    """
    # Embed the question, then echo it.
    query_vector = embed_texts(query)
    print(query)

    # Similarity search using the precomputed query vector.
    hits = collection.query(query_embeddings=query_vector, n_results=top_k)

    # Take the first result group from each field.
    return hits['documents'][0], hits['ids'][0]

### Create a Hugging Face InferenceClient Chat Completion Chatbot to answer any user prompt based on the PDF context
Choose `max_tokens` large enough for the chatbot to give a complete reply to your query. The number of retrieved matches (`top_k`) sets how many similar passages the LLM receives as context for its answer.

In [31]:
from huggingface_hub import InferenceClient

def generate_answer(query, collection, top_k=3,
                    model="meta-llama/Meta-Llama-3-8B-Instruct", max_tokens=500):
    """Answer ``query`` with a chat model, grounded in retrieved document chunks.

    Args:
        query: The user's question.
        collection: Collection passed through to ``retrieve_documents``.
        top_k: Number of chunks to retrieve and place in the prompt.
        model: Model identifier given to ``InferenceClient``. Defaults to the
            previously hard-coded Llama 3 8B Instruct model.
        max_tokens: Upper bound on generated tokens passed to
            ``chat_completion``. Defaults to the previously hard-coded 500.

    Returns:
        A string beginning with ``"Assistant: \\n"`` followed by the model's
        reply, or by a short notice when no chunks were retrieved.
    """
    docs, _ = retrieve_documents(query, collection, top_k=top_k)

    # With nothing retrieved there is no context to ground an answer in, so
    # skip the remote call instead of prompting the model with an empty list.
    if not docs:
        return "Assistant: \n" + "No relevant passages were found in the documents to answer this question."

    # Create a prompt for the LLM; chunks are numbered so the model can cite them.
    context = "\n\n".join([f"[{i+1}] {doc}" for i, doc in enumerate(docs)])
    prompt = f"""
                You are an expert assistant. Answer the following question using ONLY the documents provided below.
                Cite each reference in square brackets corresponding to the documents used.

                Documents:
                {context}

                Question:
                {query}

                Answer with citations:
            """

    messages = [{"role": "user", "content": prompt}]

    client = InferenceClient(model)

    response = client.chat_completion(messages, max_tokens=max_tokens)

    # str() keeps the concatenation safe even if the reply content is not a string.
    answer = "Assistant: \n" + str(response.choices[0].message["content"])
    return answer

In [32]:
# Ask a sample question, requesting five retrieved chunks as context.
query = "What does the book say about discipline?"
answer = generate_answer(query, collection, top_k=5)
print(answer)

What does the book say about discipline?
Assistant: 
The book discusses discipline in various contexts. 

From a passage in [2], it can be inferred that maintaining discipline is essential to achieve certain goals and maintain a peaceful life, as mentioned in the following lines: "What peace can we expect from a rascal friend?...How can renown be gained by instructing an unworthy disciple?" This implies that discipline is necessary for relationships, education, and personal growth.

The book also talks about the importance of learning from animals and their characteristics, specifically the dog and the ass, as mentioned in [3] and [5]. The dog's qualities such as "obedience to the master" and "bravery" [5] and the ass's qualities like being "unmindful of cold and heat" and "always contented" [5] are considered virtues that should be learned for discipline.

Moreover, the book emphasizes the importance of self-control and discipline in personal life, as mentioned in [3]: "As long as you