In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

"""
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
"""

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Purpose of notebook:
To read a given PDF and build a context-aware (semantic) search index over its contents, so that passages can be retrieved by meaning rather than by exact keywords.

In [None]:
# 1. Install the PDF text-extraction, embedding and vector-store libraries (pdfplumber, sentence-transformers, chromadb)
!pip install -q pdfplumber sentence-transformers chromadb


In [1]:
# Import dependencies
# PDF Plumber: To extract text from PDF reliably
# sentence-transformers: To convert text into numerical vectors
# Chromadb: To store and retrieve the vectors

import os
import pdfplumber
import chromadb
from sentence_transformers import SentenceTransformer
import os
from pypdf import PdfReader


import warnings
warnings.filterwarnings('ignore')

2026-01-09 16:17:15.949463: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767975436.136597     174 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767975436.192144     174 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767975436.640615     174 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767975436.640653     174 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767975436.640656     174 computation_placer.cc:177] computation placer alr

In [2]:
# Initialize the Multilingual Embedding Model
print("Loading the Multilingual AI model")
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

Loading the Multilingual AI model


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
# The model's default maximum sequence length is 128 tokens (roughly 100 English words); raise it to 512 tokens (roughly 400 words) so that longer chunks are not cut short
model.max_seq_length = 512
print(f"Updated model's sequence length: {model.max_seq_length}")

Updated model's sequence length: 512


# Ingestion and Chunking
Now that we have set up the infrastructure, let's move on to ingesting and chunking the file.
We will do the following:
- Read: Extract the text from the PDF as a continuous stream
- Chunk: Slice this stream into windows of 500 characters (`CHUNK_SIZE`)
- Overlap: Each window slides forward by 400 characters, leaving a 100-character overlap (`OVERLAP`) with the previous one. This ensures that the next window captures any meaning or verse cut off at the end of the previous one.

In [4]:
from tqdm.notebook import tqdm
import os
import numpy as np
import shutil


In [5]:
# Extract content from file
def process_pdf(path):
    """Extract the text of every page of a PDF as one continuous string.

    Args:
        path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The text of all pages that yielded text, in page order, each followed
        by a single space. An empty string if no page yielded any text.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        # Count pages from 1 so the warning carries a human-friendly page number.
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if text:
                page_texts.append(text + " ")
            else:
                # Falsy result (None or ""): nothing to index on this page,
                # so report it and carry on with the remaining pages.
                print(f"Warning: Page {page_number} was empty")
    return "".join(page_texts)

In [6]:
# Configuration
# CHUNK_SIZE and OVERLAP are handed to create_sliding_windows, which slices the
# extracted text by character position, so both values are character counts.
CHUNK_SIZE = 500  # length of each sliding window, in characters
OVERLAP = 100  # characters each window shares with the previous one
filename = "/kaggle/input/subrahmanya-bhujangam/Subramanya Bhujangam.pdf"  # PDF to ingest

In [7]:
# Start from a clean slate: delete any Chroma DB folder left over from an
# earlier run so that the index is rebuilt from fresh data.
if not os.path.exists("./kaggle_chroma_db"):
    print("No old DB to remove")
else:
    print("Removing old DB")
    shutil.rmtree("./kaggle_chroma_db")

No old DB to remove


In [8]:
# Sliding window function

def create_sliding_windows(text, chunk_size, overlap):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final window(s) may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each window, in characters. Must be > 0.
        overlap: Characters shared between neighbouring windows. Must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of substrings of ``text`` in order; empty if ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check an overlap >= chunk_size would never advance the
            window (endless loop) and a negative overlap would skip text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    step = chunk_size - overlap  # distance between consecutive window starts
    chunks = [text[start:start + chunk_size] for start in range(0, len(text), step)]

    if len(chunks) < 5:
        print("Warning very few chunks created")
    return chunks

In [9]:
# Extract and Chunk
# Pull the whole PDF into one string, then report how much text came out.
raw_text = process_pdf(filename)
print(f"Total Characters extracted: {len(raw_text)}")

# Slice that string into overlapping windows; these are the units that get embedded and searched.
text_chunks = create_sliding_windows(raw_text, CHUNK_SIZE, OVERLAP)
print(f"Total Searchable Chunks: {len(text_chunks)}")

Total Characters extracted: 24142
Total Searchable Chunks: 61


In [10]:
# Let's verify what we have extracted and chunked
print(text_chunks[len(text_chunks)//2])

ान्ममास्तां पुरारेस्तनूज ॥१७॥ Sphurad-Ratna-Keyuura-Haara-Abhiraamah
Calat-Kunnddala-Shrii-Lasad-Ganndda-Bhaagah |
Kattau Piita-Vaasaah Kare Caaru-Shakti
Purastaan-Mamaas-Taam Puraares-Tanuuja ||17||
Meaning:
17.1: (I Reverentially Bow down to Sri Subramanya) With a Delightfully Pleasing Form adorned with
Bracelets and Garlands studded with Glittering Gems, ...
17.2: ... and Beautiful Ear-Rings moving to and fro over the Shining Face, ...
17.3: ... With Golden Yellow Clothes over the Waist and t


### Phase 3
Here we will vectorize and index the text chunks we extracted in the previous phase

In [11]:
# Stand up the vector store.
# PersistentClient is used so that the database is written to a folder on disk.
print("Initializing ChromaDB")
chroma_client = chromadb.PersistentClient(path='./kaggle_chroma_db')
collection = chroma_client.get_or_create_collection(name='sacred_texts')  # open or create the collection

# Status banner summarising what is now available to the following cells.
for banner_line in (
    "_" * 30,
    "Infrastructure ready",
    f"Model Loaded: {model}",
    "Database initialized at: ./kaggle_chroma_db",
    "_" * 30,
):
    print(banner_line)


Initializing ChromaDB
______________________________
Infrastructure ready
Model Loaded: SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
Database initialized at: ./kaggle_chroma_db
______________________________


In [12]:
import time
print(f"Starting Knowledge Base construction for {len(text_chunks)} text blocks")

# Start the timer
t0 = time.time()

# 2. Generate Embeddings
# The model reads each chunk and outputs one vector per chunk
# (384 numbers each, per the model's pooling configuration).
# show_progress_bar=True lets you see it working.
embeddings = model.encode(text_chunks, show_progress_bar=True)

print("Indexing (storing in ChromaDB)")

# 3. Create Unique IDs
# Databases need a unique ID for every row. We just number them chunk_0, chunk_1...
ids = [f"chunk_{i}" for i in range(len(text_chunks))]

# 4. Save to Database
# We store the raw text AND the vector together.
# upsert (rather than add) keeps this cell re-runnable: the IDs are the same on
# every run, so a second run overwrites the existing rows instead of colliding
# with them. Chunks left over from a longer earlier run are NOT removed here;
# delete the DB folder first if the chunking changed.
collection.upsert(
    documents=text_chunks,   # The readable text
    embeddings=embeddings,   # The math (vectors)
    ids=ids                  # The IDs
)

# Stop timer
t1 = time.time()

# --- SUCCESS REPORT ---
print("\n" + "="*40)
print("SUCCESS: ENGINE IS LIVE")
print("="*40)
print(f"Documents Indexed: {collection.count()}")
print(f"Time Taken:        {t1 - t0:.2f} seconds")
print(f"Database Location: ./kaggle_chroma_db")
print("="*40)

Starting Knowledge Base construction for 61 text blocks


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Indexing (storing in ChromaDB)

SUCCESS: ENGINE IS LIVE
Documents Indexed: 61
Time Taken:        0.80 seconds
Database Location: ./kaggle_chroma_db


### Phase 4: Retrieval of information
Here we will query the indexed information to see if the search engine can respond with context based information

In [13]:
def search_engine(query, n_results=5):
    """Print the passages in the collection that are closest to ``query``.

    The query is embedded with the notebook-level ``model`` and matched
    against the notebook-level ``collection``; results are printed, nothing
    is returned.

    Args:
        query: Free-text search string.
        n_results: Maximum number of passages to request from the collection.
    """
    print(f"\nSearching for concept: '{query}'")

    # 1. Convert Query to Vector
    # The query must live in the same vector space as the indexed chunks,
    # so it is encoded with the same model.
    query_embedding = model.encode([query])

    # 2. Search ChromaDB
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results
    )
    # A single query was sent, so take the first (only) result list of each kind.
    documents = results['documents'][0]
    distances = results['distances'][0]  # Smaller distance = closer match

    print(f"Found {len(documents)} relevant passages:\n")

    preview_len = 500  # keep the printout readable; the full text stays in the DB
    for rank, (doc, distance) in enumerate(zip(documents, distances), start=1):
        # Label the number as what it is: a distance, where lower is better.
        print(f"--- RESULT {rank} (Distance: {distance:.4f}, lower = closer) ---")
        preview = doc[:preview_len]
        if len(doc) > preview_len:
            # Only signal truncation when something was actually cut off.
            preview += "..."
        print(preview)
        print("\n")

In [17]:
# Testing in English

search_engine("Well being of family")


Searching for concept: 'Well being of family'
Found 5 relevant passages:

--- RESULT 1 (Relevance Score: 26.3914) ---
तो भवन्तं स्मरन्तश्च ते सन्तु सव  कु मार ॥२८॥
Kalatram Sutaa Bandhu-Vargah Pashurvaa
Naro Va-Atha Naari Grhe Ye Madiiyaah |
Yajanto Namantah Stuvanto Bhavantam
Smarantash-Ca Te Santu Sarve Kumaara ||28||
Meaning:
28.1: (I Reverentially Bow down to Sri Subramanya) Let my Wife, Children, Relatives or other People, ...
28.2: ... whether they are Men or Women, all those who stay in my House, ...
28.3: ... Let them all Worship You alone, let them all pay Salutations to You alone, let them all Praise ...


--- RESULT 2 (Relevance Score: 30.6212) ---
at Sages and
Ascetics only, ...
27.2: ... the Devas (Gods) everywhere (i.e. in most cases) become the bestower of Desired Boons?
27.3: But even for the sake of Persons (i.e. Devotees) who are of the Lowest Births, to Grant them His
Grace, ...
27.4: ... Apart from Guha, I do not know any other, I do not know any other.
कलत्रं सुता

In [22]:
# Testing in Hindi (Devanagari script)
search_engine("परिवार की भलाई")


Searching for concept: 'परिवार की भलाई'
Found 5 relevant passages:

--- RESULT 1 (Relevance Score: 12.4602) ---
तो भवन्तं स्मरन्तश्च ते सन्तु सव  कु मार ॥२८॥
Kalatram Sutaa Bandhu-Vargah Pashurvaa
Naro Va-Atha Naari Grhe Ye Madiiyaah |
Yajanto Namantah Stuvanto Bhavantam
Smarantash-Ca Te Santu Sarve Kumaara ||28||
Meaning:
28.1: (I Reverentially Bow down to Sri Subramanya) Let my Wife, Children, Relatives or other People, ...
28.2: ... whether they are Men or Women, all those who stay in my House, ...
28.3: ... Let them all Worship You alone, let them all pay Salutations to You alone, let them all Praise ...


--- RESULT 2 (Relevance Score: 13.2330) ---
at Sages and
Ascetics only, ...
27.2: ... the Devas (Gods) everywhere (i.e. in most cases) become the bestower of Desired Boons?
27.3: But even for the sake of Persons (i.e. Devotees) who are of the Lowest Births, to Grant them His
Grace, ...
27.4: ... Apart from Guha, I do not know any other, I do not know any other.
कलत्रं सुता बन्धु

In [None]:
# Polyglot Test

def test_polyglot_search(query_list):
    for q in query_list:
        results = collection.query(
            query_embeddings = model.encode([q]), n_results = 1
        )
        docs = results['documents'][0]
        if not docs:
            print(f"{q:<20} | No")
            continue
        snippet = docs[0][:100].replace('\n',' ')
        print(f"Found: '{snippet}'")
        

In [None]:
# ---- Run the polyglot check ----
# A mix of English, romanised and Devanagari query strings to send through the search.
queries = [
    'Elephant Face',
    "Gajanana",
    "गणेश",
    "Mayura",
    " Paraashakti-Putram",
]
test_polyglot_search(queries)