In [2]:
import zipfile
import os
import csv

# --- CONFIG ---
ZIP_PATH = "/content/policy.zip"    # path to your uploaded zip file
EXTRACT_DIR = "/content/policy/policy/policy/policyy"     # folder where PDFs will be extracted
OUTPUT_CSV = "/content/policy_index.csv"
START_NUMBER = 1
# --------------

# 1. Unzip the archive into EXTRACT_DIR.
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(EXTRACT_DIR)
print(f"Extracted zip to {EXTRACT_DIR}")

# 2. Find PDFs that sit directly inside EXTRACT_DIR (sub-folders are not searched).
files = sorted(
    f for f in os.listdir(EXTRACT_DIR)
    if f.lower().endswith(".pdf")
)

if not files:
    print("No PDF files found in extracted folder:", EXTRACT_DIR)
else:
    manifest = []

    # Keep the original two-digit padding, but widen it when the highest
    # number needs more digits so the numbered names stay uniformly padded.
    last_number = START_NUMBER + len(files) - 1
    width = max(2, len(str(last_number)))

    # Phase 1: park every PDF under a temporary, non-.pdf name.
    # Renaming straight to "NN.pdf" can silently replace a source file that
    # has not been processed yet (os.rename overwrites on POSIX), e.g. when
    # the archive already contains "01.pdf" or START_NUMBER is greater than 1.
    staged = []
    for position, original in enumerate(files):
        temp_name = f".renaming_{position:06d}.tmp"
        os.rename(
            os.path.join(EXTRACT_DIR, original),
            os.path.join(EXTRACT_DIR, temp_name),
        )
        staged.append((original, temp_name))

    # Phase 2: move the parked files to their final numbered names.
    for num, (original, temp_name) in enumerate(staged, start=START_NUMBER):
        new_filename = f"{num:0{width}d}.pdf"
        os.rename(
            os.path.join(EXTRACT_DIR, temp_name),
            os.path.join(EXTRACT_DIR, new_filename),
        )

        manifest.append({
            "id": num,
            "original_filename": original,
            "new_filename": new_filename
        })

        print(f"{original} → {new_filename}")

    # 3. Save CSV manifest mapping each id to its original and new filename.
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["id", "original_filename", "new_filename"])
        writer.writeheader()
        writer.writerows(manifest)

    print(f"Manifest saved as {OUTPUT_CSV}")


Extracted zip to /content/policy/policy/policy/policyy
1 (15).pdf → 01.pdf
1 (7).pdf → 02.pdf
Manifest saved as /content/policy_index.csv


In [3]:
# Refresh the apt package index, install the Tesseract OCR system package,
# then install the Python packages pytesseract, pdf2image and pillow.
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract pdf2image pillow


0% [Working]            Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ub0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
0% [Waiting for headers] [3 InRelease 14.2 kB/129 kB 11%] [Connected to r2u.sta                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-update

In [4]:
# Install poppler-utils from apt.
# NOTE(review): presumably required by pdf2image to render PDF pages — confirm.
!apt-get update
!apt-get install -y poppler-utils

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (99.84.132.23)] [Con                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (99.84.132.23)] [Con                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.                                                                               Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 

In [25]:
from pdf2image import convert_from_path
import pytesseract
import os

# Folder scanned for PDFs by load_pdfs_with_ocr; same path as EXTRACT_DIR in the unzip cell.
POLICY_DIR = "/content/policy/policy/policy/policyy"

def load_pdfs_with_ocr(policy_dir, dpi=300, lang=None, source="IRDAI"):
    """OCR every PDF found directly inside ``policy_dir``, one record per page.

    Args:
        policy_dir: Directory to scan; only names ending in ".pdf"
            (case-insensitive) are processed, in sorted order.
        dpi: Resolution passed to ``convert_from_path`` when rasterising pages.
        lang: Language code(s) forwarded to ``pytesseract.image_to_string``
            (e.g. "eng+hin"). ``None`` keeps pytesseract's own default.
        source: Value stored under the "source" key of each page's metadata.

    Returns:
        A list of ``{"text": str, "metadata": {"filename", "page", "source"}}``
        dicts. Page numbers start at 1. Pages whose OCR text is empty or
        whitespace-only are skipped.
    """
    documents = []

    pdf_files = sorted(
        name for name in os.listdir(policy_dir)
        if name.lower().endswith(".pdf")
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(policy_dir, pdf_file)

        # Rasterise all pages of this PDF to images before running OCR.
        images = convert_from_path(pdf_path, dpi=dpi)

        for page_num, image in enumerate(images, start=1):
            text = pytesseract.image_to_string(image, lang=lang)
            stripped = text.strip() if text else ""

            if not stripped:
                continue

            documents.append({
                "text": stripped,
                "metadata": {
                    "filename": pdf_file,
                    "page": page_num,
                    "source": source
                }
            })

    return documents


In [26]:
# OCR every PDF in POLICY_DIR, then preview the first extracted page.
documents = load_pdfs_with_ocr(POLICY_DIR)

print(f"Total extracted pages: {len(documents)}")
print("\nSample metadata:")
print(documents[0]["metadata"])
print("\nSample text:")
print(documents[0]["text"][:500])


Total extracted pages: 38

Sample metadata:
{'filename': '01.pdf', 'page': 1, 'source': 'IRDAI'}

Sample text:
Insurance Regulatory and Development Authority of India (Lloyd’s India)
Regulations, 2016!

[Amended up to 02”" June, 2020]

F. No. IRDAI/Reg/4/116/2016- In exercise of the powers conferred by section 3, 6 and 114A
of the Insurance Act, 1938, and sections 14 read with 26 of the Insurance Regulatory and
Development Authority of India Act, 1999, the Authority, in consultation with the Insurance
Advisory Committee hereby makes the following regulations, namely:

CHAPTER I

1. Short title and commen


In [27]:
import re
from typing import List, Dict

def clean_documents(documents: List[Dict]) -> List[Dict]:
    """Normalise page text: lower-case, drop "page N" markers, collapse whitespace.

    Args:
        documents: Dicts carrying a "text" string and a "metadata" value.

    Returns:
        A new list of ``{"text", "metadata"}`` dicts. Records whose text is
        empty after cleaning are omitted. The metadata object is passed
        through unchanged (not copied).
    """
    cleaned = []

    for doc in documents:
        text = doc["text"].lower()

        # Remove page markers BEFORE collapsing whitespace; doing it afterwards
        # leaves a double space where the marker used to be. The leading \b
        # keeps words that merely end in "page" (e.g. "homepage 2") intact.
        text = re.sub(r'\bpage\s*\d+', ' ', text)

        # A single pass over \s+ also covers newline runs.
        text = re.sub(r'\s+', ' ', text).strip()

        if text:
            cleaned.append({
                "text": text,
                "metadata": doc["metadata"]
            })

    return cleaned


In [28]:
# Run clean_documents over the page records held in `documents`.
cleaned_documents = clean_documents(documents)



In [18]:
from nltk.tokenize import word_tokenize
import uuid

In [38]:
from nltk.tokenize import word_tokenize

def chunk_text(text, base_id, chunk_size=400, overlap=80):
    """Split ``text`` into overlapping windows of whitespace-separated tokens.

    Args:
        text: Text to split; tokens are produced by ``str.split()``.
        base_id: Prefix for chunk ids, which are formatted ``f"{base_id}_{n}"``
            with ``n`` counting from 0.
        chunk_size: Maximum number of tokens per chunk (must be > 0).
        overlap: Tokens shared between consecutive chunks
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        A list of ``{"chunk_id": str, "text": str}`` dicts; empty when the
        text contains no tokens.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would keep the window from
            advancing (previously an infinite loop) or are negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    tokens = text.split()
    step = chunk_size - overlap

    chunks = []
    start = 0

    while start < len(tokens):
        end = start + chunk_size

        chunks.append({
            "chunk_id": f"{base_id}_{len(chunks)}",
            "text": " ".join(tokens[start:end])
        })

        # Stop once a window reaches the end of the text. Advancing again
        # would emit a tail chunk made only of tokens already present in
        # this one.
        if end >= len(tokens):
            break

        start += step

    return chunks


In [31]:
# Install the nltk package with pip.
!pip install nltk



In [33]:
import nltk
# Download the NLTK 'punkt' tokenizer package.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [39]:
# Chunk every cleaned page; each chunk keeps a reference to its page's metadata.
all_chunks = []

for doc in cleaned_documents:
    meta = doc["metadata"]

    # Chunk ids look like "<filename without .pdf>_p<page>_<n>".
    stem = meta["filename"].replace(".pdf", "")
    base_id = f"{stem}_p{meta['page']}"

    chunks = chunk_text(text=doc["text"], base_id=base_id)

    all_chunks.extend(
        {
            "chunk_id": c["chunk_id"],
            "text": c["text"],
            "metadata": meta,
        }
        for c in chunks
    )


In [40]:
# Smoke test: chunk the first cleaned page and show the count plus the first chunk.
chunks = chunk_text(text=cleaned_documents[0]["text"], base_id="test_doc_p1")

print(f"{len(chunks)}")
print(f"{chunks[0]['chunk_id']}")
print(f"{chunks[0]['text'][:120]}")


2
test_doc_p1_0
insurance regulatory and development authority of india (lloyd’s india) regulations, 2016! [amended up to 02”" june, 202


In [43]:
# Install the rank_bm25 package with pip.
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [44]:
from rank_bm25 import BM25Okapi

# One whitespace-split token list per chunk, in the same order as all_chunks.
bm25_corpus = [chunk["text"].split() for chunk in all_chunks]

# Debug prints (first 2 chunks only)
for i, chunk in enumerate(all_chunks[:2]):
    tokens = bm25_corpus[i]
    print(f"\n--- Chunk {i} ---")
    print(f"Chunk ID: {chunk['chunk_id']}")
    print(f"Text preview: {chunk['text'][:120]}")
    print(f"Tokens: {tokens[:15]}")



--- Chunk 0 ---
Chunk ID: 01_p1_0
Text preview: insurance regulatory and development authority of india (lloyd’s india) regulations, 2016! [amended up to 02”" june, 202
Tokens: ['insurance', 'regulatory', 'and', 'development', 'authority', 'of', 'india', '(lloyd’s', 'india)', 'regulations,', '2016!', '[amended', 'up', 'to', '02”"']

--- Chunk 1 ---
Chunk ID: 01_p1_1
Text preview: lii)securities premium account * vide notification no. f. no. irdai/reg/4/116/2016 dated 09" march, 2016 published in th
Tokens: ['lii)securities', 'premium', 'account', '*', 'vide', 'notification', 'no.', 'f.', 'no.', 'irdai/reg/4/116/2016', 'dated', '09"', 'march,', '2016', 'published']


In [45]:
# Build the BM25 index over the tokenised chunks and report its size.
bm25 = BM25Okapi(bm25_corpus)

print("\nBM25 INDEX CREATED")
print(f"Total chunks indexed: {len(bm25_corpus)}")
print(f"Average chunk length: {bm25.avgdl}")



BM25 INDEX CREATED
Total chunks indexed: 68
Average chunk length: 242.36764705882354


In [47]:
# Example query used by the step-by-step BM25 cells that follow.
query = "what is insurance policy duration guideline"


In [48]:
# Lower-case and whitespace-split the query, as was done for the chunk texts.
query_tokens = query.lower().split()
print(f"Query tokens: {query_tokens}")


Query tokens: ['what', 'is', 'insurance', 'policy', 'duration', 'guideline']


In [49]:
# Score the query tokens against every indexed chunk.
scores = bm25.get_scores(query_tokens)

print(f"Scores array length: {len(scores)}")
print(f"First 10 raw scores: {scores[:10]}")


Scores array length: 68
First 10 raw scores: [0.69990005 0.         0.72101087 0.64101785 0.46475555 2.94479839
 0.48178288 0.3449105  0.         0.98579963]


In [50]:
# Keep only chunks with a positive score (zero-score chunks are ignored).
bm25_results = [
    {
        "chunk_id": all_chunks[i]["chunk_id"],
        "text": all_chunks[i]["text"],
        "metadata": all_chunks[i]["metadata"],
        "score": float(score),
    }
    for i, score in enumerate(scores)
    if score > 0
]

print(f"Non-zero BM25 matches: {len(bm25_results)}")


Non-zero BM25 matches: 41


In [51]:
# Sort the matches in place, highest BM25 score first.
bm25_results.sort(key=lambda x: x["score"], reverse=True)


In [52]:
# Show the five best-scoring matches.
for r in bm25_results[:5]:
    print(f"\nChunk ID: {r['chunk_id']}")
    print(f"BM25 Score: {r['score']}")
    print(f"Text preview: {r['text'][:200]}")



Chunk ID: 02_p3_0
BM25 Score: 3.64874027567846
Text preview: [at wi-ars 4] fra bl usa : acta 3 6) aan f frofe —afafe ® are fla ae & feo afer aft favat oz fora softer aaeat & sea & are fear stat | 7) freya — aes fh bre tsh st a tage ahaa & are waa ft yeaa (area 

Chunk ID: 01_p4_0
BM25 Score: 2.9447983878516393
Text preview: 11. 12. 13. ii. application for registration has been rejected by the authority at any time during the preceding three financial years on the date of requisition for registration application; or iii. 

Chunk ID: 01_p33_0
BM25 Score: 1.0839471089031782
Text preview: joint certificate of registration for lloyd’s india service company and lloyd’s india syndicate insurance regulatory and development authority of india joint certificate of registration registration n

Chunk ID: 01_p6_0
BM25 Score: 0.9857996293613813
Text preview: (b) the proposed infrastructure of lloyd’s india; (c) the proposed underwriting skills that will be transferred to lloyd’s india; (d) other re

In [53]:
def bm25_search(query, top_k=10):
    """Return up to ``top_k`` positively scored chunks, best BM25 score first.

    The query is lower-cased and whitespace-split, then scored with the
    module-level ``bm25`` index. Chunk details are read from the module-level
    ``all_chunks`` list by position.

    Returns:
        A list of ``{"chunk_id", "text", "metadata", "score"}`` dicts.
    """
    raw_scores = bm25.get_scores(query.lower().split())

    hits = [
        {
            "chunk_id": all_chunks[idx]["chunk_id"],
            "text": all_chunks[idx]["text"],
            "metadata": all_chunks[idx]["metadata"],
            "score": float(value),
        }
        for idx, value in enumerate(raw_scores)
        if value > 0
    ]

    ranked = sorted(hits, key=lambda hit: hit["score"], reverse=True)
    return ranked[:top_k]


In [54]:
# Try the BM25 helper on a sample query and print the top five hits.
query = "what is maternity leave duration"

results = bm25_search(query, top_k=5)

for r in results:
    print(f"\nChunk: {r['chunk_id']}")
    print(f"Score: {r['score']}")
    print(f"Text: {r['text'][:150]}")



Chunk: 02_p3_0
Score: 2.9447983878516393
Text: [at wi-ars 4] fra bl usa : acta 3 6) aan f frofe —afafe ® are fla ae & feo afer aft favat oz fora softer aaeat & sea & are fear stat | 7) freya — aes 

Chunk: 01_p6_0
Score: 0.5706009375348383
Text: (b) the proposed infrastructure of lloyd’s india; (c) the proposed underwriting skills that will be transferred to lloyd’s india; (d) other relevant m

Chunk: 01_p6_1
Score: 0.566287749859715
Text: of registration - the authority, after making such inquiry as it deems fit and on being satisfied that — (a) the applicant is eligible, and in his/ he

Chunk: 01_p10_0
Score: 0.5045685600581044
Text: 28. 29. 30. vi. _ strive to make india as a reinsurance hub; vii. compliance with fema and other local laws; viii. | compliance to the conditions laid

Chunk: 01_p4_1
Score: 0.481782880497389
Text: particular and without prejudice to the generality of the foregoing, the authority shall consider the following matters, namely:- (i) the general trac


In [56]:
# Install chromadb and sentence-transformers with pip.
!pip install chromadb sentence-transformers

Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading pypika-0.50.0-py2.py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [57]:
import chromadb
from chromadb.utils import embedding_functions

# Create client (in-memory for now)
client = chromadb.Client()

# Embedding function (this is the model)
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create collection = dense index.
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when "policy_chunks" already exists in the running kernel.
collection = client.get_or_create_collection(
    name="policy_chunks",
    embedding_function=embedding_fn
)

print("ChromaDB collection created")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

ChromaDB collection created


In [58]:
# Parallel lists for ChromaDB, all in all_chunks order.
# Note: this rebinds `documents`, which earlier held the OCR page records.
documents = [chunk["text"] for chunk in all_chunks]
metadatas = [chunk["metadata"] for chunk in all_chunks]
ids = [chunk["chunk_id"] for chunk in all_chunks]

# Debug print (first 2 only)
for chunk in all_chunks[:2]:
    print("\n--- Chunk to index ---")
    print(f"ID: {chunk['chunk_id']}")
    print(f"Text preview: {chunk['text'][:120]}")
    print(f"Metadata: {chunk['metadata']}")



--- Chunk to index ---
ID: 01_p1_0
Text preview: insurance regulatory and development authority of india (lloyd’s india) regulations, 2016! [amended up to 02”" june, 202
Metadata: {'filename': '01.pdf', 'page': 1, 'source': 'IRDAI'}

--- Chunk to index ---
ID: 01_p1_1
Text preview: lii)securities premium account * vide notification no. f. no. irdai/reg/4/116/2016 dated 09" march, 2016 published in th
Metadata: {'filename': '01.pdf', 'page': 1, 'source': 'IRDAI'}


In [59]:
# Add every chunk (id, text, metadata) to the Chroma collection.
collection.add(ids=ids, documents=documents, metadatas=metadatas)

print(f"Total chunks indexed in ChromaDB: {len(ids)}")


Total chunks indexed in ChromaDB: 68


In [60]:
# Example query for the dense (embedding) search cells that follow.
query = "maternity leave duration"


In [61]:
# Ask the collection for the 5 nearest chunks to the query text.
results = collection.query(
    query_texts=[query],
    n_results=5
)


In [62]:
# Index [0] selects the result list for the first (only) query text.
print("\nRaw Chroma output:")
print(f"IDs: {results['ids'][0]}")
print(f"Distances: {results['distances'][0]}")



Raw Chroma output:
IDs: ['01_p23_0', '02_p4_0', '01_p20_1', '02_p1_0', '01_p22_1']
Distances: [0.8067558407783508, 0.8294236660003662, 0.8485453724861145, 0.8492252230644226, 0.869687557220459]


In [63]:
def dense_search(query, top_k=10):
    """Query the module-level Chroma ``collection`` and flatten its response.

    Args:
        query: Query text, sent as the single entry of ``query_texts``.
        top_k: Number of results requested (``n_results``).

    Returns:
        A list of ``{"chunk_id", "text", "metadata", "distance"}`` dicts in
        the order returned by the collection. It may hold fewer than
        ``top_k`` entries when the collection returns fewer hits.
    """
    res = collection.query(
        query_texts=[query],
        n_results=top_k
    )

    # Walk what was actually returned rather than range(top_k); indexing by
    # top_k raised IndexError whenever fewer than top_k hits came back.
    return [
        {
            "chunk_id": chunk_id,
            "text": text,
            "metadata": metadata,
            "distance": float(distance),
        }
        for chunk_id, text, metadata, distance in zip(
            res["ids"][0],
            res["documents"][0],
            res["metadatas"][0],
            res["distances"][0],
        )
    ]


In [64]:
# Try the dense helper on a sample query and print the top five hits.
query = "how long can maternity leave be taken"

dense_results = dense_search(query, top_k=5)

for r in dense_results:
    print(f"\nChunk ID: {r['chunk_id']}")
    print(f"Distance: {r['distance']}")
    print(f"Text preview: {r['text'][:200]}")



Chunk ID: 02_p4_0
Distance: 0.8425444960594177
Text preview: the gazette of india : extraordinary [part iii—sec. 4] g) “secretary” means an officer of the authority nominated by the chairperson of the authority, to act as secretary to the committee; h) “total s

Chunk ID: 02_p1_0
Distance: 0.8474506139755249
Text preview: cs po plo ueto-33004/99 regd. no. d. l.-33004/99 che gazette of jndia extraordinary art iii—wavs 4 part i1]—section 4 wife wentfora published by authority a 280) ae faccit, wistar, wes 30, 2019/statt 

Chunk ID: 01_p23_0
Distance: 0.8587199449539185
Text preview: - date of resignation/departure - details of other business interests in the last 15 years where the directors and key management personnel have been a working shareholder, director or controller. rel

Chunk ID: 01_p20_1
Distance: 0.8878319263458252
Text preview: more than 1% of the paid up capital or its equivalent of the applicant are to be given. a 5 os hy oo —_ sg 5 oc — q p so 8&8 ov ss s 3fo s wo oo su

In [65]:
def normalize_bm25(results):
    """Min-max scale BM25 scores to [0, 1], keyed by chunk id.

    Args:
        results: Dicts with "chunk_id", "score", "text" and "metadata".

    Returns:
        ``{chunk_id: {"norm_score", "text", "metadata"}}``; ``{}`` for empty
        input. When every score is identical (including a single hit) each
        entry gets 1.0. Scoring such hits 0.0 would make them look the same
        as chunks that were never retrieved once scores are merged.
    """
    if not results:
        return {}

    scores = [r["score"] for r in results]
    min_s, max_s = min(scores), max(scores)
    span = max_s - min_s

    normalized = {}
    for r in results:
        norm = 1.0 if span == 0 else (r["score"] - min_s) / span

        normalized[r["chunk_id"]] = {
            "norm_score": norm,
            "text": r["text"],
            "metadata": r["metadata"]
        }

    print("BM25 normalized scores (sample):")
    for k, v in list(normalized.items())[:3]:
        print(k, "→", v["norm_score"])

    return normalized


In [66]:
def normalize_dense(results):
    """Turn distances into similarities and min-max scale them to [0, 1].

    Args:
        results: Dicts with "chunk_id", "distance", "text" and "metadata".

    Returns:
        ``{chunk_id: {"norm_score", "text", "metadata"}}``; ``{}`` for empty
        input. Smaller distances map to larger scores. When all similarities
        are identical (including a single hit) each entry gets 1.0, so a lone
        hit is not scored the same as a chunk that was never retrieved.
    """
    if not results:
        return {}

    # Convert distance → similarity; the epsilon avoids division by zero.
    similarities = [1 / (r["distance"] + 1e-6) for r in results]

    min_s, max_s = min(similarities), max(similarities)
    span = max_s - min_s

    normalized = {}
    for r, sim in zip(results, similarities):
        norm = 1.0 if span == 0 else (sim - min_s) / span

        normalized[r["chunk_id"]] = {
            "norm_score": norm,
            "text": r["text"],
            "metadata": r["metadata"]
        }

    print("Dense normalized scores (sample):")
    for k, v in list(normalized.items())[:3]:
        print(k, "→", v["norm_score"])

    return normalized


In [69]:
def hybrid_search(query, top_k=10, alpha=0.5):
    """Blend BM25 and dense retrieval into one ranked list.

    Both retrievers are asked for ``top_k`` hits and their scores are
    normalised. Each candidate's final score is
    ``alpha * bm25_norm + (1 - alpha) * dense_norm``; a chunk missing from
    one retriever's hits contributes 0.0 for that side.

    Args:
        query: Query text.
        top_k: Hits requested from each retriever and size of the final list.
        alpha: Weight of the BM25 score; the dense score gets ``1 - alpha``.

    Returns:
        Up to ``top_k`` dicts with "chunk_id", "final_score", "bm25_norm",
        "dense_norm", "text" and "metadata", best first. Ties on final score
        are ordered by chunk id so the result is reproducible between runs.
    """
    print("\n=== QUERY ===")
    print(query)

    # Step 1: retrieve
    bm25_results = bm25_search(query, top_k=top_k)
    dense_results = dense_search(query, top_k=top_k)

    # Step 2: normalize
    bm25_norm = normalize_bm25(bm25_results)
    dense_norm = normalize_dense(dense_results)

    # Step 3: merge by chunk_id
    hybrid_results = []
    for cid in set(bm25_norm) | set(dense_norm):
        lexical = bm25_norm.get(cid, {})
        semantic = dense_norm.get(cid, {})

        bm25_score = lexical.get("norm_score", 0.0)
        dense_score = semantic.get("norm_score", 0.0)

        # Text/metadata come from whichever retriever returned the chunk.
        entry = lexical or semantic

        hybrid_results.append({
            "chunk_id": cid,
            "final_score": alpha * bm25_score + (1 - alpha) * dense_score,
            "bm25_norm": bm25_score,
            "dense_norm": dense_score,
            "text": entry.get("text"),
            "metadata": entry.get("metadata")
        })

    # Step 4: rank. Candidates come out of a set, whose iteration order can
    # differ between runs; breaking ties on chunk_id keeps both the ordering
    # and the top_k cut deterministic.
    hybrid_results.sort(key=lambda x: (-x["final_score"], x["chunk_id"]))

    return hybrid_results[:top_k]



In [70]:
# Run the hybrid ranker on a sample query (BM25 weighted 0.6) and print each hit.
results = hybrid_search(query="what is maternity leave duration", top_k=5, alpha=0.6)

for r in results:
    print(f"\nChunk ID: {r['chunk_id']}")
    print(f"Final score: {round(r['final_score'], 3)}")
    print(f"BM25: {round(r['bm25_norm'], 3)} Dense: {round(r['dense_norm'], 3)}")
    print(f"Text: {r['text'][:200]}")



=== QUERY ===
what is maternity leave duration
BM25 normalized scores (sample):
02_p3_0 → 1.0
01_p6_0 → 0.0360606974549084
01_p6_1 → 0.03430951575822613
Dense normalized scores (sample):
01_p23_0 → 1.0
02_p4_0 → 0.7132930533624566
01_p22_1 → 0.21191476247307867

Chunk ID: 02_p3_0
Final score: 0.6
BM25: 1.0 Dense: 0.0
Text: [at wi-ars 4] fra bl usa : acta 3 6) aan f frofe —afafe ® are fla ae & feo afer aft favat oz fora softer aaeat & sea & are fear stat | 7) freya — aes fh bre tsh st a tage ahaa & are waa ft yeaa (area 

Chunk ID: 01_p23_0
Final score: 0.4
BM25: 0.0 Dense: 1.0
Text: - date of resignation/departure - details of other business interests in the last 15 years where the directors and key management personnel have been a working shareholder, director or controller. rel

Chunk ID: 02_p4_0
Final score: 0.285
BM25: 0.0 Dense: 0.713
Text: the gazette of india : extraordinary [part iii—sec. 4] g) “secretary” means an officer of the authority nominated by the chairperson of the a