In [3]:
import zipfile
import os
import csv

# --- CONFIG ---
ZIP_PATH = "/content/policy.zip"    # path to your uploaded zip file
EXTRACT_DIR = "/content/policy/policy/policy"     # folder where PDFs will be extracted
OUTPUT_CSV = "/content/policy_index.csv"
START_NUMBER = 1
# --------------

# 1. Unzip
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    z.extractall(EXTRACT_DIR)
print(f"Extracted zip to {EXTRACT_DIR}")

# 2. Find PDFs
files = sorted([
    f for f in os.listdir(EXTRACT_DIR)
    if f.lower().endswith(".pdf")
])

if not files:
    print("No PDF files found in extracted folder:", EXTRACT_DIR)
else:
    manifest = []
    num = START_NUMBER

    for original in files:
        old_path = os.path.join(EXTRACT_DIR, original)

        # new filename with leading zeros
        new_filename = f"{num:02d}.pdf"
        new_path = os.path.join(EXTRACT_DIR, new_filename)

        # rename
        os.rename(old_path, new_path)

        manifest.append({
            "id": num,
            "original_filename": original,
            "new_filename": new_filename
        })

        print(f"{original} → {new_filename}")
        num += 1

    # 3. Save CSV manifest
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["id", "original_filename", "new_filename"])
        writer.writeheader()
        writer.writerows(manifest)

    print(f"Manifest saved as {OUTPUT_CSV}")


Extracted zip to /content/policy/policy/policy
1 (15).pdf → 01.pdf
1 (16).pdf → 02.pdf
1 (3).pdf → 03.pdf
Manifest saved as /content/policy_index.csv


In [7]:
# List everything in the policy folder and subfolders
!find /content/policy -type f -maxdepth 3


/content/policy/policy/1 (10).pdf
/content/policy/policy/1 (5).pdf
/content/policy/policy/1 (9).pdf
/content/policy/policy/1 (13).pdf
/content/policy/policy/1 (4).pdf
/content/policy/policy/1 (11).pdf
/content/policy/policy/1 (15).pdf
/content/policy/policy/1 (14).pdf
/content/policy/policy/1 (8).pdf
/content/policy/policy/1 (12).pdf
/content/policy/policy/1 (16).pdf
/content/policy/policy/1 (6).pdf
/content/policy/policy/1 (3).pdf
/content/policy/policy/1 (7).pdf
/content/policy/policy/1 (2).pdf
/content/policy/policy/1 (1).pdf


In [5]:
import zipfile
import os

ZIP_PATH = "/content/policy.zip"

# check if valid ZIP
if zipfile.is_zipfile(ZIP_PATH):
    print("Valid zip file, extracting…")
    with zipfile.ZipFile(ZIP_PATH, "r") as z:
        z.extractall("/content/policy")
    print("Extracted successfully.")
else:
    print("Error: The file is not a valid zip archive!")


Valid zip file, extracting…
Extracted successfully.


In [9]:
!pip install pypdf


Collecting pypdf
  Downloading pypdf-6.6.2-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.6.2-py3-none-any.whl (329 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.1/329.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.6.2


In [12]:
from pypdf import PdfReader
import os

POLICY_DIR = "/content/policy"

def load_pdfs(policy_dir):
    documents = []

    pdf_files = sorted([
        f for f in os.listdir(policy_dir)
        if f.lower().endswith(".pdf")
    ])

    for pdf_file in pdf_files:
        pdf_path = os.path.join(policy_dir, pdf_file)
        reader = PdfReader(pdf_path)

        for page_num, page in enumerate(reader.pages, start=1):
            text = page.extract_text()

            if text and text.strip():
                documents.append({
                    "text": text.strip(),
                    "metadata": {
                        "filename": pdf_file,
                        "page": page_num
                    }
                })

    return documents


In [13]:
documents = load_pdfs(POLICY_DIR)

print("Total extracted chunks (pages):", len(documents))
print("\nSample output:\n")
print(documents[0]["metadata"])
print(documents[0]["text"][:500])


Total extracted chunks (pages): 0

Sample output:



IndexError: list index out of range

In [5]:
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pytesseract pdf2image pillow


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to cloud.r-project.or                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
                                                                               Get:3 https://cli.github.com/packages stable InRelease [3,917 B]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]
Get:10 http:

In [10]:
!apt-get update
!apt-get install -y poppler-utils


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connecting to security.ubuntu.com (91.189.92.23)] [Co                                                                               Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
                                                                               Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
                                                                               Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Connected to cloud.r-project.org (65.9.86.118)] [Connecting to r2u.stat.ill                                                                               Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to r2u.stat.illinois.edu (192.17.190.167)] [Waiting for headers]                                                                               Hit:6

In [11]:
from pdf2image import convert_from_path
import pytesseract
import os

POLICY_DIR = "/content/policy/policy/policy"

def load_pdfs_with_ocr(policy_dir):
    documents = []

    pdf_files = sorted([
        f for f in os.listdir(policy_dir)
        if f.lower().endswith(".pdf")
    ])

    for pdf_file in pdf_files:
        pdf_path = os.path.join(policy_dir, pdf_file)

        # convert PDF pages to images
        images = convert_from_path(pdf_path, dpi=300)

        for page_num, image in enumerate(images, start=1):
            text = pytesseract.image_to_string(image)

            if text and text.strip():
                documents.append({
                    "text": text.strip(),
                    "metadata": {
                        "filename": pdf_file,
                        "page": page_num,
                        "source": "IRDAI"
                    }
                })

    return documents


In [12]:
documents = load_pdfs_with_ocr(POLICY_DIR)

print("Total extracted pages:", len(documents))
print("\nSample metadata:")
print(documents[0]["metadata"])
print("\nSample text:")
print(documents[0]["text"][:500])


Total extracted pages: 105

Sample metadata:
{'filename': '01.pdf', 'page': 1, 'source': 'IRDAI'}

Sample text:
Insurance Regulatory and Development Authority of India (Lloyd’s India)
Regulations, 2016!

[Amended up to 02”" June, 2020]

F. No. IRDAI/Reg/4/116/2016- In exercise of the powers conferred by section 3, 6 and 114A
of the Insurance Act, 1938, and sections 14 read with 26 of the Insurance Regulatory and
Development Authority of India Act, 1999, the Authority, in consultation with the Insurance
Advisory Committee hereby makes the following regulations, namely:

CHAPTER I

1. Short title and commen


In [17]:
!apt-get update
!apt-get install -y poppler-utils


0% [Working]            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (13.225.47.74)] [Con                                                                               Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
                                                                               Hit:3 https://cli.github.com/packages stable InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (13.225.47.74)] [Con                                                                               Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
0% [Waiting for headers] [Connected to cloud.r-project.org (13.225.47.74)] [Wai                                                                               Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois

In [14]:
def chunk_text(text, chunk_size=1000, overlap=200):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]

        if chunk.strip():
            chunks.append(chunk.strip())

        start += chunk_size - overlap

    return chunks


In [15]:
chunked_documents = []

for doc in documents:
    text_chunks = chunk_text(doc["text"])

    for idx, chunk in enumerate(text_chunks):
        chunked_documents.append({
            "text": chunk,
            "metadata": {
                **doc["metadata"],
                "chunk_id": idx
            }
        })


In [16]:
print("Total chunks:", len(chunked_documents))
print("\nSample chunk metadata:")
print(chunked_documents[0]["metadata"])
print("\nSample chunk text:")
print(chunked_documents[0]["text"][:500])


Total chunks: 351

Sample chunk metadata:
{'filename': '01.pdf', 'page': 1, 'source': 'IRDAI', 'chunk_id': 0}

Sample chunk text:
Insurance Regulatory and Development Authority of India (Lloyd’s India)
Regulations, 2016!

[Amended up to 02”" June, 2020]

F. No. IRDAI/Reg/4/116/2016- In exercise of the powers conferred by section 3, 6 and 114A
of the Insurance Act, 1938, and sections 14 read with 26 of the Insurance Regulatory and
Development Authority of India Act, 1999, the Authority, in consultation with the Insurance
Advisory Committee hereby makes the following regulations, namely:

CHAPTER I

1. Short title and commen


In [17]:
!pip install chromadb sentence-transformers


Collecting chromadb
  Downloading chromadb-1.4.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl.metadata (2.5 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading pypika-0.50.0-py2.py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [18]:
import chromadb
from chromadb.config import Settings

CHROMA_DIR = "/content/chroma_db"

client = chromadb.Client(
    Settings(
        persist_directory=CHROMA_DIR,
        anonymized_telemetry=False
    )
)

collection = client.get_or_create_collection(
    name="insurance_policies"
)


In [19]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
texts = [doc["text"] for doc in chunked_documents]

metadatas = [doc["metadata"] for doc in chunked_documents]

ids = [
    f'{doc["metadata"]["filename"]}_p{doc["metadata"]["page"]}_c{doc["metadata"]["chunk_id"]}'
    for doc in chunked_documents
]

embeddings = embedding_model.encode(texts, show_progress_bar=True)

collection.add(
    documents=texts,
    metadatas=metadatas,
    embeddings=embeddings.tolist(),
    ids=ids
)

# persist to disk



Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [24]:
print("Total vectors:", collection.count())


Total vectors: 351


In [25]:
results = collection.query(
    query_texts=["What is the process for filing a claim?"],
    n_results=5
)

print("IDs:", results["ids"])
print("Documents:", results["documents"])
print("Metadatas:", results["metadatas"])
print("Distances:", results["distances"])


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 27.9MiB/s]


IDs: [['03.pdf_p15_c1', '03.pdf_p21_c0', '02.pdf_p22_c3', '03.pdf_p15_c0', '03.pdf_p15_c2']]
Documents: [['d in the\npolicy terms and conditions shall be deemed ‘necessary’. The insurer shall ensure\nthat all the documents required for claims processing are called for at one time and\nthat the documents are not called for in a piece-meal manner.\n\niii. The information that the insurer has captured in the proposal form at the time of\naccepting the proposal, the terms & conditions offered under the policy, the medical\nhistory as revealed by earlier claims, if any, and the prior claims experience shall all\nbe maintained by the insurer as an electronic record and shall not be called for\nagain from the policyholder/insured at the time of subsequent claim settlements.\n\niv. Insurer may stipulate a period within which all necessary claim documents should\nbe furnished by the policyholder/insured to make a claim. However, claims filed\neven beyond such period should be considered if ther

In [2]:
pip install anthropic

Collecting anthropic
  Downloading anthropic-0.77.0-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.77.0-py3-none-any.whl (397 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/397.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.9/397.9 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.77.0


In [None]:
from anthropic import Anthropic

client = Anthropic(api_key="")

def ask_claude(prompt):
    response = client.messages.create(
        model="claude-opus-4-5-20251101",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=600,
        temperature=0.0,
    )
    return response.content[0].text


In [4]:
def retrieve_chunks(query, n_results=5):
    results = collection.query(
        query_texts=[query],
        n_results=n_results
    )
    docs = results["documents"][0]
    metas = results["metadatas"][0]
    return docs, metas


In [5]:
def build_prompt(query, docs, metas):
    context_sections = []
    for i, (doc, meta) in enumerate(zip(docs, metas), start=1):
        context_sections.append(
            f"[{i}] File: {meta['filename']} | Page: {meta['page']}\n{doc}"
        )

    context_text = "\n\n".join(context_sections)

    prompt = f"""
You are a policy assistant. Use the following retrieved policy text to answer the question.
Provide citations like [1], [2] referencing the sections below.

Context:
{context_text}

Question:
{query}

Answer in detail, grounded only in the provided context.
If the answer is not in the context, say that it is not available.
"""
    return prompt


In [6]:
def answer_with_claude(query):
    # 1) Retrieve relevant chunks
    docs, metas = retrieve_chunks(query, n_results=5)

    # 2) Build the prompt
    prompt = build_prompt(query, docs, metas)

    # 3) Call Claude
    answer = ask_claude(prompt)

    return answer


In [7]:
print(answer_with_claude("What is the process for filing a claim?"))


NameError: name 'collection' is not defined