In [3]:
pip install pymupdf chromadb

Collecting chromadb
  Downloading chromadb-1.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.1-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.24.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [4]:
import fitz  # PyMuPDF
import re
import chromadb
from chromadb.config import Settings
from google import generativeai as genai

In [5]:
from kaggle_secrets import UserSecretsClient
import google.generativeai as genai

# Read the Gemini API key from the Kaggle secrets client (secret name
# "GOOGLE_API_KEY") so the key itself never appears in the notebook.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
# Register the key with google.generativeai; this must run before any
# genai.embed_content / genai.GenerativeModel call in later cells.
genai.configure(api_key=GOOGLE_API_KEY)

In [6]:
# ---------------------- TEXT EXTRACTION ----------------------
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with PyMuPDF (``fitz``) and closed again when the
    ``with`` block exits. Page texts are concatenated in page order with no
    separator inserted between pages.
    """
    with fitz.open(pdf_path) as pdf_document:
        page_texts = [pdf_page.get_text() for pdf_page in pdf_document]
    return "".join(page_texts)

In [7]:
# ---------------------- CHUNKING ----------------------
def legal_aware_chunking(text):
    """Split the text of a court order into six named sections.

    Runs of newlines are collapsed to a single newline first, then each
    section is located with one regular-expression search. The patterns are
    hard-coded to the wording of one specific anticipatory-bail order
    (fixed opening phrases, the Witch Craft Act reference and the signing
    judge's name), so other documents will mostly produce placeholders.

    Args:
        text: Full plain text of the order.

    Returns:
        dict mapping, in this order, ``case_info``, ``representation``,
        ``charges_allegations``, ``petitioners_arguments``,
        ``state_response`` and ``court_order`` to the stripped matched text,
        or to the string ``"N/A"`` when the section's pattern is not found.
    """
    text = re.sub(r'\n+', '\n', text.strip())

    # (section name, pattern, group to keep). Only case_info keeps its
    # capture group; every other section keeps the whole match, which for
    # "representation" deliberately includes the surrounding dash rules.
    section_patterns = (
        ("case_info", r'(IN THE HIGH COURT.*?)\n+[-]+', 1),
        ("representation", r'[-]+\s+CORAM.*?For the State\s*:\s*(.*?)\n[-]+', 0),
        ("charges_allegations", r'Apprehending their arrest.*?under Section 3/4 of the Witch Craft Act,', 0),
        ("petitioners_arguments", r'Learned counsel.*?privileges of anticipatory bail\.', 0),
        ("state_response", r'Learned Addl\. P\.P.*?anticipatory bail of the petitioners\.', 0),
        ("court_order", r'Considering the submissions.*?\(Anil Kumar Choudhary, J\.\)', 0),
    )

    def section_text(pattern, group):
        # Search once and reuse the match instead of running each regex twice.
        found = re.search(pattern, text, re.DOTALL)
        return found.group(group).strip() if found else "N/A"

    return {
        name: section_text(pattern, group)
        for name, pattern, group in section_patterns
    }

In [8]:
# ---------------------- METADATA ----------------------
def extract_metadata(text):
    """Pull header metadata out of the text of a court order.

    Runs of newlines are collapsed first, then each field is located with a
    single regular-expression search. The patterns are specific to one style
    of order ("A.B.A. No.", "Dated-", the Witch Craft Act reference) and the
    judge is detected by a hard-coded signature line.

    Args:
        text: Full plain text of the order.

    Returns:
        dict with the keys ``court_name``, ``case_number``, ``order_date``,
        ``judge``, ``petitioners``, ``advocate_petitioner``,
        ``advocate_state`` and ``ipc_sections``. A field that cannot be found
        is the string ``"N/A"``. ``petitioners`` is a list of names when the
        petitioner section is found and ``"N/A"`` otherwise (kept for
        backward compatibility with existing callers).
    """
    text = re.sub(r'\n+', '\n', text.strip())

    def first_group(pattern, flags=0, strip_chars=None):
        # Search once and reuse the match instead of running each regex twice.
        found = re.search(pattern, text, flags)
        return found.group(1).strip(strip_chars) if found else "N/A"

    metadata = {}
    metadata['court_name'] = first_group(r'IN THE (HIGH COURT.*?)\n', re.IGNORECASE)
    metadata['case_number'] = first_group(r'A\.B\.A\. No\.(\d+ of \d+)')
    metadata['order_date'] = first_group(r'Dated-?\s*(\d{2}/\d{2}/\d{4})')
    # The judge is recognised only through this exact signature line.
    metadata['judge'] = (
        "Anil Kumar Choudhary"
        if re.search(r'\(Anil Kumar Choudhary, J\.\)', text)
        else "N/A"
    )

    # The petitioner list sits between a six-dash rule and the "..." line.
    petitioner_section = re.search(r'\n------\n(.*?)\n\s*\.\.\.', text, re.DOTALL)
    if petitioner_section:
        # "$" lets the last numbered entry match even when the captured
        # section ends right after the name (no trailing comma or newline);
        # without it the final petitioner was silently dropped.
        names = re.findall(r'\d+\.\s+(.*?)(?:,|\n|$)', petitioner_section.group(1))
        metadata['petitioners'] = [name.strip() for name in names if name.strip()]
    else:
        metadata['petitioners'] = "N/A"

    metadata['advocate_petitioner'] = first_group(r'For the Petitioners\s*:\s*(.*)')
    metadata['advocate_state'] = first_group(r'For the State\s*:\s*(.*)')
    # The character class also swallows the space/comma that follows the
    # section numbers (e.g. "3/4 "), so trim those from both ends.
    metadata['ipc_sections'] = first_group(
        r'Sections? ([\d, ]+/[\d, ]+).*?Witch Craft Act',
        strip_chars=' ,',
    )

    return metadata

In [9]:
# ---------------------- EMBEDDING FUNCTION ----------------------
class GeminiEmbeddingFunction:
    """Callable that embeds texts with the Gemini ``models/embedding-001`` model."""

    def __call__(self, texts):
        """Return one embedding per item of ``texts``, in input order.

        Each text is sent in its own ``genai.embed_content`` request with
        ``task_type="retrieval_document"``; the ``"embedding"`` entry of every
        response is collected into the returned list.
        """
        vectors = []
        for document in texts:
            reply = genai.embed_content(
                model="models/embedding-001",
                content=document,
                task_type="retrieval_document",
            )
            vectors.append(reply["embedding"])
        return vectors


In [10]:
# ---------------------- INITIALIZATION ----------------------
# Shared embedder instance used when documents are stored.
gemini_embedder = GeminiEmbeddingFunction()

# On Chroma >= 0.4 a plain ``chromadb.Client(Settings(persist_directory=...))``
# builds an in-memory client and silently ignores the directory, so nothing
# was ever written to "chroma_db". PersistentClient is the supported way to
# keep the index on disk across kernel restarts.
chroma_client = chromadb.PersistentClient(
    path="chroma_db",
    settings=Settings(anonymized_telemetry=False),
)

# No embedding function is attached to the collection: embeddings are
# computed explicitly with ``gemini_embedder`` and passed in at insert time.
collection = chroma_client.get_or_create_collection(name="legal_docs")


In [11]:
# ---------------------- LOAD & STORE PDF ----------------------
def process_and_store_pdf(pdf_path):
    """Extract, chunk, embed and store one PDF in the Chroma collection.

    The PDF text is split with ``legal_aware_chunking``, every chunk is
    embedded with ``gemini_embedder`` and the chunks are written to the
    module-level ``collection`` with ``{"source": <section name>}`` metadata.

    Chunk IDs are prefixed with the PDF's file stem so that chunks from
    different documents no longer overwrite or collide with each other, and
    ``upsert`` is used so re-running the cell for the same PDF refreshes its
    chunks instead of hitting duplicate IDs. Two PDFs with the same file
    name in different folders still share IDs.

    Args:
        pdf_path: Path of the PDF to ingest (``str`` or path-like).

    Returns:
        tuple ``(text, chunks)``: the full extracted text and the dict of
        section name -> chunk text.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    text = extract_text_from_pdf(pdf_path)
    chunks = legal_aware_chunking(text)

    section_names = list(chunks)
    chunk_texts = [chunks[name] for name in section_names]

    # Nothing to embed or store for an empty chunk dict.
    if chunk_texts:
        embeddings = gemini_embedder(chunk_texts)
        document_id = Path(pdf_path).stem
        ids = [f"{document_id}_chunk_{i}" for i in range(len(section_names))]
        metadatas = [{"source": name} for name in section_names]

        collection.upsert(
            ids=ids,
            documents=chunk_texts,
            embeddings=embeddings,
            metadatas=metadatas
        )

    print(f"[✓] Processed and stored {len(chunks)} chunks from PDF.")
    return text, chunks

In [12]:
# ---------------------- STANDARD QA ----------------------
def generate_answer(question, context):
    """Answer ``question`` from ``context`` with a single Gemini call.

    The context and question are placed into a plain question-answering
    prompt, sent to the ``gemini-1.5-flash`` model, and the ``text`` of the
    response is returned unchanged.
    """
    qa_prompt = f"""
    Answer the following legal question based on the context below:

    Context:
    {context}

    Question:
    {question}

    Answer:
    """
    return genai.GenerativeModel("gemini-1.5-flash").generate_content(qa_prompt).text

In [16]:
# ---------------------- CHAIN-OF-LAW REASONING ----------------------
def reasoned_generate_answer(question, context):
    """Answer ``question`` from ``context`` using a three-step reasoning prompt.

    The prompt instructs the model to identify the relevant provisions, look
    for matching material in the document, and then apply legal logic. It is
    sent to the ``gemini-1.5-flash`` model and the ``text`` of the response
    is returned unchanged.
    """
    structured_prompt = f"""
You are a legal reasoning assistant. Follow a structured legal thinking format.

Step 1: Identify relevant IPC sections, statutes, or legal principles in the question or context.
Step 2: Search for matching precedents or clauses within the document.
Step 3: Apply legal logic to give a reasoned, justifiable answer or next action.

Context:
{context}

Legal Question:
{question}

Answer (with step-by-step legal reasoning):
"""
    reply = genai.GenerativeModel("gemini-1.5-flash").generate_content(structured_prompt)
    return reply.text

In [13]:
# ---------------------- LEGAL QA PIPELINE ----------------------
def legal_qa(question, full_text, reasoning=False):
    """Answer ``question`` against the whole document text.

    The complete ``full_text`` is passed as context (no retrieval from the
    vector store happens here). A truthy ``reasoning`` selects
    ``reasoned_generate_answer``; otherwise ``generate_answer`` is used.

    Returns:
        dict with ``"source"`` (the mode label followed by
        ``" | Entire Document"``) and ``"answer"`` (the stripped model reply).
    """
    responder, mode_label = (
        (reasoned_generate_answer, "Chain-of-Law Reasoning")
        if reasoning
        else (generate_answer, "Standard")
    )
    raw_answer = responder(question, full_text)
    return {
        "source": f"{mode_label} | Entire Document",
        "answer": raw_answer.strip(),
    }


In [17]:
# ---------------------- EXAMPLE USAGE ----------------------
# End-to-end demo: ingest one PDF, print its extracted metadata, then ask a
# single question in chain-of-law reasoning mode.
if __name__ == "__main__":
    # Absolute path of the sample order inside the Kaggle input dataset.
    pdf_path = "/kaggle/input/document/e4a3f28b3504fbe5be30e204f85e4aaa1053c51b4dd7fd6c30df778f2f27ef471743797266.pdf"
    # Extracts the text, chunks it, embeds the chunks and stores them.
    full_text, chunks = process_and_store_pdf(pdf_path)
    

    # Header fields are parsed from the full text, not from the chunks.
    metadata = extract_metadata(full_text)
    print("\n[📑 METADATA]")
    for key, val in metadata.items():
        print(f"{key}: {val}")

    question = "What was the final order of the court?"
    # The whole document text is used as context; the stored chunks are not
    # queried on this path.
    result = legal_qa(question, full_text, reasoning=True)
    print(f"\n[❓ Question] {question}")
    print(f"[🧠 Answer]\n{result['answer']}")

[✓] Processed and stored 6 chunks from PDF.

[📑 METADATA]
court_name: HIGH COURT OF JHARKHAND AT RANCHI
case_number: 4727 of 2024
order_date: 27/02/2025
judge: Anil Kumar Choudhary
petitioners: N/A
advocate_petitioner: Mr. Manish Yadav, Advocate
advocate_state: Mr. Rakesh Kr. Sinha, Addl.P.P.
ipc_sections: 3/4 

[❓ Question] What was the final order of the court?
[🧠 Answer]
**Step 1: Identify relevant IPC sections, statutes, or legal principles.**

The relevant IPC sections are 341 (wrongful restraint), 323 (voluntarily causing hurt), 325 (voluntarily causing grievous hurt), 307 (attempt to murder), 504 (intentional insult with intent to provoke breach of the peace), 506 (criminal intimidation), 379 (theft), and 34 (acts done by several persons in furtherance of common intention).  The Witchcraft Act (Section 3/4) is also mentioned.  The court's decision also references Section 482(2) of the Bharatiya Nagarik Suraksha Sanhita, 2023 (BNSS).  The legal principle at play is anticipatory b