⚙️ Config-Driven Legal RAG Indexer with LanceDB + Advanced Chunking (Unstructured & LangChain)

In [1]:
import os
import re
import fitz  # PyMuPDF (Ensure PyMuPDF is installed: pip install PyMuPDF)
import lancedb
import numpy as np
from typing import List, Dict, Optional
from lancedb.pydantic import Vector, LanceModel
import csv

# New imports for advanced processing
import torch
from unstructured.partition.pdf import partition_pdf
# MODIFIED IMPORT for HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings # pip install langchain-huggingface
try:
    from langchain_experimental.text_splitter import SemanticChunker
except ImportError:
    from langchain.text_splitter import SemanticChunker # If it has moved to core langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Your custom embedder (ensure legalbert_embedder.py is present)
from legalbert_embedder import LegalBERTEmbedder
# Optionally, for using other sentence transformers as main_embedder
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 🔧 CONFIGURATION
# Single settings dictionary passed to the indexing functions and read by the
# retrieval helpers further down the notebook.
config = {
    # NOTE(review): presumably the checkpoint LegalBERTEmbedder loads; no active
    # code in this notebook reads this key — confirm against legalbert_embedder.py.
    "model_name": "nlpaueb/legal-bert-base-uncased", # Used by LegalBERTEmbedder if not overridden
    # Embedding model handed to get_semantic_embeddings_instance() for SemanticChunker.
    "semantic_chunker_model": "thenlper/gte-base", # Model for SemanticChunker
    # Source PDFs, resolved relative to the notebook's working directory.
    "comp_pdf": "CompaniesAct.pdf",
    "bank_pdf": "BankruptcyAct.pdf",
    # Directory passed to lancedb.connect(); created on demand by the create_lancedb_index_* functions.
    "db_path": "./Data",

    # Table names for LanceDB
    "comp_table": "CompaniesAct",
    "bank_table": "BankruptcyAct",
    "constitution_table": "IndianConstitution",

    # CSV path for Indian Constitution
    # (load_constitution_articles reads the 'article_id' and 'article_desc' columns)
    "constitution_csv": "Indian_Constitution.csv"
}


In [3]:
def load_pdf_text(path: str) -> List[str]:
    """Return the extracted text of every page of a PDF, in page order.

    Args:
        path: Filesystem path of the PDF, opened with PyMuPDF (``fitz``).

    Returns:
        One string per page, as produced by ``page.get_text()``.
    """
    doc = fitz.open(path)
    try:
        return [page.get_text() for page in doc]
    finally:
        # Release the underlying file handle even if extraction fails part-way.
        doc.close()

In [4]:
def extract_sections_bankruptcy_act(full_text: str, config: Dict) -> List[Dict]:
    """Split the Bankruptcy Act text into one chunk per ``SECTION <n>`` match.

    Every chunk runs from one section match up to the next (or to the end of
    the text) and is tagged with the closest PART and CHAPTER heading that
    starts at or before it.

    Args:
        full_text: Complete document text.
        config: Accepted for signature parity with the other extractors; unused.

    Returns:
        A list of dicts with keys ``id``, ``chunk``, ``section_title``,
        ``chapter_title``, ``part_title``, ``page`` (always ``None``) and
        ``source``. Empty when no section match is found.
    """
    section_re = re.compile(r"(?i)(SECTION\s*\d+[A-Z]?(?:\.\d+)?(?:[A-Z]*)?)")
    part_re = re.compile(r"(?i)^\s*(PART\s+[A-Z]+.*?)$", re.MULTILINE)
    chapter_re = re.compile(r"(?i)^\s*(CHAPTER\s+[IVXLC]+.*?)$", re.MULTILINE)

    def heading_index(pattern):
        # Map each heading's match offset to its stripped title, and keep the
        # offsets in descending order for the nearest-preceding lookup.
        titles_by_offset = {}
        for hit in pattern.finditer(full_text):
            titles_by_offset[hit.start()] = hit.group(1).strip()
        return titles_by_offset, sorted(titles_by_offset, reverse=True)

    def nearest_heading(index, position):
        # First offset (scanning from the end of the text) that is not past `position`.
        titles_by_offset, offsets_desc = index
        return next(
            (titles_by_offset[offset] for offset in offsets_desc if offset <= position),
            None,
        )

    part_index = heading_index(part_re)
    chapter_index = heading_index(chapter_re)

    section_hits = list(section_re.finditer(full_text))
    # boundaries[k] .. boundaries[k + 1] delimits the k-th section's text.
    boundaries = [hit.start() for hit in section_hits] + [len(full_text)]

    records = []
    for number, hit in enumerate(section_hits):
        begin = boundaries[number]
        finish = boundaries[number + 1]
        records.append({
            "id": f"bankruptcy_section_{number}",
            "chunk": full_text[begin:finish].strip(),
            "section_title": hit.group(1).strip(),
            "chapter_title": nearest_heading(chapter_index, begin),
            "part_title": nearest_heading(part_index, begin),
            "page": None,  # offsets in the flattened text carry no page information
            "source": "Bankruptcy Act",
        })

    return records


In [5]:
def extract_sections_with_meta_comp(full_text: str, config: Dict) -> List[Dict]:
    """Split the Companies Act text into one chunk per ``SECTION <n>`` match.

    A chunk spans from its section match to the following one (or to the end
    of the text) and records the last CHAPTER heading that begins at or before it.

    Args:
        full_text: Complete document text.
        config: Accepted for signature parity with the other extractors; unused.

    Returns:
        A list of dicts with keys ``id``, ``chunk``, ``section_title``,
        ``chapter_title``, ``page`` (always ``None``) and ``source``.
    """
    section_re = re.compile(r"(?i)(SECTION\s*\d+[A-Z]?(?:\.\d+)?[A-Z]*)")
    chapter_re = re.compile(r"(?i)^\s*(CHAPTER\s+[IVXLC]+.*?)$", re.MULTILINE)

    # Offset of each chapter heading match -> stripped heading text.
    chapter_titles = {}
    for heading in chapter_re.finditer(full_text):
        chapter_titles[heading.start()] = heading.group(1).strip()
    chapter_offsets = sorted(chapter_titles)

    section_hits = list(section_re.finditer(full_text))
    followers = section_hits[1:] + [None]

    results = []
    for number, (current, following) in enumerate(zip(section_hits, followers)):
        begin = current.start()
        finish = len(full_text) if following is None else following.start()

        # Walk the chapters in document order, remembering the last one that
        # does not start after this section.
        enclosing_chapter = None
        for offset in chapter_offsets:
            if offset > begin:
                break
            enclosing_chapter = chapter_titles[offset]

        results.append({
            "id": f"section_{number}",
            "chunk": full_text[begin:finish].strip(),
            "section_title": current.group(1).strip(),
            "chapter_title": enclosing_chapter,
            "page": None,
            "source": "Companies Act",
        })

    return results

In [6]:
def create_lancedb_index_constitution(chunks: List[Dict], embeddings: np.ndarray, db_path: str, table_name: str):
    """(Re)create the LanceDB table holding the Constitution chunks and their vectors.

    Any existing table called ``table_name`` is dropped and rebuilt.

    Args:
        chunks: Chunk dicts; ``id``, ``chunk``, ``section_title``,
            ``chapter_title``, ``page`` and ``source`` are read, with defaults
            substituted for missing (or ``None``) text fields.
        embeddings: One vector per chunk. Either a 2-D array (rows = chunks) or
            a single 1-D vector, which is treated as a one-row matrix.
        db_path: Directory of the LanceDB database; created if absent.
        table_name: Name of the table to (re)create.

    Returns:
        The created table, or ``None`` if the inputs are misaligned or any
        database step fails (the error is printed).

    Raises:
        ValueError: If ``embeddings`` is neither 1-D nor 2-D.
    """
    if embeddings.ndim == 1:
        # A lone vector: promote it to shape (1, dim) so that embeddings[i]
        # below yields a whole vector instead of a single float.
        embeddings = embeddings.reshape(1, -1)
    elif embeddings.ndim != 2:
        raise ValueError(f"Unexpected embeddings shape: {embeddings.shape}. Expected 1D or 2D array.")
    embedding_dim = embeddings.shape[1]
    print(f"   LanceDB: Using embedding dimension {embedding_dim} for table {table_name}")

    # Refuse to pair chunks with vectors when the counts disagree; checked before
    # anything is dropped so an existing table survives a bad call.
    if len(chunks) != len(embeddings):
        print(f"❌ Error creating LanceDB index for {table_name}: "
              f"{len(chunks)} chunks but {len(embeddings)} embeddings")
        return None

    class ConstitutionArticle(LanceModel):
        id: str
        chunk: str
        embedding: Vector(embedding_dim)
        section_title: str # Assuming article_id from CSV is always a string
        chapter_title: str # Assuming this is always "Indian Constitution"
        page: Optional[int] = None # Page can be None
        source: str

    try:
        os.makedirs(db_path, exist_ok=True)
        db = lancedb.connect(db_path)

        data_to_insert = []
        for i, (chunk, vector) in enumerate(zip(chunks, embeddings)):
            data_to_insert.append({
                "id": chunk.get("id", f"const_default_id_{i}"),
                "chunk": chunk.get("chunk", ""),
                "embedding": vector.tolist(),
                # `or` (not a .get default) so an explicit None also falls back:
                # these schema fields are non-optional strings.
                "section_title": chunk.get("section_title") or "Unknown Article",
                "chapter_title": chunk.get("chapter_title") or "Indian Constitution",
                "page": chunk.get("page"),
                "source": chunk.get("source") or "Indian Constitution",
            })

        if table_name in db.table_names():
            db.drop_table(table_name)
            print(f"Dropped existing table: {table_name}")

        table = db.create_table(table_name, schema=ConstitutionArticle)
        if data_to_insert:
            table.add(data_to_insert)
            print(f"Successfully added {len(data_to_insert)} records to {table_name}")
        else:
            print(f"No data to insert into {table_name}")
        return table

    except Exception as e:
        print(f"❌ Error creating LanceDB index for {table_name}: {e}")
        import traceback
        traceback.print_exc()
        return None

In [7]:
def create_lancedb_index_bankruptcy(chunks: List[Dict], embeddings: np.ndarray, db_path: str, table_name: str):
    """(Re)create the LanceDB table holding the Bankruptcy Act chunks and their vectors.

    Any existing table called ``table_name`` is dropped and rebuilt.

    Args:
        chunks: Chunk dicts; ``id``, ``chunk``, ``part_title``,
            ``chapter_title``, ``section_title``, ``page`` and ``source`` are read.
        embeddings: One vector per chunk. Either a 2-D array (rows = chunks) or
            a single 1-D vector, which is treated as a one-row matrix.
        db_path: Directory of the LanceDB database; created if absent.
        table_name: Name of the table to (re)create.

    Returns:
        The created table, or ``None`` if the inputs are misaligned or any
        database step fails (the error is printed).

    Raises:
        ValueError: If ``embeddings`` is neither 1-D nor 2-D.
    """
    if embeddings.ndim == 1:
        # A lone vector: promote it to shape (1, dim) so that embeddings[i]
        # below yields a whole vector instead of a single float.
        embeddings = embeddings.reshape(1, -1)
    elif embeddings.ndim != 2:
        raise ValueError(f"Unexpected embeddings shape: {embeddings.shape}. Expected 1D or 2D array.")
    embedding_dim = embeddings.shape[1]
    print(f"   LanceDB: Using embedding dimension {embedding_dim} for table {table_name}")

    # Refuse to pair chunks with vectors when the counts disagree; checked before
    # anything is dropped so an existing table survives a bad call.
    if len(chunks) != len(embeddings):
        print(f"❌ Error creating LanceDB index for {table_name}: "
              f"{len(chunks)} chunks but {len(embeddings)} embeddings")
        return None

    class Document1(LanceModel):
        id: str
        chunk: str
        embedding: Vector(embedding_dim)
        part_title: Optional[str] = None
        chapter_title: Optional[str] = None
        section_title: Optional[str] = None
        page: Optional[int] = None
        source: str

    try:
        os.makedirs(db_path, exist_ok=True)
        db = lancedb.connect(db_path)

        data_to_insert = []
        for i, (chunk, vector) in enumerate(zip(chunks, embeddings)):
            data_to_insert.append({
                "id": chunk.get("id", f"bank_default_id_{i}"),
                "chunk": chunk.get("chunk", ""),
                "embedding": vector.tolist(),
                "part_title": chunk.get("part_title"),  # optional columns: None when absent
                "chapter_title": chunk.get("chapter_title"),
                "section_title": chunk.get("section_title"),
                "page": chunk.get("page"),
                "source": chunk.get("source", "Bankruptcy Act"),
            })

        if table_name in db.table_names():
            db.drop_table(table_name)
            print(f"Dropped existing table: {table_name}")

        table = db.create_table(table_name, schema=Document1)
        if data_to_insert:
            table.add(data_to_insert)
            print(f"Successfully added {len(data_to_insert)} records to {table_name}")
        else:
            print(f"No data to insert into {table_name}")
        return table
    except Exception as e:
        print(f"❌ Error creating LanceDB index for {table_name}: {e}")
        import traceback
        traceback.print_exc()
        return None

In [8]:
def create_lancedb_index_company(chunks: List[Dict], embeddings: np.ndarray, db_path: str, table_name: str):
    """(Re)create the LanceDB table holding the Companies Act chunks and their vectors.

    Any existing table called ``table_name`` is dropped and rebuilt.

    Args:
        chunks: Chunk dicts; ``id``, ``chunk``, ``section_title``,
            ``chapter_title``, ``page`` and ``source`` are read.
        embeddings: One vector per chunk. Either a 2-D array (rows = chunks) or
            a single 1-D vector, which is treated as a one-row matrix.
        db_path: Directory of the LanceDB database; created if absent.
        table_name: Name of the table to (re)create.

    Returns:
        The created table, or ``None`` if the inputs are misaligned or any
        database step fails (the error is printed).

    Raises:
        ValueError: If ``embeddings`` is neither 1-D nor 2-D.
    """
    if embeddings.ndim == 1:
        # A lone vector: promote it to shape (1, dim) so that embeddings[i]
        # below yields a whole vector instead of a single float.
        embeddings = embeddings.reshape(1, -1)
    elif embeddings.ndim != 2:
        raise ValueError(f"Unexpected embeddings shape: {embeddings.shape}. Expected 1D or 2D array.")
    embedding_dim = embeddings.shape[1]
    print(f"   LanceDB: Using embedding dimension {embedding_dim} for table {table_name}")

    # Refuse to pair chunks with vectors when the counts disagree; checked before
    # anything is dropped so an existing table survives a bad call.
    if len(chunks) != len(embeddings):
        print(f"❌ Error creating LanceDB index for {table_name}: "
              f"{len(chunks)} chunks but {len(embeddings)} embeddings")
        return None

    class Document2(LanceModel):
        id: str
        chunk: str
        embedding: Vector(embedding_dim)
        section_title: Optional[str] = None
        chapter_title: Optional[str] = None
        page: Optional[int] = None
        source: str

    try:
        os.makedirs(db_path, exist_ok=True)
        db = lancedb.connect(db_path)

        data_to_insert = []
        for i, (chunk, vector) in enumerate(zip(chunks, embeddings)):
            data_to_insert.append({
                "id": chunk.get("id", f"comp_default_id_{i}"),
                "chunk": chunk.get("chunk", ""),
                "embedding": vector.tolist(),
                "section_title": chunk.get("section_title"),  # optional columns: None when absent
                "chapter_title": chunk.get("chapter_title"),
                "page": chunk.get("page"),
                "source": chunk.get("source", "Companies Act"),
            })

        if table_name in db.table_names():
            db.drop_table(table_name)
            print(f"Dropped existing table: {table_name}")

        table = db.create_table(table_name, schema=Document2)
        if data_to_insert:
            table.add(data_to_insert)
            print(f"Successfully added {len(data_to_insert)} records to {table_name}")
        else:
            print(f"No data to insert into {table_name}")
        return table
    except Exception as e:
        print(f"❌ Error creating LanceDB index for {table_name}: {e}")
        import traceback
        traceback.print_exc()
        return None

In [9]:
def load_constitution_articles(file_path: str) -> List[Dict]:
    """Load Constitution articles from a CSV into chunk dictionaries.

    The ``article_desc`` column becomes the chunk text and ``article_id`` the
    section title. A row that lacks either value is still kept: the text
    defaults to ``""`` and the title to ``UnknownArticle_<row index>``.

    Args:
        file_path: Path of the CSV file (UTF-8, with or without a BOM).

    Returns:
        One dict per CSV row with keys ``id``, ``chunk``, ``section_title``,
        ``chapter_title``, ``page`` and ``source``; an empty list if the file
        is missing or cannot be read (the error is printed).
    """
    articles = []
    try:
        # 'utf-8-sig' strips a leading byte-order mark, which would otherwise be
        # glued onto the first header name and hide that column from .get().
        with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
            reader = csv.DictReader(csvfile)
            for i, row in enumerate(reader):
                # DictReader fills the fields of a short row with None, so the
                # .get() default alone is not enough to make .strip() safe.
                description = row.get("article_desc")
                if description is None:
                    description = ""
                article_id = row.get("article_id")
                if article_id is None:
                    article_id = f"UnknownArticle_{i}"
                articles.append({
                    "id": f"constitution_article_{i}",
                    "chunk": description.strip(),
                    "section_title": article_id.strip(), # Corresponds to article_id
                    "chapter_title": "Indian Constitution", # Assigning a general chapter title
                    "page": None,
                    "source": "Indian Constitution"
                })
        return articles
    except FileNotFoundError:
        print(f"❌ CSV file not found: {file_path}")
    except Exception as e:
        print(f"❌ Error loading Constitution articles: {e}")
    return []

In [10]:
# --- Global Instance for Semantic Chunker Embeddings ---
_semantic_embeddings_instance = None
# Name of the model the cached instance above was built with.
_semantic_embeddings_model_name = None

def get_semantic_embeddings_instance(model_name="thenlper/gte-base"):
    """Return a cached ``HuggingFaceEmbeddings`` instance for semantic chunking.

    The instance is built on first use and reused on later calls with the same
    ``model_name``. Asking for a different model rebuilds the instance rather
    than handing back embeddings from the previously requested model.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        The ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    global _semantic_embeddings_instance, _semantic_embeddings_model_name
    if _semantic_embeddings_instance is None or _semantic_embeddings_model_name != model_name:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"ℹ️ Initializing Semantic Embeddings on '{device}' with model '{model_name}' for chunking.")
        # This will now use the new import path from langchain_huggingface
        _semantic_embeddings_instance = HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': device},
            encode_kwargs={'normalize_embeddings': True} # GTE models often benefit from normalization
        )
        _semantic_embeddings_model_name = model_name
    return _semantic_embeddings_instance

In [11]:
def process_pdf_document_advanced(
    pdf_path: str,
    config: Dict,
    document_type: str, # e.g., "CompaniesAct", "BankruptcyAct"
    semantic_chunker_embeddings # Pass the initialized embeddings instance
) -> List[Dict]:
    """
    Processes a PDF document using Unstructured for parsing, regex for coarse structuring,
    and SemanticChunker/RecursiveCharacterTextSplitter for fine-grained chunking.

    Steps:
      1. Extract text with Unstructured (``hi_res``); on any error, or if it
         yields no text, fall back to PyMuPDF.
      2. Split the text into coarse section chunks with the regex extractor
         matching ``document_type`` (an unknown type keeps the whole text as one chunk).
      3. Split coarse chunks longer than 1500 characters: SemanticChunker for
         inputs under 25000 characters, RecursiveCharacterTextSplitter otherwise
         or whenever the first attempt raises.

    Args:
        pdf_path: Path of the PDF to process.
        config: Notebook configuration, forwarded to the regex extractors.
        document_type: ``"BankruptcyAct"`` or ``"CompaniesAct"``.
        semantic_chunker_embeddings: Embeddings object handed to SemanticChunker.

    Returns:
        Final chunk dicts (sub-chunks inherit their parent's metadata and get an
        ``<parent id>_sub_<j>`` id). Empty if no text or no coarse chunk was found.
    """
    print(f"📄 Advanced processing for: {pdf_path}")
    final_processed_chunks = []
    full_text = ""

    try:
        # 1. Use Unstructured to Parse the PDF
        print(f"    Parsing with Unstructured (strategy: hi_res)... This may take a while.")
        print("    Ensure Poppler is installed and in PATH for 'hi_res' PDF strategy if not using OCR-only methods.")
        elements = partition_pdf(
            filename=pdf_path,
            strategy="hi_res", # Uses layout detection. Requires Poppler and Detectron2.
                               # Fallback to "fast" or "ocr_only" (needs Tesseract) if "hi_res" has issues.
            infer_table_structure=True,
            # model_name="yolox" # Default for hi_res, usually not needed to specify
        )
        full_text = "\n\n".join([el.text for el in elements if hasattr(el, 'text') and el.text.strip()])
        if not full_text.strip():
            print(f"⚠️ Unstructured extracted no text from {pdf_path}.")
            # Raising routes an empty result through the same Fitz fallback as a real error.
            raise ValueError("Unstructured returned no text.")
        print(f"    Unstructured extracted text length: {len(full_text)} chars.")

    except Exception as e:
        print(f"❌ Error during Unstructured parsing for {pdf_path}: {e}")
        print(f"    Falling back to Fitz (PyMuPDF) for PDF text extraction.")
        try:
            doc = fitz.open(pdf_path)
            try:
                page_texts = [page.get_text() for page in doc]
            finally:
                # Close the document even when a page fails to extract.
                doc.close()
            full_text = "\n".join(page_texts)
            if not full_text.strip():
                print(f"⚠️ Fitz (PyMuPDF) also extracted no text from {pdf_path}.")
                return [] # Critical failure if both methods yield no text
            print(f"    Fitz (PyMuPDF) extracted text length: {len(full_text)} chars.")
        except Exception as e_fitz:
            print(f"❌ Fitz (PyMuPDF) also failed for {pdf_path}: {e_fitz}")
            return [] # Critical failure

    # 2. Use existing regex for Coarse Chunking (Sections, Chapters, Parts)
    print(f"    Applying regex for coarse structuring...")
    coarse_chunks_with_meta = []
    if document_type == "BankruptcyAct":
        coarse_chunks_with_meta = extract_sections_bankruptcy_act(full_text, config)
    elif document_type == "CompaniesAct":
        coarse_chunks_with_meta = extract_sections_with_meta_comp(full_text, config)
    else:
        print(f"⚠️ Unknown document type for regex extraction: {document_type}")
        coarse_chunks_with_meta = [{"id": "doc_0", "chunk": full_text, "section_title": "Full Document", "source": document_type}]

    if not coarse_chunks_with_meta:
        print(f"⚠️ Regex extraction yielded no coarse chunks for {pdf_path}.")
        return []
    print(f"    Found {len(coarse_chunks_with_meta)} coarse chunks via regex.")

    # 3. Fine-Grained Chunking for long coarse chunks
    print(f"    Fine-graining long chunks...")
    semantic_text_splitter = SemanticChunker(
        semantic_chunker_embeddings,
        breakpoint_threshold_type="percentile"
    )
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=160,
        separators=["\n\nSECTION ", "\n\nPART ", "\n\nCHAPTER ", "\n\n", "\n", ". ", ", ", " ", ""],
        length_function=len
    )

    LENGTH_THRESHOLD_CHARS = 1500
    MAX_CHARS_FOR_SEMANTIC_CHUNK_INPUT = 25000 # Safety for SemanticChunker

    for i, coarse_chunk in enumerate(coarse_chunks_with_meta):
        section_text = coarse_chunk["chunk"]

        if len(section_text) > LENGTH_THRESHOLD_CHARS:
            sub_chunks_texts = []
            try:
                if len(section_text) < MAX_CHARS_FOR_SEMANTIC_CHUNK_INPUT:
                    sub_chunks_texts = semantic_text_splitter.split_text(section_text)
                else:
                    sub_chunks_texts = recursive_splitter.split_text(section_text)
            except Exception as e_semantic:
                # Report the failure instead of hiding it, then degrade to the
                # purely character-based splitter.
                print(f"      ⚠️ SemanticChunker failed for coarse chunk {coarse_chunk.get('id', i)}: {e_semantic}. Falling back to Recursive.")
                sub_chunks_texts = recursive_splitter.split_text(section_text)

            for j, sub_chunk_text in enumerate(sub_chunks_texts):
                if not sub_chunk_text.strip(): continue
                final_processed_chunks.append({
                    "id": f"{coarse_chunk.get('id', f'c_{i}')}_sub_{j}",
                    "chunk": sub_chunk_text.strip(),
                    "section_title": coarse_chunk.get("section_title"),
                    "chapter_title": coarse_chunk.get("chapter_title"),
                    "part_title": coarse_chunk.get("part_title"),
                    "page": coarse_chunk.get("page"),
                    "source": coarse_chunk.get("source"),
                })
        elif section_text.strip():
            # Short enough to index as-is; whitespace-only chunks are dropped.
            coarse_chunk["chunk"] = coarse_chunk["chunk"].strip()
            final_processed_chunks.append(coarse_chunk)

    print(f"    Processed into {len(final_processed_chunks)} final chunks for {pdf_path}.")
    return final_processed_chunks

In [12]:
def index_bankruptcy_act_advanced(config: Dict, main_embedder_instance) -> None:
    """Chunk, embed and index the Bankruptcy Act PDF into LanceDB.

    Args:
        config: Notebook configuration; reads ``bank_pdf``, ``db_path``,
            ``bank_table`` and, optionally, ``semantic_chunker_model``.
        main_embedder_instance: Object whose ``encode(list_of_texts)`` returns
            the vectors to store.
    """
    print("\n⚙️ Indexing Bankruptcy Act (Advanced Pipeline)...")
    semantic_embeddings = get_semantic_embeddings_instance(config.get("semantic_chunker_model", "thenlper/gte-base"))
    processed_chunks = process_pdf_document_advanced(
        pdf_path=config["bank_pdf"],
        config=config,
        document_type="BankruptcyAct",
        semantic_chunker_embeddings=semantic_embeddings
    )
    if not processed_chunks:
        print("⚠️ No chunks to index for Bankruptcy Act.")
        return

    print(f"    Embedding {len(processed_chunks)} chunks with main embedder...")
    embeddings_np = main_embedder_instance.encode([c["chunk"] for c in processed_chunks])
    table_created = create_lancedb_index_bankruptcy(processed_chunks, embeddings_np, config["db_path"], config["bank_table"])
    # Failure is signalled by None. Compare explicitly: a table object may define
    # __len__, in which case plain truthiness would depend on its row count.
    if table_created is not None:
        print(f"✅ LanceDB index created/updated for '{config['bank_table']}'.")
    else:
        print(f"⚠️ LanceDB index creation FAILED for '{config['bank_table']}'.")

In [13]:
def index_company_act_advanced(config: Dict, main_embedder_instance) -> None:
    """Chunk, embed and index the Companies Act PDF into LanceDB.

    Args:
        config: Notebook configuration; reads ``comp_pdf``, ``db_path``,
            ``comp_table`` and, optionally, ``semantic_chunker_model``.
        main_embedder_instance: Object whose ``encode(list_of_texts)`` returns
            the vectors to store.
    """
    print("\n⚙️ Indexing Company Act (Advanced Pipeline)...")
    semantic_embeddings = get_semantic_embeddings_instance(config.get("semantic_chunker_model", "thenlper/gte-base"))
    processed_chunks = process_pdf_document_advanced(
        pdf_path=config["comp_pdf"],
        config=config,
        document_type="CompaniesAct",
        semantic_chunker_embeddings=semantic_embeddings
    )
    if not processed_chunks:
        print("⚠️ No chunks to index for Company Act.")
        return

    print(f"    Embedding {len(processed_chunks)} chunks with main embedder...")
    embeddings_np = main_embedder_instance.encode([c["chunk"] for c in processed_chunks])
    table_created = create_lancedb_index_company(processed_chunks, embeddings_np, config["db_path"], config["comp_table"])
    # Failure is signalled by None. Compare explicitly: a table object may define
    # __len__, in which case plain truthiness would depend on its row count.
    if table_created is not None:
        print(f"✅ LanceDB index created/updated for '{config['comp_table']}'.")
    else:
        print(f"⚠️ LanceDB index creation FAILED for '{config['comp_table']}'.")

In [14]:
def index_constitution_articles_advanced(config: Dict, main_embedder_instance) -> None:
    """Load, chunk, embed and index the Constitution articles CSV into LanceDB.

    Articles longer than 1500 characters are split: SemanticChunker for inputs
    under 25000 characters, RecursiveCharacterTextSplitter otherwise or when
    the first attempt raises. Shorter articles are indexed whole.

    Args:
        config: Notebook configuration; reads ``constitution_csv``, ``db_path``,
            ``constitution_table`` and, optionally, ``semantic_chunker_model``.
        main_embedder_instance: Object whose ``encode(list_of_texts)`` returns
            the vectors to store.
    """
    print("\n⚙️ Indexing Indian Constitution from CSV (Advanced Pipeline)...")
    coarse_chunks_with_meta = load_constitution_articles(config["constitution_csv"])
    if not coarse_chunks_with_meta:
        print("⚠️ No articles loaded from Constitution CSV. Skipping indexing.")
        return
    print(f"    Loaded {len(coarse_chunks_with_meta)} articles from CSV.")

    final_processed_chunks = []
    semantic_embeddings = get_semantic_embeddings_instance(config.get("semantic_chunker_model", "thenlper/gte-base"))
    semantic_text_splitter = SemanticChunker(
        semantic_embeddings, breakpoint_threshold_type="percentile"
    )
    recursive_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=160, separators=["\n\n", "\n", ". ", ", ", " ", ""], length_function=len
    )
    LENGTH_THRESHOLD_CHARS = 1500
    MAX_CHARS_FOR_SEMANTIC_CHUNK_INPUT = 25000

    print(f"    Fine-graining long articles...")
    for i, coarse_chunk in enumerate(coarse_chunks_with_meta):
        article_text = coarse_chunk["chunk"]
        if len(article_text) > LENGTH_THRESHOLD_CHARS:
            sub_chunks_texts = []
            try:
                if len(article_text) < MAX_CHARS_FOR_SEMANTIC_CHUNK_INPUT:
                    sub_chunks_texts = semantic_text_splitter.split_text(article_text)
                else:
                    sub_chunks_texts = recursive_splitter.split_text(article_text)
            except Exception as e_semantic:
                # Report the failure instead of hiding it, then degrade to the
                # purely character-based splitter.
                print(f"      ⚠️ SemanticChunker failed for article {coarse_chunk.get('id', i)}: {e_semantic}. Falling back to Recursive.")
                sub_chunks_texts = recursive_splitter.split_text(article_text)

            for j, sub_chunk_text in enumerate(sub_chunks_texts):
                if not sub_chunk_text.strip(): continue
                final_processed_chunks.append({
                    "id": f"{coarse_chunk.get('id', f'csv_{i}')}_sub_{j}",
                    "chunk": sub_chunk_text.strip(),
                    "section_title": coarse_chunk.get("section_title"),
                    "chapter_title": coarse_chunk.get("chapter_title"),
                    "page": coarse_chunk.get("page"),
                    "source": coarse_chunk.get("source"),
                })
        elif article_text.strip():
            # Short enough to index as-is; whitespace-only articles are dropped.
            coarse_chunk["chunk"] = coarse_chunk["chunk"].strip()
            final_processed_chunks.append(coarse_chunk)

    if not final_processed_chunks:
        print("⚠️ No final chunks to index for Constitution.")
        return

    print(f"    Processed into {len(final_processed_chunks)} final chunks for Constitution.")
    print(f"    Embedding {len(final_processed_chunks)} chunks with main embedder...")
    embeddings_np = main_embedder_instance.encode([c["chunk"] for c in final_processed_chunks])
    table_created = create_lancedb_index_constitution(final_processed_chunks, embeddings_np, config["db_path"], config["constitution_table"])
    # Failure is signalled by None. Compare explicitly: a table object may define
    # __len__, in which case plain truthiness would depend on its row count.
    if table_created is not None:
        print(f"✅ LanceDB index created/updated for '{config['constitution_table']}'.")
    else:
        print(f"⚠️ LanceDB index creation FAILED for '{config['constitution_table']}'.")

In [15]:
# RUN ADVANCED INDEXING PIPELINE
# Builds the embedder whose vectors are stored in LanceDB, then (re)indexes the
# Bankruptcy Act, the Companies Act and the Constitution CSV in that order.
print("▶️ Initializing main embedder for LanceDB storage...")

# --- Choose your MAIN embedder for storage ---
# # Option 1: Your existing LegalBERTEmbedder (ensure legalbert_embedder.py is present)
# main_embedder = LegalBERTEmbedder()
# print(f"   Using LegalBERTEmbedder (model: {config['model_name']}) as the main embedder.")

# Option 2: A different SentenceTransformer model
main_embedder_model_name = "sentence-transformers/all-mpnet-base-v2" # or "thenlper/gte-base"
print(f"   Using '{main_embedder_model_name}' as the main embedder.")
main_embedder = SentenceTransformer(main_embedder_model_name)
# NOTE: the create_lancedb_index_* functions derive the vector dimension from the
# embeddings they receive, so switching models needs no schema edit. Each run drops
# and rebuilds the tables; queries must use the same model that built them.

# Record the chosen model name alongside the rest of the settings.
config["main_embedder_model_name"] = main_embedder_model_name

index_bankruptcy_act_advanced(config, main_embedder)
index_company_act_advanced(config, main_embedder)
index_constitution_articles_advanced(config, main_embedder)

print("\n🎉 All advanced indexing pipelines completed.")

▶️ Initializing main embedder for LanceDB storage...
   Using 'sentence-transformers/all-mpnet-base-v2' as the main embedder.

⚙️ Indexing Bankruptcy Act (Advanced Pipeline)...
ℹ️ Initializing Semantic Embeddings on 'cpu' with model 'thenlper/gte-base' for chunking.
📄 Advanced processing for: BankruptcyAct.pdf
    Parsing with Unstructured (strategy: hi_res)... This may take a while.
    Ensure Poppler is installed and in PATH for 'hi_res' PDF strategy if not using OCR-only methods.
    Unstructured extracted text length: 385848 chars.
    Applying regex for coarse structuring...
    Found 507 coarse chunks via regex.
    Fine-graining long chunks...
    Processed into 597 final chunks for BankruptcyAct.pdf.
    Embedding 597 chunks with main embedder...
   LanceDB: Using embedding dimension 768 for table BankruptcyAct
Successfully added 597 records to BankruptcyAct
✅ LanceDB index created/updated for 'BankruptcyAct'.

⚙️ Indexing Company Act (Advanced Pipeline)...
📄 Advanced processin

In [None]:
import lancedb
# from legalbert_embedder import LegalBERTEmbedder # Already imported earlier if main_embedder is LegalBERTEmbedder
# from sentence_transformers import SentenceTransformer # Already imported earlier if main_embedder is SentenceTransformer
import pandas as pd

In [20]:
class LegalRetriever:
    """Vector search across one or more LanceDB tables with a shared embedder.

    Each table is searched for its ``top_k`` nearest chunks; the per-table hits
    are merged, ordered by ascending ``_distance`` and cut to ``top_k`` overall.
    """

    def __init__(self, main_embedder_instance, top_k: int = 5):
        """Store the embedder and result limit.

        Args:
            main_embedder_instance: Object whose ``encode(list_of_texts)``
                returns one vector per text; should be the model used for indexing.
            top_k: Number of hits fetched per table and returned overall.
        """
        self.model = main_embedder_instance # Use the same embedder instance used for indexing
        self.top_k = top_k
        self.dbs = {}  # "<db_path>_<table_name>" -> opened table, reused across queries

    def _get_table(self, db_path: str, table_name: str):
        """Return the opened table for ``db_path``/``table_name``, opening it once.

        Raises:
            RuntimeError: If the database or the table cannot be opened.
        """
        db_key = f"{db_path}_{table_name}" # Unique key for db connection + table
        if db_key not in self.dbs:
            try:
                db_conn = lancedb.connect(db_path)
                self.dbs[db_key] = db_conn.open_table(table_name)
            except Exception as e:
                raise RuntimeError(f"Failed to connect to DB at {db_path} or open table {table_name}: {str(e)}") from e
        return self.dbs[db_key]

    @staticmethod
    def _none_if_missing(value):
        """Map pandas missing markers (NaN/None/NaT) to ``None``; pass other values through."""
        try:
            return None if pd.isna(value) else value
        except (TypeError, ValueError):
            # Non-scalar values (e.g. arrays) have no single "missing" answer.
            return value

    def query_multiple(self, query_text: str, tables_to_search: list[dict]) -> list:
        """Search every listed table and return the best ``top_k`` hits overall.

        Args:
            query_text: Natural-language query to embed.
            tables_to_search: Dicts with ``db_path`` and ``table_name`` keys.
                A table that fails to open or query is skipped with a warning.

        Returns:
            Result dicts with keys ``id``, ``chunk``, ``part_title``,
            ``chapter_title``, ``section_title``, ``page``, ``source`` and
            ``score`` (the ``_distance`` value; lower is closer). Metadata a
            table does not provide is ``None``.

        Raises:
            RuntimeError: If the query cannot be embedded or the results merged.
        """
        try:
            query_vec = self.model.encode([query_text])[0].tolist()
        except Exception as e:
            raise RuntimeError(f"Failed to embed query: {str(e)}") from e

        all_results_dfs = []

        for tbl_info in tables_to_search:
            try:
                table_obj = self._get_table(tbl_info["db_path"], tbl_info["table_name"])
                df = table_obj.search(query_vec).limit(self.top_k).to_pandas()
                all_results_dfs.append(df)
            except Exception as e:
                print(f"Warning: Failed to query table '{tbl_info['table_name']}' in DB '{tbl_info['db_path']}': {str(e)}")

        if not all_results_dfs:
            return []

        try:
            merged_df = pd.concat(all_results_dfs, ignore_index=True)
            if "_distance" in merged_df.columns:
                merged_df = merged_df.sort_values(by="_distance", ascending=True)
            else:
                print("Warning: '_distance' column not found in search results for sorting.")
        except Exception as e:
            raise RuntimeError(f"Failed to process merged results: {str(e)}") from e

        # Format results
        output_results = []
        for _, row in merged_df.head(self.top_k).iterrows(): # Ensure only top_k overall are returned
            # Tables have different schemas, so concat pads absent columns with
            # NaN; normalise those to None for callers.
            output_results.append({
                "id": self._none_if_missing(row.get("id")),
                "chunk": self._none_if_missing(row.get("chunk")),
                "part_title": self._none_if_missing(row.get("part_title")),
                "chapter_title": self._none_if_missing(row.get("chapter_title")),
                "section_title": self._none_if_missing(row.get("section_title")),
                "page": self._none_if_missing(row.get("page")),
                "source": self._none_if_missing(row.get("source")),
                "score": row.get("_distance")
            })
        return output_results

In [21]:
# Initialize the retriever.
# This cell builds its own SentenceTransformer instead of reusing the object from
# the indexing cell, so retrieval can run without re-indexing first.
# The model name MUST match the one used when the tables were built; otherwise
# query vectors and stored vectors are not comparable.
main_embedder_model_name = "sentence-transformers/all-mpnet-base-v2" # or "thenlper/gte-base"
print(f"   Using '{main_embedder_model_name}' as the main embedder.")
main_embedder = SentenceTransformer(main_embedder_model_name)

# top_k=5: up to five hits are fetched per table and five are returned overall.
retriever = LegalRetriever(main_embedder_instance=main_embedder, top_k=5)

def query_legal_documents(query: str) -> List[Dict]:
    """Search the Companies Act, Bankruptcy Act and Constitution tables for ``query``.

    Uses the module-level ``retriever`` and the table names / database path
    stored in ``config``.

    Returns:
        Whatever ``retriever.query_multiple`` returns for the three tables.
    """
    table_keys = ("comp_table", "bank_table", "constitution_table")
    search_targets = [
        {"db_path": config["db_path"], "table_name": config[key]}
        for key in table_keys
    ]
    return retriever.query_multiple(query, search_targets)

In [None]:
# Cross-act question: expected to draw on both the Companies Act and the Bankruptcy Act tables.
results1 = query_legal_documents(query = "What are the requirements for corporate insolvency resolution under the Companies Act and the Insolvency and Bankruptcy Code?")
for res in results1:
    # Result dicts always carry a 'section_title' key (possibly None), so a
    # .get() default would never fire; `or` gives the intended fallback to the id.
    section_label = res.get('section_title') or res.get('id')
    print(f"Source: {res['source']}, Section: {section_label}, Score: {res['score']:.2f}")
    print(f"Chunk: {res['chunk'][:300]}...\n")

In [None]:
# Verbatim-passage query: checks that a quoted proviso retrieves the chunk it came from.
results2 = query_legal_documents(query = "Provided that a company in respect of which such appeal or reference or inquiry stands abated under this clause may make reference to the National Company Law Tribunal under the Insolvency and Bankruptcy Code, 2016 within one hundred and eighty days from the commencement of the Insolvency and Bankruptcy Code, 2016 in accordance with the provisions of the Insolvency and Bankruptcy Code, 2016:")
for res in results2:
    # Result dicts always carry a 'section_title' key (possibly None), so a
    # .get() default would never fire; `or` gives the intended fallback to the id.
    section_label = res.get('section_title') or res.get('id')
    print(f"Source: {res['source']}, Section: {section_label}, Score: {res['score']:.2f}")
    print(f"Chunk: {res['chunk'][:300]}...\n")

In [22]:
# Single-act question about share buy-backs under the Companies Act.
results3 = query_legal_documents(query = "What are the regulations on buying back shares under the Companies Act?")
for res in results3:
    # Result dicts always carry a 'section_title' key (possibly None), so a
    # .get() default would never fire; `or` gives the intended fallback to the id.
    section_label = res.get('section_title') or res.get('id')
    print(f"Source: {res['source']}, Section: {section_label}, Score: {res['score']:.2f}")
    print(f"Chunk: {res['chunk'][:300]}...\n")

Source: Companies Act, Section: section 447, Score: 0.59
Chunk: 58 
 
68. Power of company to purchase its own securities.—(1) Notwithstanding anything contained in 
this Act, but subject to the provisions of sub-section (2), a company may purchase its own shares or other 
specified securities (hereinafter referred to as buy-back) out of— 
(a) its free reserves;...

Source: Companies Act, Section: section 133, Score: 0.65
Chunk: section 133 or any other provision of this Act and a certificate to that effect by the 
company’s auditor has been filed with the Tribunal. 
(4) The order of confirmation of the reduction of share capital by the Tribunal under sub-section (3) 
shall be published by the company in such manner as the ...

Source: Companies Act, Section: section 62, Score: 0.66
Chunk: section 62 or other specified securities within a period of six 
months except by way of a bonus issue or in the discharge of subsisting obligations such as conversion of 
warrants, stock option sche

In [None]:
import lancedb
import pyarrow.compute as pc

# Example: Inspect a specific document from BankruptcyAct
# Debug aid: loads the table into Arrow, takes the first row's id and looks that
# id up again to show the stored record and its field names.
try:
    db_inspect = lancedb.connect(config["db_path"])
    table_inspect_name = config["bank_table"]
    # Same existence check as the create_lancedb_index_* functions use.
    if table_inspect_name in db_inspect.table_names():
        table_inspect = db_inspect.open_table(table_inspect_name)

        # Fetch all IDs to find one for testing, or use a known one
        all_data = table_inspect.to_arrow()
        if len(all_data) > 0:
            # Pick the first ID as an example, if IDs are structured like 'bankruptcy_section_0_sub_0'
            doc_id_to_find = all_data['id'][0].as_py()
            print(f"Attempting to fetch document with ID: {doc_id_to_find} from {table_inspect_name}")

            mask = pc.equal(all_data["id"], doc_id_to_find)
            filtered_table = all_data.filter(mask)
            result = filtered_table.to_pylist()

            if result:
                print("\nDocument found:")
                print(result[0])
                print("\nKeys in document:", result[0].keys())
            else:
                print(f"Document with ID {doc_id_to_find} not found in {table_inspect_name}.")
        else:
            print(f"Table {table_inspect_name} is empty.")
    else:
        print(f"Table {table_inspect_name} does not exist in {config['db_path']}")
except Exception as e:
    print(f"Error during debug inspection: {e}")