⚙️ Config-Driven Legal RAG Indexer with LanceDB + Legal-BERT

In [3]:
import csv
import os
import re
from bisect import bisect_right
from typing import List, Dict, Optional

import fitz
import lancedb
import numpy as np
from lancedb.pydantic import Vector, LanceModel

from legalbert_embedder import LegalBERTEmbedder

In [4]:
# 🔧 CONFIGURATION
# One dictionary of settings that the indexing and retrieval cells read from.
config = dict(
    # NOTE(review): "model_name" is not read by any code visible in this notebook.
    model_name="nlpaueb/legal-bert-base-uncased",

    # Source PDFs and the LanceDB directory
    comp_pdf="CompaniesAct.pdf",
    bank_pdf="BankruptcyAct.pdf",
    db_path="./Data",

    # Table names for LanceDB
    comp_table="CompaniesAct",
    bank_table="BankruptcyAct",
    constitution_table="IndianConstitution",

    # CSV path for Indian Constitution
    constitution_csv="Indian_Constitution.csv",
)


In [8]:
def load_pdf_text(path: str) -> List[str]:
    """Return the text of every page of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF to open.

    Returns:
        One string per page, in page order, as returned by ``page.get_text()``.
    """
    # The document wraps an open file; the context manager closes it even if
    # text extraction fails part-way through.
    with fitz.open(path) as doc:
        return [page.get_text() for page in doc]

In [9]:
def extract_sections_bankruptcy_act(full_text: str, config: Dict) -> List[Dict]:
    """Split the Bankruptcy Act text into one chunk per ``SECTION <n>`` match.

    Each match of the section pattern opens a chunk that runs up to the next
    match (or to the end of the text). Text before the first match is not
    returned. Every chunk is tagged with the last PART heading and the last
    CHAPTER heading that start at or before the chunk.

    NOTE(review): the section pattern is case-insensitive and not anchored to
    a line start, so in-sentence references such as "section 3 of ..." also
    open a chunk; confirm whether that is intended.

    Args:
        full_text: Complete text of the act.
        config: Pipeline configuration; not read by this function.

    Returns:
        A list of dicts with the keys ``id``, ``chunk``, ``section_title``,
        ``chapter_title``, ``part_title``, ``page`` (always ``None``) and
        ``source``. ``chapter_title`` / ``part_title`` are ``None`` when no
        such heading precedes the section.
    """
    section_pattern = re.compile(r"(?i)(SECTION\s*\d+[A-Z]?(?:\.\d+)?(?:[A-Z]*)?)")
    part_pattern = re.compile(r"(?i)^\s*(PART\s+[A-Z]+.*?)$", re.MULTILINE)
    chapter_pattern = re.compile(r"(?i)^\s*(CHAPTER\s+[IVXLC]+.*?)$", re.MULTILINE)

    section_matches = list(section_pattern.finditer(full_text))
    part_matches = list(part_pattern.finditer(full_text))
    chapter_matches = list(chapter_pattern.finditer(full_text))

    # finditer yields matches left to right, so the start offsets are already
    # ascending and can be binary-searched without an extra sort.
    part_starts = [m.start() for m in part_matches]
    part_titles = [m.group(1).strip() for m in part_matches]
    chapter_starts = [m.start() for m in chapter_matches]
    chapter_titles = [m.group(1).strip() for m in chapter_matches]

    text_end = len(full_text)
    chunks = []
    for i, match in enumerate(section_matches):
        start = match.start()
        is_last = i + 1 == len(section_matches)
        end = text_end if is_last else section_matches[i + 1].start()

        # bisect_right(...) - 1 is the index of the last heading whose start
        # offset is <= start; -1 means no heading precedes this section.
        part_idx = bisect_right(part_starts, start) - 1
        chapter_idx = bisect_right(chapter_starts, start) - 1

        chunks.append({
            "id": f"bankruptcy_section_{i}",
            "chunk": full_text[start:end].strip(),
            "section_title": match.group(1).strip(),
            "chapter_title": chapter_titles[chapter_idx] if chapter_idx >= 0 else None,
            "part_title": part_titles[part_idx] if part_idx >= 0 else None,
            "page": None,
            "source": "Bankruptcy Act",
        })

    return chunks


In [10]:
def extract_sections_with_meta_comp(full_text: str, config: Dict) -> List[Dict]:
    """Split the Companies Act text into one chunk per ``SECTION <n>`` match.

    Each match of the section pattern opens a chunk that runs up to the next
    match (or to the end of the text). Text before the first match is not
    returned. Every chunk is tagged with the last CHAPTER heading that starts
    at or before the chunk.

    NOTE(review): the section pattern is case-insensitive and not anchored to
    a line start, so in-sentence references such as "section 3 of ..." also
    open a chunk; confirm whether that is intended.

    Args:
        full_text: Complete text of the act.
        config: Pipeline configuration; not read by this function.

    Returns:
        A list of dicts with the keys ``id``, ``chunk``, ``section_title``,
        ``chapter_title``, ``page`` (always ``None``) and ``source``.
        ``chapter_title`` is ``None`` when no chapter heading precedes the
        section.
    """
    section_pattern = re.compile(r"(?i)(SECTION\s*\d+[A-Z]?(?:\.\d+)?[A-Z]*)")
    chapter_pattern = re.compile(r"(?i)^\s*(CHAPTER\s+[IVXLC]+.*?)$", re.MULTILINE)

    matches = list(section_pattern.finditer(full_text))
    chapter_matches = list(chapter_pattern.finditer(full_text))

    # finditer yields matches left to right, so the start offsets are already
    # ascending and can be binary-searched without an extra sort.
    chapter_starts = [m.start() for m in chapter_matches]
    chapter_titles = [m.group(1).strip() for m in chapter_matches]

    text_end = len(full_text)
    chunks = []
    for i, match in enumerate(matches):
        start = match.start()
        is_last = i + 1 == len(matches)
        end = text_end if is_last else matches[i + 1].start()

        # Index of the last chapter whose start offset is <= start; -1 means
        # the section appears before any chapter heading.
        chapter_idx = bisect_right(chapter_starts, start) - 1

        chunks.append({
            "id": f"section_{i}",
            "chunk": full_text[start:end].strip(),
            "section_title": match.group(1).strip(),
            "chapter_title": chapter_titles[chapter_idx] if chapter_idx >= 0 else None,
            "page": None,
            "source": "Companies Act",
        })

    return chunks

In [11]:
def create_lancedb_index_constitution(chunks: List[Dict], embeddings: np.ndarray, db_path: str, table_name: str):
    """Write Constitution articles and their embeddings to a LanceDB table.

    The table is created with ``mode="overwrite"``, so an existing table of
    the same name is replaced. The input ``chunks`` are not modified; each
    stored record is a copy of a chunk plus an ``embedding`` list.

    Args:
        chunks: Article records; expected to carry the non-embedding fields
            of the schema below.
        embeddings: One embedding per chunk, in the same order. The schema
            fixes the vector length at 768.
        db_path: Directory of the LanceDB database (created if missing).
        table_name: Name of the table to create.

    Returns:
        The created table, or ``None`` if anything failed (the error is
        printed rather than raised).
    """
    class ConstitutionArticle(LanceModel):
        id: str
        chunk: str
        embedding: Vector(768)
        section_title: str
        chapter_title: str
        # Records built by this notebook always carry page=None, so the
        # column has to be declared nullable.
        page: Optional[int] = None
        source: str

    try:
        # A silent mismatch would pair articles with the wrong vectors.
        if len(chunks) != len(embeddings):
            raise ValueError(
                f"Got {len(chunks)} chunks but {len(embeddings)} embeddings."
            )

        os.makedirs(db_path, exist_ok=True)
        db = lancedb.connect(db_path)

        records = [
            {**chunk, "embedding": vector.tolist()}
            for chunk, vector in zip(chunks, embeddings)
        ]

        table = db.create_table(table_name, data=records, schema=ConstitutionArticle, mode="overwrite")
        return table

    except Exception as e:
        print(f"❌ Error creating LanceDB index for Constitution: {e}")
        return None

In [12]:
def create_lancedb_index_bankruptcy(chunks: List[Dict], embeddings: np.ndarray, db_path: str, table_name: str):
    """Write Bankruptcy Act sections and their embeddings to a LanceDB table.

    The table is created with ``mode="overwrite"``, so an existing table of
    the same name is replaced. The input ``chunks`` are not modified; each
    stored record is a copy of a chunk plus an ``embedding`` list.

    Args:
        chunks: Section records; expected to carry the non-embedding fields
            of the schema below.
        embeddings: One embedding per chunk, in the same order. The schema
            fixes the vector length at 768.
        db_path: Directory of the LanceDB database (created if missing).
        table_name: Name of the table to create.

    Returns:
        The created table.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    class Document1(LanceModel):
        id: str
        chunk: str
        embedding: Vector(768)
        # Headings may be absent and the section extractor always emits
        # page=None, so these columns are declared nullable.
        part_title: Optional[str] = None
        chapter_title: Optional[str] = None
        section_title: Optional[str] = None
        page: Optional[int] = None
        source: str

    # A silent mismatch would pair sections with the wrong vectors.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings."
        )

    os.makedirs(db_path, exist_ok=True)
    db = lancedb.connect(db_path)
    records = [
        {**chunk, "embedding": vector.tolist()}
        for chunk, vector in zip(chunks, embeddings)
    ]
    table = db.create_table(table_name, data=records, schema=Document1, mode="overwrite")
    return table

In [13]:
def create_lancedb_index_company(chunks: List[Dict], embeddings: np.ndarray, db_path: str, table_name: str):
    """Write Companies Act sections and their embeddings to a LanceDB table.

    The table is created with ``mode="overwrite"``, so an existing table of
    the same name is replaced. The input ``chunks`` are not modified; each
    stored record is a copy of a chunk plus an ``embedding`` list.

    Args:
        chunks: Section records; expected to carry the non-embedding fields
            of the schema below.
        embeddings: One embedding per chunk, in the same order. The schema
            fixes the vector length at 768.
        db_path: Directory of the LanceDB database (created if missing).
        table_name: Name of the table to create.

    Returns:
        The created table.

    Raises:
        ValueError: If ``chunks`` and ``embeddings`` differ in length.
    """
    class Document2(LanceModel):
        id: str
        chunk: str
        embedding: Vector(768)
        # A chapter heading may be absent and the section extractor always
        # emits page=None, so these columns are declared nullable.
        section_title: Optional[str] = None
        chapter_title: Optional[str] = None
        page: Optional[int] = None
        source: str

    # A silent mismatch would pair sections with the wrong vectors.
    if len(chunks) != len(embeddings):
        raise ValueError(
            f"Got {len(chunks)} chunks but {len(embeddings)} embeddings."
        )

    os.makedirs(db_path, exist_ok=True)
    db = lancedb.connect(db_path)
    records = [
        {**chunk, "embedding": vector.tolist()}
        for chunk, vector in zip(chunks, embeddings)
    ]
    table = db.create_table(table_name, data=records, schema=Document2, mode="overwrite")
    return table

In [14]:
def load_constitution_articles(file_path: str) -> List[Dict]:
    """Load Constitution articles from a CSV file into chunk records.

    The CSV must have the columns ``article_id`` and ``article_desc``. Both
    values are stripped of surrounding whitespace.

    Args:
        file_path: Path of the CSV file.

    Returns:
        One dict per CSV row with the keys ``id``, ``chunk``,
        ``section_title``, ``chapter_title``, ``page`` and ``source``.
        Returns an empty list if the file is missing, a column is missing, or
        any other error occurs; the error is printed rather than raised.
    """
    try:
        # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading
        # byte-order mark, which would otherwise become part of the first
        # header name and make the "article_id" lookup fail.
        with open(file_path, newline='', encoding='utf-8-sig') as csvfile:
            return [
                {
                    "id": f"constitution_article_{i}",
                    "chunk": row["article_desc"].strip(),
                    "section_title": row["article_id"].strip(),
                    "chapter_title": "Indian Constitution",
                    "page": None,
                    "source": "Indian Constitution"
                }
                for i, row in enumerate(csv.DictReader(csvfile))
            ]
    except FileNotFoundError:
        print(f"❌ CSV file not found: {file_path}")
    except KeyError as ke:
        print(f"❌ Missing column in CSV: {ke}")
    except Exception as e:
        print(f"❌ Error loading Constitution articles: {e}")
    return []

In [None]:
def index_constitution_articles(config: Dict) -> None:
    """Load, embed and index the Indian Constitution articles.

    Reads ``constitution_csv``, ``db_path`` and ``constitution_table`` from
    ``config``. Progress and errors are printed; nothing is raised or
    returned.

    Args:
        config: Pipeline configuration dictionary.
    """
    try:
        chunks = load_constitution_articles(config["constitution_csv"])

        if not chunks:
            print("⚠️ No articles loaded. Skipping indexing.")
            return
        print(f"Extracted {len(chunks)} articles in the Indian Constitution CSV.")

        # The embedder is created only once there is something to embed, so
        # the early exit above does not pay for loading it.
        model = LegalBERTEmbedder()
        embeddings = model.encode([c["chunk"] for c in chunks])
        result = create_lancedb_index_constitution(chunks, embeddings, config["db_path"], config["constitution_table"])

        # The helper signals failure with None; test identity rather than
        # relying on the truthiness of the returned table object.
        if result is not None:
            print("✅ LanceDB index created for Indian Constitution.")
    except Exception as e:
        print(f"❌ Error indexing Constitution articles: {e}")

In [20]:
def index_bankruptcy_act(config: Dict) -> None:
    """Extract, embed and index the sections of the Bankruptcy Act PDF.

    Reads ``bank_pdf``, ``db_path`` and ``bank_table`` from ``config``.
    If no sections are extracted, a warning is printed and no table is
    written, matching how the Constitution indexer treats empty input.

    Args:
        config: Pipeline configuration dictionary.
    """
    pages = load_pdf_text(config["bank_pdf"])
    full_text = "\n".join(pages)
    chunks = extract_sections_bankruptcy_act(full_text, config)
    if not chunks:
        print("⚠️ No sections extracted. Skipping indexing.")
        return
    print(f"Extracted {len(chunks)} semantic sections.")

    # The embedder is created only once there is something to embed.
    model = LegalBERTEmbedder()
    embeddings = model.encode([c["chunk"] for c in chunks])
    create_lancedb_index_bankruptcy(chunks, embeddings, config["db_path"], config["bank_table"])
    print("✅ LanceDB index created for Bankruptcy Act.")

In [21]:
# Companies Act indexing entry point (the pipeline itself is run in the next cell).
def index_company_act(config: Dict) -> None:
    """Extract, embed and index the sections of the Companies Act PDF.

    Reads ``comp_pdf``, ``db_path`` and ``comp_table`` from ``config``.
    If no sections are extracted, a warning is printed and no table is
    written, matching how the Constitution indexer treats empty input.

    Args:
        config: Pipeline configuration dictionary.
    """
    pages = load_pdf_text(config["comp_pdf"])
    full_text = "\n".join(pages)
    chunks = extract_sections_with_meta_comp(full_text, config)
    if not chunks:
        print("⚠️ No sections extracted. Skipping indexing.")
        return
    print(f"Extracted {len(chunks)} semantic sections.")

    # The embedder is created only once there is something to embed.
    model = LegalBERTEmbedder()
    embeddings = model.encode([c["chunk"] for c in chunks])
    create_lancedb_index_company(chunks, embeddings, config["db_path"], config["comp_table"])
    print("✅ LanceDB index created for Company Act.")

In [22]:
# Run the three indexers in order: Bankruptcy Act, Companies Act, Constitution.
for build_index in (index_bankruptcy_act, index_company_act, index_constitution_articles):
    build_index(config)

Extracted 507 semantic sections.
✅ LanceDB index created for Bankruptcy Act.
Extracted 566 semantic sections.
✅ LanceDB index created for Company Act.
📄 Found 454 articles in the Indian Constitution CSV.
✅ LanceDB index created for Indian Constitution.


In [11]:
import lancedb
from legalbert_embedder import LegalBERTEmbedder
import pandas as pd

In [28]:
class LegalRetriever:
    """Embed a query once and run it against one or more LanceDB tables."""

    def __init__(self, top_k: int = 5):
        """Create the embedder and an empty connection cache.

        Args:
            top_k: Maximum rows fetched per table and maximum results
                returned overall.
        """
        self.model = LegalBERTEmbedder()
        self.top_k = top_k
        # Open connections keyed by database path, filled lazily.
        self.dbs = {}

    def _get_table(self, db_path: str, table_name: str):
        """Open ``table_name`` in the database at ``db_path``.

        The connection for each ``db_path`` is created on first use and then
        reused.

        Raises:
            RuntimeError: If connecting or opening the table fails; the
                underlying exception is attached as ``__cause__``.
        """
        if db_path not in self.dbs:
            try:
                self.dbs[db_path] = lancedb.connect(db_path)
            except Exception as e:
                raise RuntimeError(f"Failed to connect to DB at {db_path}: {str(e)}") from e
        try:
            return self.dbs[db_path].open_table(table_name)
        except Exception as e:
            raise RuntimeError(f"Failed to open table '{table_name}' in DB '{db_path}': {str(e)}") from e

    def query_multiple(self, query_text: str, tables: list[dict]) -> list:
        """Search several tables with one query and merge the results.

        Each table is asked for its ``top_k`` nearest rows. The rows are
        concatenated, sorted by ascending ``_distance`` when that column is
        present, and cut to ``top_k`` overall. A table that cannot be queried
        is skipped with a printed warning.

        Args:
            query_text: Text to embed and search for.
            tables: Dicts with the keys ``db_path`` and ``table_name``.

        Returns:
            Up to ``top_k`` dicts with the keys ``id``, ``chunk``,
            ``part_title``, ``chapter_title``, ``section_title``, ``page``,
            ``source`` and ``score`` (the row's ``_distance``). Keys are
            ``None`` when no queried table has that column. Returns an empty
            list when no table produced results.

        Raises:
            RuntimeError: If the query cannot be embedded or the merged
                results cannot be processed.
        """
        try:
            query_vec = self.model.encode([query_text])[0].tolist()
        except Exception as e:
            raise RuntimeError(f"Failed to embed query: {str(e)}") from e

        all_results = []

        for tbl in tables:
            try:
                table = self._get_table(tbl["db_path"], tbl["table_name"])
                df = table.search(query_vec).limit(self.top_k).to_pandas()
                all_results.append(df)
            except Exception as e:
                # .get() keeps the warning itself from raising when the spec
                # is the thing that is malformed (e.g. a missing key).
                print(f"Warning: Failed to query table '{tbl.get('table_name')}' in DB '{tbl.get('db_path')}': {str(e)}")
        if not all_results:
            return []

        try:
            merged_df = pd.concat(all_results, ignore_index=True)
            if "_distance" in merged_df.columns:
                merged_df = merged_df.sort_values(by="_distance", ascending=True)
        except Exception as e:
            raise RuntimeError(f"Failed to process merged results: {str(e)}") from e

        # Truncate before building dicts so only the returned rows are converted.
        top_rows = merged_df.head(self.top_k)
        return [
            {
                "id": row.get("id"),
                "chunk": row.get("chunk"),
                "part_title": row.get("part_title"),
                "chapter_title": row.get("chapter_title"),
                "section_title": row.get("section_title"),
                "page": row.get("page"),
                "source": row.get("source"),
                "score": row.get("_distance")
            }
            for _, row in top_rows.iterrows()
        ]

In [29]:
# Module-level retriever reused by every call to query_legal_documents.
retriever = LegalRetriever(top_k=5)
def query_legal_documents(query: str) -> List[Dict]:
    """Search the configured LanceDB tables for ``query``.

    Only the Companies Act table is currently enabled; the Bankruptcy Act
    and Constitution tables are listed below but commented out.

    Args:
        query: Free-text query passed to the retriever.

    Returns:
        Whatever ``retriever.query_multiple`` returns for the enabled tables.
    """
    db_path = config["db_path"]
    enabled_tables = [
        config["comp_table"],
        # config["bank_table"],
        # config["constitution_table"],
    ]
    targets = [{"db_path": db_path, "table_name": name} for name in enabled_tables]
    return retriever.query_multiple(query, targets)

In [19]:
# Ad-hoc retrieval check: run a natural-language question through the retriever.
query_legal_documents(query = "What are the requirements for corporate insolvency resolution under the Companies Act and the Insolvency and Bankruptcy Code?")

            id                                              chunk  \
0  section_204  section 3 of the \nChartered Accountants Act, ...   
1  section_561  section 197. \n3[(b) where the company— \n(i) ...   
2  section_480  section 10E of the \nCompanies Act, 1956 (1 of...   
3  section_144  section 179 in the ordinary course of its busi...   
4  section_258  section 134 shall disclose the composition of ...   

  section_title                                      chapter_title  page  \
0     section 3                                          CHAPTER X   NaN   
1   section 197                                       CHAPTER XXIX   NaN   
2   section 10E  Chapter VI of the Finance Act, 2017, shall con...   NaN   
3   section 179                                        CHAPTER VII   NaN   
4   section 134                                        CHAPTER XII   NaN   

          source                                          embedding  _distance  
0  Companies Act  [-0.26554003, 0.16485201, 0.1

[{'id': 'section_204',
  'chunk': 'section 3 of the \nChartered Accountants Act, 1949 (38 of 1949), in consultation with and after examination of the \nrecommendations made by the National Financial Reporting Authority: \n                                                           \n1. Subs. by Act 1 of 2018,  s. 43, for “internal financial controls system” (w.e.f. 9-2-2018). \n2. Subs. by notification No. S.O. 2226(E), dated 4th September, 2014 for certain words (w.e.f. 4-9-2014).  \n\n103 \n \nProvided that until any auditing standards are notified, any standard or standards of auditing specified \nby the Institute of Chartered Accountants of India shall be deemed to be the auditing standards. \n(11) The Central Government may, in consultation with the National Financial Reporting Authority, \nby general or special order, direct, in respect of such class or description of companies, as may be specified \nin the order, that the auditor’s report shall also include a statement on such ma

In [20]:
# Ad-hoc retrieval check: use a long proviso-style passage as the query text.
query_legal_documents(query = "Provided that a company in respect of which such appeal or reference or inquiry stands abated under this clause may make reference to the National Company Law Tribunal under the Insolvency and Bankruptcy Code, 2016 within one hundred and eighty days from the commencement of the Insolvency and Bankruptcy Code, 2016 in accordance with the provisions of the Insolvency and Bankruptcy Code, 2016:")

            id                                              chunk  \
0  section_561  section 197. \n3[(b) where the company— \n(i) ...   
1  section_480  section 10E of the \nCompanies Act, 1956 (1 of...   
2   section_20  section 2 of the Company Secretaries Act, 1980...   
3  section_406  section 232 or otherwise, the Tribunal may at ...   
4  section_204  section 3 of the \nChartered Accountants Act, ...   

  section_title                                      chapter_title  page  \
0   section 197                                       CHAPTER XXIX   NaN   
1   section 10E  Chapter VI of the Finance Act, 2017, shall con...   NaN   
2     section 2                                          CHAPTER I   NaN   
3   section 232                                         CHAPTER XX   NaN   
4     section 3                                          CHAPTER X   NaN   

          source                                          embedding  _distance  
0  Companies Act  [-0.2850526, 0.081864506, 0.2

[{'id': 'section_561',
  'chunk': 'section 197. \n3[(b) where the company— \n(i) is a newly incorporated company, for a period of seven years from the date of its \nincorporation, or \n(ii) is a sick company, for whom a scheme of revival or rehabilitation has been ordered by the \nBoard for Industrial and Financial Reconstruction for a period of five years from the date of \nsanction of scheme of revival, or  \n(iii) is a company in relation to which a resolution plan has been approved by the National \nCompany Law Tribunal under the Insolvency and Bankruptcy Code, 2016 (31 of 2016) for a period \nof five years from the date of such approval,  \nit may pay 4[any remuneration to its managerial persons  2[or other directors]].] \n(c) where remuneration of a managerial person 2[or other director] exceeds the limits in Section II but \nthe remuneration has been fixed by the Board for Industrial and Financial Reconstruction or the \nNational Company Law Tribunal: \nProvided that the limits 

In [30]:
# Ad-hoc retrieval check: a short question about share buy-backs.
query_legal_documents(query = "What are the regulations on buying back shares under the Companies Act?")

[{'id': 'section_20',
  'chunk': 'section 2 of the Company Secretaries Act, 1980 (56 of 1980); \n(26) “contributory” means a person liable to contribute towards the assets of the company in the \nevent of its being wound up. \nExplanation.—For the purposes of this clause, it is hereby clarified that a person holding fully  paid-\nup shares in a company shall be considered as a contributory but shall have no liabilities of a contributory \nunder the Act whilst retaining rights of such a contributory; \n(27) “control” shall include the right to appoint majority of the directors or to control the management \nor policy decisions exercisable by a person or persons acting individually or in concert, directly or \nindirectly, including by virtue of their shareholding or management rights or shareholders agreements \nor voting agreements or in any other manner; \n                                                           \n1. Subs. by Act 31 of 2016, s. 255 and the Eleventh Schedule, for clau

In [None]:
import lancedb

# Connect to your LanceDB database directory
db = lancedb.connect("./Data")

# Access the table by name
table = db.open_table("BankruptcyAct")

# Fetch the document by ID
doc_id = "bankruptcy_section_449"  # assuming your ID is stored in a column named "id"
# search() is called without a query so this is a filter-only lookup; passing
# "" would issue a text query, and nothing in this notebook sets up a text
# index or embedding function for the table.
results = table.search().where(f"id = '{doc_id}'").limit(1).to_list()

# Display result
if results:
    print("Document found:")
    print(results[0])
else:
    print("Document not found.")


In [52]:
import lancedb
import pyarrow.compute as pc

# Open the Bankruptcy Act table in the local LanceDB directory
db = lancedb.connect("./Data")
table = db.open_table("BankruptcyAct")

# ID of the record to look up
doc_id = "bankruptcy_section_449"

# Read the whole table as Arrow, then keep only the rows whose id matches
arrow_table = table.to_arrow()
mask = pc.equal(arrow_table["id"], doc_id)
filtered_table = arrow_table.filter(mask)

# Show the field names of the first matching row, if there is one
result = filtered_table.to_pylist()
if not result:
    print("No document found.")
else:
    print(result[0].keys())


dict_keys(['id', 'chunk', 'section_title', 'chapter_title', 'part_title', 'page', 'source', 'embedding'])
