# Indexers

In [None]:
import pandas as pd

def transform_qa_data(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape a raw Q&A frame so each row carries its question/answer as one dict.

    Reads the ``Article Number``, ``Article Content``, ``Question`` and
    ``Answer`` columns of ``df`` and returns a new frame with the columns
    ``article_number``, ``article_content`` and ``qa_pair``. Each ``qa_pair``
    value is a ``{"question": ..., "answer": ...}`` dict built from that row.
    """
    def _as_pair(record: pd.Series) -> dict:
        # One dict per row, keyed the way the downstream cells expect.
        return {"question": record["Question"], "answer": record["Answer"]}

    columns = {
        "article_number": df["Article Number"],
        "article_content": df["Article Content"],
        "qa_pair": df.apply(_as_pair, axis=1),
    }
    return pd.DataFrame(columns)


# Load the raw Q&A CSV and reshape it so each row holds a single qa_pair dict.
doc = transform_qa_data(
    pd.read_csv("Constitution_Qa.csv")
)

In [None]:
# Persist the reshaped frame without the index column. The QA indexing cell
# reads this file back and parses the qa_pair column with ast.literal_eval.
doc.to_csv("Constitution_Qa_transformed.csv", index=False)

In [26]:
import os
import pandas as pd
from typing import List
from tqdm import tqdm

from lancedb.pydantic import LanceModel, Vector
import lancedb

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings
from legalbert_embedder import LegalBERTEmbedder
# === Configuration ===
# Input CSV, LanceDB directory and table name used by the article indexer below.
CSV_FILE = "Indian_Constitution.csv"  # Your new file with article_id and article_desc
DB_PATH = "./Data"
TABLE_NAME = "constitution_articles"
# NOTE(review): read from the environment but not referenced by any code
# visible in this notebook — confirm whether it is still needed.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # Make sure this is set

# === Embedding Setup ===
# Module-level embedder shared by get_embeddings_batch in this cell.
embeddings_model = LegalBERTEmbedder()
# === LanceDB Schema ===
class SimpleArticleSchema(LanceModel):
    """Row schema for the raw-article table: an id, its text and one embedding."""
    article_id: str  # filled from the CSV's article_id column by process_data
    article_desc: str  # filled from the CSV's article_desc column; this is the text that gets embedded
    embedding: Vector(768)  # vector column declared with dimension 768

# === Embedding Function ===
def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """Embed a batch of texts with the module-level ``embeddings_model``.

    Thin wrapper: forwards ``texts`` to ``embeddings_model.encode`` and returns
    its result unchanged.

    NOTE(review): the annotated return type is not enforced here; the concrete
    container depends on ``LegalBERTEmbedder.encode`` — confirm.
    """
    return embeddings_model.encode(texts)

# === Process CSV Data ===
def process_data(csv_file: str) -> List[SimpleArticleSchema]:
    """Load the article CSV and pair every row with its embedding.

    Args:
        csv_file: Path to a CSV with ``article_id`` and ``article_desc`` columns.

    Returns:
        One ``SimpleArticleSchema`` per CSV row, in file order. The embedding
        of each record is computed from that row's ``article_desc``.
    """
    frame = pd.read_csv(csv_file)

    # Embed all descriptions in a single call before building the records.
    descriptions = frame["article_desc"].tolist()
    print("üîÑ Generating embeddings...")
    vectors = get_embeddings_batch(descriptions)

    # iterrows() yields (index label, row); the label is used as the position
    # into ``vectors``, which matches the index of a freshly read CSV.
    return [
        SimpleArticleSchema(
            article_id=entry["article_id"],
            article_desc=entry["article_desc"],
            embedding=vectors[label],
        )
        for label, entry in tqdm(frame.iterrows(), total=len(frame))
    ]

# === LanceDB Insertion ===
def insert_into_lancedb(db_path: str, table_name: str, records: List[SimpleArticleSchema]):
    """Create the article table from ``records`` and index it, unless it already exists.

    Args:
        db_path: Directory of the LanceDB database to connect to.
        table_name: Name of the table to create.
        records: Rows to store; each is converted with ``.dict()`` before insertion.

    Behaviour:
        * If ``table_name`` already exists, the table is left untouched and a
          message saying so is printed. Nothing is written, so no "Inserted"
          message is emitted in that case.
        * Otherwise the table is created from ``records``, a full-text index is
          built over ``article_id``/``article_desc`` and a vector index over
          the ``embedding`` column.
    """
    db = lancedb.connect(db_path)

    if table_name in db.table_names():
        # Nothing is written for an existing table; say so explicitly so the
        # caller does not assume the new records were stored.
        print(
            f"Table '{table_name}' already exists at {db_path}; "
            f"skipped inserting {len(records)} records"
        )
        return

    table = db.create_table(
        table_name,
        data=[record.dict() for record in records],
        schema=SimpleArticleSchema,
        mode="overwrite",
    )
    table.create_fts_index(["article_id", "article_desc"], replace=True)
    # The column must be passed by keyword: the first positional parameter of
    # create_index is the distance metric, not the vector column name.
    table.create_index(vector_column_name="embedding", replace=True)

    print(f"‚úÖ Inserted {len(records)} records into '{table_name}' at {db_path}")

# === Main ===
if __name__ == "__main__":
    # Build the article records (CSV rows plus embeddings), then hand them to
    # the LanceDB writer for the configured database path and table name.
    data = process_data(CSV_FILE)
    insert_into_lancedb(DB_PATH, TABLE_NAME, data)


🔄 Generating embeddings...


100%|██████████| 454/454 [00:00<00:00, 8358.67it/s]

✅ Inserted 454 records into 'constitution_articles' at ./Data





In [None]:
import lancedb
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings
import os

# === Configuration ===
# Directory of the LanceDB database that the indexing cells write to.
DB_PATH = "./Data"

from legalbert_embedder import LegalBERTEmbedder

# Embedder used by embed_query below.
embeddings_model = LegalBERTEmbedder()

# === Connect to LanceDB ===
db = lancedb.connect(DB_PATH)
raw_table = db.open_table("constitution_articles")
# NOTE(review): "constitution_articles_qa" is created by a cell further down
# this notebook, so on a fresh database this call presumably fails until that
# cell has been run — confirm the intended cell order.
qa_table = db.open_table("constitution_articles_qa")

# === Embedding Function ===
def embed_query(query: str):
    """Embed a single query string via ``embeddings_model.embed_query``.

    NOTE(review): every other cell in this notebook calls
    ``embeddings_model.encode``; confirm that ``LegalBERTEmbedder`` also
    exposes ``embed_query`` and that both produce comparable vectors.
    """
    return embeddings_model.embed_query(query)


In [28]:
import pandas as pd
import json
import numpy as np
from typing import List
from tqdm import tqdm
import ast

from lancedb.pydantic import LanceModel, Vector
import lancedb
import pyarrow as pa

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings

# === Configuration ===
# Input CSV (written by the transform cell), LanceDB directory and table name
# used by the QA indexer below.
CSV_FILE = "Constitution_Qa_transformed.csv"       # Replace with your file
DB_PATH = "./Data"
TABLE_NAME = "constitution_articles_qa"
# NOTE(review): ``os`` is not imported in this cell; this line relies on an
# earlier cell having imported it. The key itself is not referenced by any
# code visible in this notebook — confirm whether it is still needed.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")  # Replace with your key

# === Embedding Setup ===
from legalbert_embedder import LegalBERTEmbedder
# Module-level embedder shared by get_embeddings_batch in this cell.
embeddings_model = LegalBERTEmbedder()

# === LanceDB Schema ===
class ArticleSchema(LanceModel):
    """Row schema for the QA table: article fields, a serialized Q/A pair and its embedding."""
    article_number: str  # filled from the CSV's article_number column by process_data
    article_content: str  # filled from the CSV's article_content column by process_data
    qa_pair: str  # JSON string produced with json.dumps from the parsed Q/A dict
    qa_embedding: Vector(768)  # vector column declared with dimension 768; embeds the "Q: ... A: ..." text

# === Embedding Function ===
def get_embeddings_batch(texts: List[str]) -> List[List[float]]:
    """Embed a batch of texts with the module-level ``embeddings_model``.

    Thin wrapper: forwards ``texts`` to ``embeddings_model.encode`` and returns
    its result unchanged.

    NOTE(review): the annotated return type is not enforced here; the concrete
    container depends on ``LegalBERTEmbedder.encode`` — confirm.
    """
    return embeddings_model.encode(texts)

def safe_json_parse(value):
    """Coerce a ``qa_pair`` cell into a dict, falling back to an empty Q/A pair.

    Args:
        value: Either an already-parsed dict, or the string form of a Python
            dict literal (parsed with ``ast.literal_eval``).

    Returns:
        The dict itself, the dict parsed from the string, or
        ``{"question": "", "answer": ""}`` when the value cannot be parsed or
        does not evaluate to a dict. The row is not dropped by this function;
        the caller receives the empty placeholder instead.
    """
    fallback = {"question": "", "answer": ""}

    if isinstance(value, dict):
        return value

    try:
        parsed = ast.literal_eval(value)  # safely parse stringified dicts
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # These are the errors literal_eval raises for malformed or non-string
        # input (e.g. a NaN float coming from an empty CSV cell).
        print(f"‚ö†Ô∏è Skipping row due to parsing error: {e}\nValue: {value}")
        return fallback

    # A valid literal that is not a dict (list, number, string, ...) would break
    # callers that use .get(), so treat it like a parse failure.
    if not isinstance(parsed, dict):
        print(
            f"‚ö†Ô∏è Skipping row due to parsing error: "
            f"expected a dict, got {type(parsed).__name__}\nValue: {value}"
        )
        return fallback

    return parsed

def process_data(csv_file: str) -> List[ArticleSchema]:
    """Load the transformed Q&A CSV and pair every row with its Q/A embedding.

    Args:
        csv_file: Path to a CSV with ``article_number``, ``article_content``
            and ``qa_pair`` columns.

    Returns:
        One ``ArticleSchema`` per CSV row, in file order. ``qa_pair`` is stored
        as a JSON string and ``qa_embedding`` embeds the text
        ``"Q: <question> A: <answer>"`` for that row.
    """
    frame = pd.read_csv(csv_file)

    # Turn the stringified dicts from the CSV back into real dicts.
    frame['qa_pair'] = frame['qa_pair'].apply(safe_json_parse)

    # One "Q: ... A: ..." string per row; missing keys become empty strings.
    qa_texts = [
        f"Q: {pair.get('question', '')} A: {pair.get('answer', '')}"
        for pair in frame['qa_pair']
    ]

    print("üîÑ Embedding texts in batches...")
    vectors = get_embeddings_batch(qa_texts)

    # iterrows() yields (index label, row); the label is used as the position
    # into ``vectors``, which matches the index of a freshly read CSV.
    return [
        ArticleSchema(
            article_number=entry['article_number'],
            article_content=entry['article_content'],
            qa_pair=json.dumps(entry['qa_pair']),  # store as string
            qa_embedding=vectors[label],
        )
        for label, entry in tqdm(frame.iterrows(), total=len(frame))
    ]

# === LanceDB Insertion ===
def insert_into_lancedb(db_path: str, table_name: str, records: List[ArticleSchema]):
    """Create the QA table from ``records`` and index it, unless it already exists.

    Args:
        db_path: Directory of the LanceDB database to connect to.
        table_name: Name of the table to create.
        records: Rows to store; each is converted with ``.dict()`` before insertion.

    Behaviour:
        * If ``table_name`` already exists, the table is left untouched and a
          message saying so is printed. Nothing is written, so no "Inserted"
          message is emitted in that case.
        * Otherwise the table is created from ``records``, a full-text index is
          built over ``article_number``/``article_content`` and a cosine vector
          index over the ``qa_embedding`` column.
    """
    db = lancedb.connect(db_path)

    if table_name in db.table_names():
        # Nothing is written for an existing table; say so explicitly so the
        # caller does not assume the new records were stored.
        print(
            f"Table '{table_name}' already exists at {db_path}; "
            f"skipped inserting {len(records)} records"
        )
        return

    table = db.create_table(
        table_name,
        data=[record.dict() for record in records],
        schema=ArticleSchema,
        mode="overwrite",
    )
    table.create_fts_index(["article_number", "article_content"], replace=True)
    table.create_index(
        vector_column_name="qa_embedding",
        metric="cosine",   # or "l2", "dot", depending on what your use case needs
        replace=True,
    )

    print(f"‚úÖ Inserted {len(records)} records into '{table_name}' at {db_path}")

# === Main ===
if __name__ == "__main__":
    # Build the QA records (CSV rows plus Q/A embeddings), then hand them to
    # the LanceDB writer for the configured database path and table name.
    data = process_data(CSV_FILE)
    insert_into_lancedb(DB_PATH, TABLE_NAME, data)


🔄 Embedding texts in batches...


100%|██████████| 4360/4360 [00:00<00:00, 6368.95it/s]

✅ Inserted 4360 records into 'constitution_articles_qa' at ./Data





# Retrievers

In [37]:
import pandas as pd
import json
import numpy as np
from typing import List
from tqdm import tqdm
import ast

from lancedb.pydantic import LanceModel, Vector
import lancedb
import pyarrow as pa

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.embeddings import Embeddings

# === Configuration ===
# Open both tables written by the indexing cells; the query helpers below read
# raw_table, qa_table and embeddings_model as module-level globals.
DB_PATH = "./Data"
db = lancedb.connect(DB_PATH)
raw_table = db.open_table("constitution_articles")
qa_table = db.open_table("constitution_articles_qa")

from legalbert_embedder import LegalBERTEmbedder
embeddings_model = LegalBERTEmbedder()

In [63]:
def query_raw_articles(query: str, k: int = 3):
    """Print the ``k`` nearest raw articles for ``query``.

    Args:
        query: Free-text query; embedded with the module-level ``embeddings_model``.
        k: Maximum number of hits to retrieve and print (default 3).

    Prints a header followed by, for each hit, the ``article_id`` and the first
    300 characters of ``article_desc``. Returns ``None``.

    NOTE(review): ``encode`` receives a bare string here but a list of strings
    in the indexing cells — confirm it returns a single 768-dim vector.
    """
    embedding = embeddings_model.encode(query)

    print(f"\nüîç Top {k} raw articles matching:\n‚û°Ô∏è {query}\n")

    # limit(k) already caps the number of rows, so no further slicing is done
    # and values of k above 3 are honoured.
    results = raw_table.search(embedding).limit(k).to_pandas()
    for _, row in results.iterrows():
        print(f"üîπ {row['article_id']}")
        print(f"üìù {row['article_desc'][:300]}...\n")


In [64]:
import json

def query_qa_articles(query: str, k: int = 3):
    """Print the ``k`` nearest Q/A entries for ``query``.

    Args:
        query: Free-text query; embedded with the module-level ``embeddings_model``.
        k: Maximum number of hits to retrieve and print (default 3).

    Prints a header followed by, for each hit, the ``article_number``, the
    stored question and the first 300 characters of the stored answer (both
    decoded from the JSON string in ``qa_pair``). Returns ``None``.

    NOTE(review): ``encode`` receives a bare string here but a list of strings
    in the indexing cells — confirm it returns a single 768-dim vector.
    """
    embedding = embeddings_model.encode(query)

    print(f"\nüîç Top {k} QA articles matching:\n‚û°Ô∏è {query}\n")

    # limit(k) already caps the number of rows, so no further slicing is done
    # and values of k above 3 are honoured.
    results = qa_table.search(embedding).limit(k).to_pandas()
    for _, row in results.iterrows():
        qa = json.loads(row["qa_pair"])
        print(f"üîπ {row['article_number']}")
        print(f"Q: {qa.get('question', '')}")
        print(f"A: {qa.get('answer', '')[:300]}...\n")


In [68]:
# Run the same constitution-related query against both retrievers.
query = "President of India"
query_raw_articles(query)
query_qa_articles(query)



🔍 Top 3 raw articles matching:
➡️ President of India

🔹 Article 211 of Indian Constitution
📝 Restriction on discussion in the Legislature No discussion shall take place in the Legislature of a State with respect to the conduct of any Judge of the Supreme Court or of a High Court in the discharge of his duties...

🔹 Article 362 of Indian Constitution
📝 Rights and privileges of Rulers of Indian States Rep by the Constitution (Twenty sixth Amendment) Act, 1971 ,...

🔹 Article 215 of Indian Constitution
📝 High Courts to be courts of record Every High Court shall be a court of record and shall have all the powers of such a court including the power to punish for contempt of itself...


🔍 Top 3 QA articles matching:
➡️ President of India

🔹 Article 170
Q: What is the maximum term for the Legislative Assembly in India?
A: The maximum term for the Legislative Assembly in India is five years from the date appointed for its first meeting, unless dissolved earl

In [66]:
# Run a query unrelated to the constitution (corporate finance) against both
# retrievers to see what they return for out-of-domain input.
query = "How do mergers and acquisitions affect shareholder value?"
query_raw_articles(query)
query_qa_articles(query)


🔍 Top 3 raw articles matching:
➡️ How do mergers and acquisitions affect shareholder value?

🔹 Article 371I of Indian Constitution
📝 Special provision with respect to the State of Goa Notwithstanding anything in this Constitution, the Legislative Assembly of the State of Goa shall consist of not less than thirty members...

🔹 Article 2A of Indian Constitution
📝 Sikkim to be associated with the Union Rep by the Constitution Thirty six Amendment Act, 1975 , Section 5 (w e f 26 04 1975 )...

🔹 Article 329A of Indian Constitution
📝 Special provision as to elections to Parliament in the case of Prime Minister and Speaker Omitted PART XVI SPECIAL PROVISIONS RELATING TO CERTAIN CLASSES...


🔍 Top 3 QA articles matching:
➡️ How do mergers and acquisitions affect shareholder value?

🔹 Article 170
Q: What is the maximum term for the Legislative Assembly in India?
A: The maximum term for the Legislative Assembly in India is five years from the date appointed fo