# Legal RAG Evaluation - Ground Truth Dataset Creation

This notebook creates a ground truth Q&A dataset for evaluating the legal RAG pipeline.
Focus: data-usage questions grounded in large context chunks (2,000+ characters).

In [1]:
import ast
import json
import os
import random
import sys
import textwrap
import time
from collections import Counter
from typing import List, Dict, Any
from typing import Optional

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI
from tqdm import tqdm

from legal_rag import LegalRAGBackend

In [None]:
# Azure OpenAI settings are read from the environment so that no credential is
# stored in the notebook source or in its saved outputs.
# NOTE(review): an API key was previously hard-coded in this cell. Treat that
# key as compromised and rotate it.
AZURE_OPENAI_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT", "https://ironclad-openai-001.openai.azure.com/"
)
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_DEPLOYMENT_NAME = os.environ.get("AZURE_DEPLOYMENT_NAME", "gpt-4o")

if not AZURE_OPENAI_API_KEY:
    # Fail here with an actionable message instead of at the first LLM call.
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set. Export it in the environment "
        "before running this notebook."
    )

# Retrieval backend queried by get_large_context_chunks().
rag_backend = LegalRAGBackend()

# Chat model used by generate_qa_pair() to write the multiple-choice questions.
azure_llm = AzureChatOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-05-15",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_DEPLOYMENT_NAME,
    temperature=0.7
)

In [3]:
# Retrieval queries used to pull candidate chunks from the vector store.
# Each query is issued once per client in KEY_CLIENTS.
DATA_USAGE_QUERIES = [
    # Using client data for development and testing
    "Can we use client data to develop or test new services?",
    "client data usage for development and testing",
    # PHI and AI/ML restrictions
    "PHI data usage restrictions",
    "artificial intelligence machine learning restrictions",
    # Retention, consent and third parties
    "data retention requirements timelines",
    "client consent requirements data usage",
    "third-party vendor data processing",
    # Oversight, ownership and storage
    "human oversight AI decision making",
    "IP ownership rights client data",
    "cloud storage limitations PHI",
    # Sharing and anonymization
    "data sharing restrictions",
    "client data anonymization requirements",
]

# Client accounts passed as the client filter on every retrieval query.
KEY_CLIENTS = [
    "Aetna Life Insurance Company",
    "Aerotek",
    "1199 SEIU National Benefit Funds",
]

In [4]:
def get_large_context_chunks(
    queries: List[str],
    clients: List[str],
    top_k: int = 10,
    min_chars: int = 2000,
    pause_seconds: float = 0.5,
) -> List[Dict[str, Any]]:
    """Collect large, unique text chunks from the vector store.

    Every query is issued once per client. A returned chunk is kept when its
    text is at least ``min_chars`` characters long and the same text has not
    already been collected for that client.

    Args:
        queries: Query strings sent to the vector store.
        clients: Client account names used as the per-query filter.
        top_k: Number of results requested per (query, client) pair.
        min_chars: Minimum chunk length, in characters, for a chunk to be kept.
        pause_seconds: Delay after each successful query; 0 disables it.

    Returns:
        A list of dicts with keys ``text``, ``client``, ``source``,
        ``document_type`` and ``query_used`` (the first query that surfaced
        the chunk).
    """
    contexts: List[Dict[str, Any]] = []
    # Different queries frequently return the same chunk. Without this guard
    # the dataset ends up with several near-identical questions.
    seen = set()

    for query in queries:
        for client in clients:
            try:
                response = rag_backend.query_s3_vector_store(
                    query_text=query,
                    client_account_filter=client,
                    top_k=top_k
                )

                if response and 'vectors' in response:
                    for vector in response['vectors']:
                        # Tolerate results whose metadata or text is missing/None.
                        metadata = vector.get('metadata') or {}
                        text = metadata.get('text') or ''

                        if len(text) < min_chars:
                            continue

                        dedup_key = (client, text)
                        if dedup_key in seen:
                            continue
                        seen.add(dedup_key)

                        contexts.append({
                            'text': text,
                            'client': client,
                            'source': metadata.get('s3_path', 'unknown'),
                            'document_type': metadata.get('document_type', 'unknown'),
                            'query_used': query
                        })

                if pause_seconds > 0:
                    time.sleep(pause_seconds)

            except Exception as e:
                # Best effort: one failing query must not abort the whole sweep.
                print(f"Error querying {query} for {client}: {e}")
                continue

    return contexts

In [5]:
def escape_curly_braces(text: str) -> str:
    """Return ``str(text)`` with every ``{`` and ``}`` doubled.

    Doubling makes the braces literal when the result is embedded in a
    brace-delimited prompt template.
    """
    brace_table = str.maketrans({"{": "{{", "}": "}}"})
    return str(text).translate(brace_table)


def _strip_code_fence(raw: str) -> str:
    """Remove a surrounding Markdown code fence (``` or ```python) if present."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        lines = cleaned.splitlines()[1:]  # drop the opening fence (may carry a language tag)
        if lines and lines[-1].strip().startswith("```"):
            lines = lines[:-1]
        cleaned = "\n".join(lines).strip()
    return cleaned


def _parse_qa_response(raw: str) -> Dict[str, Any]:
    """Parse model output into a validated Q&A dict without executing it.

    Raises:
        ValueError or SyntaxError: if the text is not a JSON object / Python
            dict literal, or the structure is not a usable question.
    """
    cleaned = _strip_code_fence(raw)
    try:
        qa = json.loads(cleaned)
    except ValueError:
        # The prompt asks for a Python dict, so single-quoted output is normal.
        # literal_eval accepts literals only and never runs model-written code.
        qa = ast.literal_eval(cleaned)

    if not isinstance(qa, dict):
        raise ValueError(f"expected a dictionary, got {type(qa).__name__}")

    missing = {"question", "options", "correct_answer"} - set(qa)
    if missing:
        raise ValueError(f"missing keys: {sorted(missing)}")

    options = qa["options"]
    if not isinstance(options, dict) or not options:
        raise ValueError("'options' must be a non-empty dictionary")
    if qa["correct_answer"] not in options:
        raise ValueError(f"correct_answer {qa['correct_answer']!r} is not one of the options")

    return qa


def generate_qa_pair(context: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Ask the LLM for one multiple-choice question about a context chunk.

    Args:
        context: Dict with keys ``text``, ``client``, ``document_type`` and
            ``source``. Only the first 3000 characters of ``text`` are sent.

    Returns:
        The parsed question dict (``question``, ``options``,
        ``correct_answer`` and whatever else the model returned) extended with
        ``source``, ``client`` and ``document_type``; ``None`` if the call
        fails or the reply cannot be parsed and validated.
    """
    # Braces are doubled so the prompt template treats them as literal text
    # rather than as template variables.
    schema_example = escape_curly_braces({
        "question": "Put here the question text",
        "options": {
            "A": "Option A text",
            "B": "Option B text",
            "C": "Option C text",
            "D": "Option D text"
        },
        "correct_answer": "Give the correct option letter (A, B, C, or D)",
        "explanation": "Give a brief explanation of why this is the correct answer"
    })
    worked_example = escape_curly_braces({
        "question": "What are the data retention requirements for client PHI?",
        "options": {
            "A": "1 year after contract termination",
            "B": "3 years after contract termination",
            "C": "5 years after contract termination",
            "D": "Permanent retention required"
        },
        "correct_answer": "B",
        "explanation": "The contract specifies a 3-year retention period for client PHI after contract termination."
    })

    system_prompt = textwrap.dedent(f"""
        You are an expert creating multiple-choice questions from legal documents.
        Create questions about data usage, privacy, AI/ML restrictions, and related legal topics.

        Your output should be like the following Python dictionary structure:

        {schema_example}

        For example:

        {worked_example}

        ONLY output the requested dictionary.
        """)

    # Built line by line instead of with textwrap.dedent: chunk text contains
    # unindented lines, which makes dedent find no common margin and leave the
    # whole prompt indented.
    human_prompt = "\n".join([
        "",
        "Create a multiple-choice question based on the following legal document text:",
        "",
        f"Client: {escape_curly_braces(context['client'])}",
        f"Document Type: {escape_curly_braces(context['document_type'])}",
        "",
        "Text:",
        escape_curly_braces(context['text'][:3000]),
        "",
    ])

    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", human_prompt),
        ]
    )

    try:
        # Create LangChain chain
        chain = prompt | azure_llm
        output = chain.invoke({})

        qa_dict = _parse_qa_response(output.content)

        qa_dict['source'] = context['source']
        qa_dict['client'] = context['client']
        qa_dict['document_type'] = context['document_type']

        return qa_dict

    except Exception as e:
        # Best effort: a failed generation is reported and skipped by the caller.
        print(f"Error generating Q&A: {e}")
        return None

In [6]:
def shuffle_options(qa_list: List[Dict], seed: int = 42) -> List[Dict]:
    """Shuffle each question's options and relabel them A, B, C, ...

    The dicts in ``qa_list`` are updated in place: ``options`` is replaced by
    the relabelled mapping and ``correct_answer`` by the new label of the
    correct option.

    Args:
        qa_list: Question dicts with an ``options`` mapping and a
            ``correct_answer`` key that names one of its entries.
        seed: Seed for the private random generator, so results are
            reproducible.

    Returns:
        The same list object that was passed in.

    Raises:
        KeyError: if a question's ``correct_answer`` is not a key of its
            ``options``.
    """
    # A private generator gives the same order as seeding the module-level
    # one, without resetting global random state for the rest of the notebook.
    rng = random.Random(seed)

    for qa in qa_list:
        original_correct = qa["correct_answer"]
        if original_correct not in qa["options"]:
            raise KeyError(original_correct)

        shuffled = list(qa["options"].items())
        rng.shuffle(shuffled)

        # One label per option, so sets larger than four are not truncated.
        labels = [chr(ord("A") + offset) for offset in range(len(shuffled))]

        new_options = {}
        new_correct = None
        for label, (old_label, option_text) in zip(labels, shuffled):
            new_options[label] = option_text
            # Track the answer by its original key: matching on the text is
            # ambiguous when two options share the same wording.
            if old_label == original_correct:
                new_correct = label

        qa["options"] = new_options
        qa["correct_answer"] = new_correct

    return qa_list

## Step 1: Collect Large Context Chunks

In [7]:
print("Collecting large context chunks for data usage topics...")
contexts = get_large_context_chunks(DATA_USAGE_QUERIES, KEY_CLIENTS, top_k=5)

print(f"Collected {len(contexts)} context chunks")
if not contexts:
    # Stop with an actionable message rather than a ZeroDivisionError in the
    # average below; nothing downstream can run without at least one chunk.
    raise RuntimeError(
        "No context chunks were collected. Check the retrieval backend, "
        "the client filters and the minimum chunk length."
    )
print(f"Average length: {sum(len(c['text']) for c in contexts) / len(contexts):.0f} characters")

# Preview the first chunk only.
for i, ctx in enumerate(contexts[:1]):
    print(f"\nContext {i+1}:")
    print(f"Client: {ctx['client']}")
    print(f"Doc Type: {ctx['document_type']}")
    print(f"Length: {len(ctx['text'])} chars")
    print(f"Preview: {ctx['text'][:200]}...")

Collecting large context chunks for data usage topics...
Collected 7 context chunks
Average length: 2074 characters

Context 1:
Client: Aetna Life Insurance Company
Doc Type: Addendum
Length: 2110 chars
Preview: All systems processing and/or storing Company PHI must have at least an annual 
system risk assessment/security review which provides assurance that administrative, physical, and 
technical controls a...
Collected 7 context chunks
Average length: 2074 characters

Context 1:
Client: Aetna Life Insurance Company
Doc Type: Addendum
Length: 2110 chars
Preview: All systems processing and/or storing Company PHI must have at least an annual 
system risk assessment/security review which provides assurance that administrative, physical, and 
technical controls a...


In [8]:
# Full text of the first collected chunk. Indexing `contexts` directly avoids
# depending on a loop variable left over from another cell.
contexts[0]['text']

'All systems processing and/or storing Company PHI must have at least an annual \nsystem risk assessment/security review which provides assurance that administrative, physical, and \ntechnical controls are funct ioning effectively and providing adequate levels of protection. Reviews should \ninclude vulnerability scanning tools. B. Log Reviews. All systems processing and/or storing Company PHI must have a routine procedure in place \nto review system logs for unauthorize d access. C. Change Control. All systems processing and/or storing Company PHI must have a documented change \ncontrol procedure that ensures separation of duties and protects the confidentiality, integrity and \navailability of data. Business Continuity / Disas ter Recovery Controls  \nA. Emergency Mode Operation Plan. Vendor  must establish a documented plan to enable continuation of \ncritical business processes and protection of the security of electronic Company PHI in the event of an \nemergency. Emergency means 

## Step 2: Generate Q&A Pairs

In [9]:
TARGET_QA_COUNT = 30
qa_dataset = []

# Seeded so the same contexts are selected on every run.
random.seed(42)
selected_contexts = random.sample(contexts, min(TARGET_QA_COUNT, len(contexts)))

# One question is generated per selected context, so the achievable count is
# capped by the number of contexts, not by TARGET_QA_COUNT.
print(f"Generating up to {len(selected_contexts)} Q&A pairs (target: {TARGET_QA_COUNT})...")

for context in tqdm(selected_contexts, desc="Generating Q&A"):
    qa_pair = generate_qa_pair(context)

    if qa_pair:
        answer_key = qa_pair.get('correct_answer')
        options = qa_pair.get('options') or {}

        if 'question' in qa_pair and answer_key in options:
            qa_dataset.append(qa_pair)
            print(f"\nQ&A {len(qa_dataset)}:")
            print(f"Q: {qa_pair['question']}")
            print(f"A: {answer_key} - {options[answer_key]}")
        else:
            # A malformed pair would break option shuffling later on.
            print("\nSkipping malformed Q&A pair (missing question or invalid correct_answer)")

    # Pause between LLM calls.
    time.sleep(1)

print(f"\nGenerated {len(qa_dataset)} Q&A pairs")
if len(qa_dataset) < TARGET_QA_COUNT:
    print(f"Note: below the target of {TARGET_QA_COUNT}; collect more context chunks to reach it.")

Generating 30 Q&A pairs...


Generating Q&A:   0%|          | 0/7 [00:00<?, ?it/s]


Q&A 1:
Q: What is the time frame covered by the Supplemental Criminal Record Conviction Database search according to the Security Addendum?
A: B - 7 years


Generating Q&A:  14%|█▍        | 1/7 [00:02<00:12,  2.12s/it]


Q&A 2:
Q: What is the minimum frequency required for backing up Company PHI according to the document?
A: B - Weekly full backup and monthly offsite storage


Generating Q&A:  29%|██▊       | 2/7 [00:04<00:10,  2.12s/it]


Q&A 3:
Q: What is the minimum requirement for system risk assessment/security reviews for systems processing or storing Company PHI?
A: C - Annual risk assessment/security review


Generating Q&A:  43%|████▎     | 3/7 [00:06<00:08,  2.08s/it]


Q&A 4:
Q: What must be done if attempts to verify an applicant’s educational credentials are unsuccessful?
A: B - Customer refuses to accept an Approved Copy and deems the individual ineligible for assignment.


Generating Q&A:  57%|█████▋    | 4/7 [00:08<00:06,  2.20s/it]


Q&A 5:
Q: What is the duration for which the Supplemental Criminal Record Conviction Database searches past records?
A: C - 7 years


Generating Q&A:  71%|███████▏  | 5/7 [00:10<00:04,  2.11s/it]


Q&A 6:
Q: What is the duration for which the Supplemental Criminal Record Conviction Database searches past criminal records?
A: C - 7 years


Generating Q&A:  86%|████████▌ | 6/7 [00:12<00:02,  2.13s/it]


Q&A 7:
Q: According to the Security Addendum, what should be done if attempts to verify an applicant’s educational credentials are unsuccessful?
A: D - Contact Customer to discuss further exceptions.


Generating Q&A: 100%|██████████| 7/7 [00:15<00:00,  2.16s/it]


Generated 7 Q&A pairs





## Step 3: Process and Save Dataset

In [10]:
# Randomize option order so the correct answer's position carries no signal.
qa_dataset_shuffled = shuffle_options(qa_dataset, seed=42)

# Explicit encoding keeps the output file independent of the platform default.
with open("qa_list.json", "w", encoding="utf-8") as f:
    json.dump(qa_dataset_shuffled, f, indent=4)

print(f"Saved {len(qa_dataset_shuffled)} Q&A pairs to qa_list.json")

print("\nDataset summary:")
clients = [qa['client'] for qa in qa_dataset_shuffled]
doc_types = [qa['document_type'] for qa in qa_dataset_shuffled]

print(f"Clients: {dict(Counter(clients))}")
print(f"Document types: {dict(Counter(doc_types))}")

print("\nSample Q&A:")
for i, qa in enumerate(qa_dataset_shuffled[:2]):
    print(f"\n{i+1}. {qa['question']}")
    for opt, option_text in qa['options'].items():
        marker = "✓" if opt == qa['correct_answer'] else " "
        print(f"  {marker} {opt}: {option_text}")

Saved 7 Q&A pairs to qa_list.json

Dataset summary:
Clients: {'Aerotek': 5, 'Aetna Life Insurance Company': 2}
Document types: {'Security Addendum/Exhibit': 5, 'Addendum': 2}

Sample Q&A:

1. What is the time frame covered by the Supplemental Criminal Record Conviction Database search according to the Security Addendum?
    A: 5 years
  ✓ B: 7 years
    C: Lifetime
    D: 10 years

2. What is the minimum frequency required for backing up Company PHI according to the document?
    A: Quarterly full backup and quarterly offsite storage
    B: Monthly full backup and weekly offsite storage
    C: Daily full backup and daily offsite storage
  ✓ D: Weekly full backup and monthly offsite storage
