# Legal RAG Evaluation - Ground Truth Dataset Creation

This notebook creates a ground-truth multiple-choice Q&A dataset for evaluating the legal RAG pipeline.
Focus: data-usage questions generated from large context chunks (2,000+ characters).

In [None]:
import ast
import json
import os
import random
import sys
import textwrap
import time
from typing import Any, Dict, List, Optional

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import AzureOpenAI
from tqdm import tqdm

sys.path.append('../backend')
from legal_rag import LegalRAGBackend

In [None]:
# Azure OpenAI connection settings.
# The API key is read from the environment so that no credential is stored in
# the notebook (or in its version-control history).
# SECURITY: the key that used to be hard-coded here must be treated as
# compromised and rotated.
AZURE_OPENAI_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT", "https://ironclad-openai-001.openai.azure.com/"
)
AZURE_OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY")
AZURE_DEPLOYMENT_NAME = os.environ.get("AZURE_DEPLOYMENT_NAME", "gpt-4o")

# Fail fast with an actionable message instead of an opaque authentication
# error on the first request.
if not AZURE_OPENAI_API_KEY:
    raise RuntimeError(
        "Set the AZURE_OPENAI_API_KEY environment variable before running this notebook."
    )

# Retrieval backend used to query the vector store.
rag_backend = LegalRAGBackend()

# NOTE(review): later cells call `azure_client.chat.completions.create(...)`,
# which is the interface of the `openai` SDK client. Confirm that the
# `AzureOpenAI` class imported from `langchain_openai` exposes it.
azure_client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-05-15",
    azure_endpoint=AZURE_OPENAI_ENDPOINT
)

In [None]:
# Retrieval queries used to pull candidate context chunks from the vector
# store. The order is significant: it determines the order in which chunks are
# collected, and therefore what the seeded random sample selects later on.
DATA_USAGE_QUERIES = [
    # One full natural-language question ...
    "Can we use client data to develop or test new services?",
    # ... followed by keyword-style queries.
    "client data usage for development and testing",
    "PHI data usage restrictions",
    "artificial intelligence machine learning restrictions",
    "data retention requirements timelines",
    "client consent requirements data usage",
    "third-party vendor data processing",
    "human oversight AI decision making",
    "IP ownership rights client data",
    "cloud storage limitations PHI",
    "data sharing restrictions",
    "client data anonymization requirements",
]

# Client account names; each one is used as the client filter for every query.
KEY_CLIENTS = [
    "Aetna Life Insurance Company",
    "Aerotek",
    "1199 SEIU National Benefit Funds",
]

In [None]:
def get_large_context_chunks(
    queries: List[str],
    clients: List[str],
    top_k: int = 10,
    min_chars: int = 2000,
    deduplicate: bool = True,
) -> List[Dict[str, Any]]:
    """Collect long text chunks from the vector store for every query/client pair.

    Each query is run once per client through
    ``rag_backend.query_s3_vector_store``. Hits whose ``metadata['text']`` is
    at least ``min_chars`` characters long are kept.

    Args:
        queries: Query strings to send to the vector store.
        clients: Client account names, passed as ``client_account_filter``.
        top_k: Number of hits requested per query/client pair.
        min_chars: Minimum chunk length. This is measured in characters
            (``len(text)``), not tokens.
        deduplicate: When true, a chunk that is returned by several queries
            for the same client and source is kept only once (the first time
            it is seen), so the same passage cannot be sampled repeatedly
            downstream.

    Returns:
        A list of dicts with the keys ``text``, ``client``, ``source``,
        ``document_type`` and ``query_used``.

    Errors raised while querying or reading a response are printed and the
    affected query/client pair is skipped (best effort).
    """
    contexts = []
    seen = set()

    for query in queries:
        for client in clients:
            try:
                response = rag_backend.query_s3_vector_store(
                    query_text=query,
                    client_account_filter=client,
                    top_k=top_k
                )

                if response and 'vectors' in response:
                    for vector in response['vectors'] or []:
                        # Tolerate hits whose metadata or text is missing/None
                        # so one bad hit does not discard the remaining ones.
                        metadata = vector.get('metadata') or {}
                        text = metadata.get('text') or ''

                        if len(text) < min_chars:
                            continue

                        source = metadata.get('s3_path', 'unknown')
                        if deduplicate:
                            fingerprint = (client, source, text)
                            if fingerprint in seen:
                                continue
                            seen.add(fingerprint)

                        contexts.append({
                            'text': text,
                            'client': client,
                            'source': source,
                            'document_type': metadata.get('document_type', 'unknown'),
                            'query_used': query
                        })

                # Short pause between successful backend calls.
                time.sleep(0.5)

            except Exception as e:
                print(f"Error querying {query} for {client}: {e}")
                continue

    return contexts

In [None]:
def _parse_qa_response(raw_text: str) -> Dict[str, Any]:
    """Parse and validate the model's reply without executing it as code.

    Raises ``ValueError`` (or ``SyntaxError`` from ``ast.literal_eval``) when
    the reply is empty, is not a dictionary literal, or lacks a usable
    ``question`` / ``options`` / ``correct_answer`` structure.
    """
    if not raw_text or not raw_text.strip():
        raise ValueError("model returned an empty response")

    # Keep only the outermost {...} span so that Markdown code fences or
    # stray prose around the dictionary do not break parsing.
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no dictionary found in model response")
    payload = raw_text[start:end + 1]

    # The requested format is valid JSON as well as a Python literal; try JSON
    # first and fall back to a literal-only Python parse. Neither evaluates
    # arbitrary expressions, unlike eval().
    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        parsed = ast.literal_eval(payload)

    if not isinstance(parsed, dict):
        raise ValueError("model response is not a dictionary")

    missing = [key for key in ("question", "options", "correct_answer") if key not in parsed]
    if missing:
        raise ValueError(f"model response is missing keys: {missing}")

    options = parsed["options"]
    if not isinstance(options, dict) or not options:
        raise ValueError("'options' must be a non-empty dictionary")

    answer = parsed["correct_answer"]
    if not isinstance(answer, str) or answer not in options:
        raise ValueError(f"'correct_answer' {answer!r} is not one of the option keys")

    return parsed


def generate_qa_pair(context: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Ask the chat model for one multiple-choice question about a context chunk.

    Args:
        context: A dict with at least the keys ``text``, ``client``,
            ``document_type`` and ``source``. Only the first 3000 characters
            of ``text`` are sent to the model.

    Returns:
        The parsed question dict (``question``, ``options``,
        ``correct_answer`` plus whatever else the model returned) with
        ``source``, ``client`` and ``document_type`` copied from ``context``,
        or ``None`` if the request fails or the reply cannot be parsed and
        validated. Failures are printed rather than raised.

    Raises:
        KeyError: If ``context`` lacks ``client``, ``document_type`` or
            ``text`` (the prompt is built before the guarded section).
    """
    system_prompt = textwrap.dedent("""
        You are an expert creating multiple-choice questions from legal documents.
        Create questions about data usage, privacy, AI/ML restrictions, and related legal topics.
        
        Output format (Python dictionary):
        {
            "question": "Question text",
            "options": {
                "A": "Option A text",
                "B": "Option B text", 
                "C": "Option C text",
                "D": "Option D text"
            },
            "correct_answer": "A/B/C/D",
            "explanation": "Brief explanation"
        }
        
        ONLY output the dictionary.
        """)
    
    human_prompt = f"""
        Create a multiple-choice question based on this legal document text:
        
        Client: {context['client']}
        Document Type: {context['document_type']}
        
        Text:
        {context['text'][:3000]}
        """
    
    # Broad catch is deliberate: this is a best-effort step and callers skip
    # contexts for which None is returned.
    try:
        response = azure_client.chat.completions.create(
            model=AZURE_DEPLOYMENT_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": human_prompt}
            ],
            temperature=0.7
        )
        
        # Model output is untrusted text: parse it as data, never eval() it.
        qa_dict = _parse_qa_response(response.choices[0].message.content)
        
        # Attach provenance so each question can be traced back to its chunk.
        qa_dict['source'] = context['source']
        qa_dict['client'] = context['client']
        qa_dict['document_type'] = context['document_type']
        
        return qa_dict
        
    except Exception as e:
        print(f"Error generating Q&A: {e}")
        return None

In [None]:
def shuffle_options(qa_list: List[Dict], seed: int = 42) -> List[Dict]:
    """Shuffle every question's answer options and relabel them 'A', 'B', 'C', ...

    The entries of ``qa_list`` are modified in place and the same list object
    is returned. ``correct_answer`` is updated to the new label of the option
    that was correct before shuffling.

    Args:
        qa_list: Dicts with an ``options`` mapping (label -> text) and a
            ``correct_answer`` label that is a key of ``options``.
        seed: Seed for the shuffle; the same seed and input give the same
            result.

    Returns:
        ``qa_list`` itself, after modification.

    Raises:
        KeyError: If an entry lacks ``options``/``correct_answer`` or its
            ``correct_answer`` is not one of its option labels.
    """
    # A private generator yields the same sequence as random.seed(seed)
    # followed by random.shuffle, without reseeding the module-level RNG.
    rng = random.Random(seed)

    for qa in qa_list:
        original_options = qa["options"]
        correct_label = qa["correct_answer"]
        if correct_label not in original_options:
            raise KeyError(
                f"correct_answer {correct_label!r} is not one of the option labels "
                f"{list(original_options)}"
            )

        shuffled = list(original_options.items())
        rng.shuffle(shuffled)

        relabelled = {}
        for position, (old_label, option_text) in enumerate(shuffled):
            # Labels are generated from the option count, so no option is
            # dropped when a question has more than four of them.
            new_label = chr(ord("A") + position)
            relabelled[new_label] = option_text
            # Follow the correct option by its original label, not by its
            # text, so options with identical text cannot be confused.
            if old_label == correct_label:
                qa["correct_answer"] = new_label

        qa["options"] = relabelled

    return qa_list

## Step 1: Collect Large Context Chunks

In [None]:
# Retrieve candidate chunks for every query/client combination.
print("Collecting large context chunks for data usage topics...")
contexts = get_large_context_chunks(DATA_USAGE_QUERIES, KEY_CLIENTS, top_k=5)

print(f"Collected {len(contexts)} context chunks")
# Guard the average: with no chunks the division would raise ZeroDivisionError.
if contexts:
    print(f"Average length: {sum(len(c['text']) for c in contexts) / len(contexts):.0f} characters")
else:
    print("No context chunks were collected; check the queries, client names and backend connectivity.")

# Preview the first few chunks as a sanity check.
for i, ctx in enumerate(contexts[:3]):
    print(f"\nContext {i+1}:")
    print(f"Client: {ctx['client']}")
    print(f"Doc Type: {ctx['document_type']}")
    print(f"Length: {len(ctx['text'])} chars")
    print(f"Preview: {ctx['text'][:200]}...")

## Step 2: Generate Q&A Pairs

In [None]:
# Upper bound on the number of Q&A pairs to generate.
TARGET_QA_COUNT = 30
qa_dataset = []

# Fixed seed so the same contexts are selected on every run.
random.seed(42)
selected_contexts = random.sample(contexts, min(TARGET_QA_COUNT, len(contexts)))

# Report what will actually be attempted: fewer than TARGET_QA_COUNT contexts
# may be available.
print(f"Generating up to {len(selected_contexts)} Q&A pairs (target: {TARGET_QA_COUNT})...")

# At most TARGET_QA_COUNT contexts were sampled, so the loop cannot overshoot
# the target and needs no early exit.
for context in tqdm(selected_contexts, desc="Generating Q&A"):
    qa_pair = generate_qa_pair(context)

    if qa_pair:
        options = qa_pair.get('options')
        answer_key = qa_pair.get('correct_answer')
        is_well_formed = (
            'question' in qa_pair
            and isinstance(options, dict)
            and isinstance(answer_key, str)
            and answer_key in options
        )

        if is_well_formed:
            qa_dataset.append(qa_pair)
            print(f"\nQ&A {len(qa_dataset)}:")
            print(f"Q: {qa_pair['question']}")
            print(f"A: {answer_key} - {options[answer_key]}")
        else:
            # A pair whose answer key does not match its options would raise
            # KeyError here and again when the options are shuffled; skip it.
            print("\nSkipping malformed Q&A pair (missing question/options or invalid correct_answer)")

    # Pause between model calls.
    time.sleep(1)

print(f"\nGenerated {len(qa_dataset)} Q&A pairs")

## Step 3: Process and Save Dataset

In [None]:
from collections import Counter

# Randomise the option order of every question with a fixed seed.
# shuffle_options returns the list it was given, so qa_dataset_shuffled and
# qa_dataset refer to the same list object.
qa_dataset_shuffled = shuffle_options(qa_dataset, seed=42)

# Persist the dataset as indented JSON.
with open("qa_list.json", "w") as f:
    f.write(json.dumps(qa_dataset_shuffled, indent=4))

print(f"Saved {len(qa_dataset_shuffled)} Q&A pairs to qa_list.json")

# Per-client and per-document-type counts.
print("\nDataset summary:")
clients = [entry['client'] for entry in qa_dataset_shuffled]
doc_types = [entry['document_type'] for entry in qa_dataset_shuffled]

client_counts = dict(Counter(clients))
doc_type_counts = dict(Counter(doc_types))
print(f"Clients: {client_counts}")
print(f"Document types: {doc_type_counts}")

# Show the first two questions, ticking the correct option.
print("\nSample Q&A:")
for i, qa in enumerate(qa_dataset_shuffled[:2]):
    print(f"\n{i+1}. {qa['question']}")
    for opt, text in qa['options'].items():
        if opt == qa['correct_answer']:
            marker = "✓"
        else:
            marker = " "
        print(f"  {marker} {opt}: {text}")