# Legal RAG Evaluation - Ground Truth Dataset Creation

This notebook creates a ground truth Q&A dataset for evaluating the legal RAG pipeline.
Focus: Data usage questions with large context chunks (2000+ tokens)

In [1]:
import json
import random
import textwrap
from typing import List, Dict, Any
from tqdm import tqdm
import time


from legal_rag import LegalRAGBackend
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [2]:
AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
AZURE_OPENAI_API_KEY = "936856630b764210913d9a8fd6c8212b"
AZURE_DEPLOYMENT_NAME = "gpt-4o"

rag_backend = LegalRAGBackend()


azure_llm = AzureChatOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-05-15",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_DEPLOYMENT_NAME,
    temperature=0.7
)

In [9]:
DATA_USAGE_QUERIES = [
    "Can we use client data to develop or test new services?",
    "client data usage for development and testing",
    "PHI data usage restrictions",
    "artificial intelligence machine learning restrictions",
    "data retention requirements timelines",
    "client consent requirements data usage",
    "third-party vendor data processing",
    "human oversight AI decision making",
    "IP ownership rights client data",
    "cloud storage limitations PHI",
    "data sharing restrictions",
    "client data anonymization requirements"
]

KEY_CLIENTS = rag_backend._load_clients_data()['Client']

In [10]:
CHUNK_SIZE = 1500  # Minimum chunk size for large context

def get_large_context_chunks(queries: List[str], clients: List[str], top_k: int = 10) -> List[Dict[str, Any]]:
    contexts = []
    
    for query in queries:
        for client in clients:
            try:
                response = rag_backend.query_s3_vector_store(
                    query_text=query,
                    client_account_filter=client,
                    top_k=top_k
                )
                
                if response and 'vectors' in response:
                    for vector in response['vectors']:
                        metadata = vector.get('metadata', {})
                        text = metadata.get('text', '')
                        
                        if len(text) >= CHUNK_SIZE:
                            contexts.append({
                                'text': text,
                                'client': client,
                                'source': metadata.get('s3_path', 'unknown'),
                                'document_type': metadata.get('document_type', 'unknown'),
                                'query_used': query
                            })
                            
                time.sleep(0.5)
                
            except Exception as e:
                print(f"Error querying {query} for {client}: {e}")
                continue
    
    return contexts

In [11]:
def escape_curly_braces(text: str) -> str:
    text = str(text)
    return text.replace("{", "{{").replace("}", "}}")


def generate_qa_pair(context: Dict[str, Any]) -> Dict[str, Any]:
    system_prompt = textwrap.dedent(f"""
        You are an expert creating multiple-choice questions from legal documents.
        Create questions about data usage, privacy, AI/ML restrictions, and related legal topics.
        
        Your output should be like the following Python dictionary structure:
        
        {escape_curly_braces({
            "question": "Put here the question text",
            "options": {
                "A": "Option A text",
                "B": "Option B text",
                "C": "Option C text",
                "D": "Option D text"
            },
            "correct_answer": "Give the correct option letter (A, B, C, or D)",
            "explanation": "Give a brief explanation of why this is the correct answer"
        })}
        
        For example:
        
        {escape_curly_braces({
            "question": "What are the data retention requirements for client PHI?",
            "options": {
                "A": "1 year after contract termination",
                "B": "3 years after contract termination",
                "C": "5 years after contract termination",
                "D": "Permanent retention required"
            },
            "correct_answer": "B",
            "explanation": "The contract specifies a 3-year retention period for client PHI after contract termination."
        })}
        
        ONLY output the requested dictionary.
        """)
    
    human_prompt = textwrap.dedent(f"""
        Create a multiple-choice question based on the following legal document text:
        
        Client: {escape_curly_braces(context['client'])}
        Document Type: {escape_curly_braces(context['document_type'])}
        
        Text:
        {escape_curly_braces(context['text'][:3000])}
        """)
    
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", human_prompt),
        ]
    )
    
    try:
        # Create LangChain chain
        chain = prompt | azure_llm
        output = chain.invoke({})
        
        qa_text = output.content.strip()
        qa_dict = eval(qa_text)
        
        qa_dict['source'] = context['source']
        qa_dict['client'] = context['client']
        qa_dict['document_type'] = context['document_type']
        qa_dict['context'] = context['text'][:3000]  # Limit context to first 3000 characters
        
        return qa_dict
        
    except Exception as e:
        print(f"Error generating Q&A: {e}")
        return None

In [13]:
def shuffle_options(qa_list: List[Dict], seed: int = 42) -> List[Dict]:
    random.seed(seed)
    
    for qa in qa_list:
        options = list(qa["options"].items())
        random.shuffle(options)
        
        new_option_keys = ['A', 'B', 'C', 'D']
        new_options = {new_key: value for new_key, (_, value) in zip(new_option_keys, options)}
        
        correct_answer_value = qa["options"][qa["correct_answer"]]
        correct_answer_key = next(new_key for new_key, value in new_options.items() if value == correct_answer_value)
        
        qa["options"] = new_options
        qa["correct_answer"] = correct_answer_key
    
    return qa_list

## Step 1: Collect Large Context Chunks

In [14]:
print("Collecting large context chunks for data usage topics...")
contexts = get_large_context_chunks(DATA_USAGE_QUERIES, KEY_CLIENTS, top_k=5)

print(f"Collected {len(contexts)} context chunks")
print(f"Average length: {sum(len(c['text']) for c in contexts) / len(contexts):.0f} characters")

for i, ctx in enumerate(contexts[:1]):
    print(f"\nContext {i+1}:")
    print(f"Client: {ctx['client']}")
    print(f"Doc Type: {ctx['document_type']}")
    print(f"Length: {len(ctx['text'])} chars")
    print(f"Preview: {ctx['text'][:200]}...")

Collecting large context chunks for data usage topics...
Collected 955 context chunks
Average length: 1846 characters

Context 1:
Client: 1199 SEIU National Benefit Funds
Doc Type: Work Order / Change Request
Length: 1610 chars
Preview: This WO #1 may be executed in two or more counterparts, each of which shall be 
deemed to be an original as against any Party whose signature appears thereon, but all of which 
together shall constitu...


In [15]:
print(ctx['text'])

This WO #1 may be executed in two or more counterparts, each of which shall be 
deemed to be an original as against any Party whose signature appears thereon, but all of which 
together shall constitute one and the same instrument. IN WITNESS WHEREOF, the undersigned, intending to be legally bound, have executed this 
WO #1 as of the date of last signature below. Cotiviti, Inc.   
1199SEIU National Benefit Fund for Health 
and Human Service Employees 
1199SEIU Greater New York Benefit Fund 
1199SEIU National Benefit Fund for Home 
Care Employees 
 
By:       By:  
Name:       Name:  
Title:       Title:  
Date:       Date:  
     
       
       
       
       
       
 
 
 
Angela Scott
Chief of Benefit Operations
12 / 11 / 2023
DocuSign Envelope ID: 4776A02C-397E-465F-82F7-C0CBB6A8A78F
Cheri Moehring
SVP Clinical Chart Validation
December 11, 2023 | 5:25 PM EST
EXHIBIT A  
 
 
 
 
 
 
 
High Level IT Ro les and 
Responsib ilities 
with Deliverables  
 
 
 
 
 
 
 
 
Prepared  for:  

## Step 2: Generate Q&A Pairs

In [16]:
TARGET_QA_COUNT = 200
qa_dataset = []

random.seed(42)
selected_contexts = random.sample(contexts, min(TARGET_QA_COUNT, len(contexts)))

print(f"Generating {TARGET_QA_COUNT} Q&A pairs...")

for i, context in enumerate(tqdm(selected_contexts, desc="Generating Q&A")):
    qa_pair = generate_qa_pair(context)
    
    if qa_pair:
        qa_dataset.append(qa_pair)
        if i%20==0:
            print(f"\nQ&A {len(qa_dataset)}:")
            print(f"Q: {qa_pair['question']}")
            print(f"A: {qa_pair['correct_answer']} - {qa_pair['options'][qa_pair['correct_answer']]}")
    
    time.sleep(1)
    
    if len(qa_dataset) >= TARGET_QA_COUNT:
        break

print(f"\nGenerated {len(qa_dataset)} Q&A pairs")

Generating 200 Q&A pairs...


Generating Q&A:   0%|          | 0/200 [00:00<?, ?it/s]


Q&A 1:
Q: Under what conditions can either party assign their rights and obligations under this agreement?
A: C - Without consent if assigning to a successor of substantially all business assets.


Generating Q&A:  10%|█         | 20/200 [00:41<06:19,  2.11s/it]


Q&A 21:
Q: What is the duration of the confidentiality obligations after the expiration or termination of the agreement?
A: C - 3 years after the Effective Date


Generating Q&A:  20%|██        | 40/200 [01:26<05:56,  2.23s/it]


Q&A 41:
Q: What action must a user take if hardware and software requirements change for electronic disclosures?
A: C - Receive an email notification and have the right to withdraw consent


Generating Q&A:  30%|███       | 60/200 [02:08<04:43,  2.02s/it]


Q&A 61:
Q: What is the definition of "Medically Necessary" under the provisions of this SOW?
A: B - Services or supplies determined to be appropriate and necessary for the symptoms, diagnosis, or treatment of the medical condition.


Generating Q&A:  40%|████      | 80/200 [02:51<04:15,  2.13s/it]


Q&A 81:
Q: What is the consequence of withdrawing consent to receive notices and disclosures electronically?
A: C - Slower transaction speed


Generating Q&A:  50%|█████     | 100/200 [03:34<03:32,  2.12s/it]


Q&A 101:
Q: What restrictions are imposed on the Client regarding the Verisk IP?
A: C - Client cannot distribute or sublicense Verisk IP.


Generating Q&A:  60%|██████    | 120/200 [04:14<02:41,  2.01s/it]


Q&A 121:
Q: What is excluded from Comparative Data according to the General VH Master Services Agreement?
A: B - Protected Health Information


Generating Q&A:  70%|███████   | 140/200 [04:55<02:10,  2.18s/it]


Q&A 141:
Q: What is required for remote computing sessions that involve access to Customer Confidential Information according to the document?
A: B - Access via a secure Virtual Private Network (VPN)


Generating Q&A:  80%|████████  | 160/200 [05:36<01:21,  2.04s/it]


Q&A 161:
Q: What is the data sharing requirement for Licensee under the Agreement?
A: B - Licensee must provide claims and enrollment data semi-annually.


Generating Q&A:  90%|█████████ | 180/200 [06:18<00:42,  2.14s/it]


Q&A 181:
Q: What remedy may be sought if a party breaches the agreement according to the legal document text?
A: B - Specific performance and/or injunctive relief


Generating Q&A: 100%|█████████▉| 199/200 [06:59<00:02,  2.11s/it]


Generated 200 Q&A pairs





## Step 3: Process and Save Dataset

In [17]:
def add_ids_to_qa(qa_list: List[Dict]) -> List[Dict]:
    for i, qa in enumerate(qa_list):
        qa['id'] = f"qa_{i+1}"
    return qa_list


def evaluate_qa_relevance(qa_item: Dict[str, Any]) -> Dict[str, Any]:
    """Evaluate the relevance of the context to the client and query before factuality assessment."""
    
    system_prompt = textwrap.dedent(f"""
        You are an expert at evaluating document quality and relevance.
        Your task is to determine if the provided context contains meaningful legal information relevant to the client and query.
        
        Evaluation criteria:
        1. Context should contain substantive legal information (contracts, policies, agreements)
        2. Content should be relevant to the client mentioned
        3. Context should not be junk data (headers, footers, metadata, gibberish)
        4. Information should be useful for generating meaningful legal questions
        
        Your output should be like the following Python dictionary structure:
        
        {escape_curly_braces({
            "relevance": 1, 
            "critic_explanation": "Explanation of why the context is relevant or not relevant"
        })}
        
        ONLY output the requested dictionary.
        """)
    
    human_prompt = textwrap.dedent(f"""
        Evaluate the relevance of this context for generating legal Q&A:
        
        **Client:** {escape_curly_braces(qa_item.get('client', 'unknown'))}
        **Document Type:** {escape_curly_braces(qa_item.get('document_type', 'unknown'))}
        **Query Used:** {escape_curly_braces(qa_item.get('query_used', 'unknown'))}
        
        **Context:**
        {escape_curly_braces(qa_item.get('context', ''))}
        """)
    
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt),
    ])
    
    try:
        chain = prompt | azure_llm
        output = chain.invoke({})
        evaluation_text = output.content.strip()
        
        import re
        evaluation_text = re.sub(r'\\u[0-9a-fA-F]{4}', '', evaluation_text)
        evaluation_text = re.sub(r'```[a-zA-Z]*\n?', '', evaluation_text)
        evaluation_text = re.sub(r'\s+', ' ', evaluation_text).strip()
        
        dict_match = re.search(r'\{.*\}', evaluation_text, re.DOTALL)
        if dict_match:
            evaluation_text = dict_match.group(0)
        
        evaluation_dict = eval(evaluation_text)
        
        if not isinstance(evaluation_dict, dict) or 'relevance' not in evaluation_dict or 'critic_explanation' not in evaluation_dict:
            raise ValueError("Invalid response format from LLM")
        
        qa_item_evaluated = qa_item.copy()
        qa_item_evaluated['relevance'] = evaluation_dict['relevance']
        qa_item_evaluated['relevance_explanation'] = evaluation_dict['critic_explanation']
        
        return qa_item_evaluated
        
    except Exception as e:
        print(f"Error evaluating Q&A relevance: {e}")
        qa_item_error = qa_item.copy()
        qa_item_error['relevance'] = 0
        qa_item_error['relevance_explanation'] = f"Error during evaluation: {e}"
        return qa_item_error


def evaluate_qa_factuality(qa_item: Dict[str, Any]) -> Dict[str, Any]:
    """Evaluate the factuality of a Q&A pair given its context using an LLM critic."""
    
    system_prompt = textwrap.dedent(f"""
        You are an expert legal document analyst and fact-checker.
        Your task is to evaluate whether a multiple-choice question and its correct answer are factually accurate based on the provided legal document context.
        
        Evaluation criteria:
        1. The question should be answerable from the given context
        2. The correct answer should be directly supported by the context
        3. The answer should not contain information not present in the context
        4. The question should not be ambiguous or misleading
        
        Your output should be like the following Python dictionary structure:
        
        {escape_curly_braces({
            "factuality": 1, 
            "critic_explanation": "Explanation of why the Q&A is factually correct or incorrect based on the context"
        })}
        
        ONLY output the requested dictionary.
        """)
    
    human_prompt = textwrap.dedent(f"""
        Evaluate the factuality of this Q&A pair based on the provided legal document context:
        
        **Context:**
        {escape_curly_braces(qa_item.get('context', ''))}
        
        **Question:**
        {escape_curly_braces(qa_item['question'])}
        
        **Options:**
        {escape_curly_braces(qa_item['options'])}
        
        **Marked Correct Answer:**
        {escape_curly_braces(qa_item['correct_answer'])} - {escape_curly_braces(qa_item['options'][qa_item['correct_answer']])}
        
        **Explanation:**
        {escape_curly_braces(qa_item['explanation'])}
        
        **Client:** {escape_curly_braces(qa_item.get('client', 'unknown'))}
        **Document Type:** {escape_curly_braces(qa_item.get('document_type', 'unknown'))}
        """)
    
    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("human", human_prompt),
    ])
    
    try:
        chain = prompt | azure_llm
        output = chain.invoke({})
        evaluation_text = output.content.strip()
        
        import re
        evaluation_text = re.sub(r'\\u[0-9a-fA-F]{4}', '', evaluation_text)
        evaluation_text = re.sub(r'```[a-zA-Z]*\n?', '', evaluation_text)
        evaluation_text = re.sub(r'\s+', ' ', evaluation_text).strip()
        
        dict_match = re.search(r'\{.*\}', evaluation_text, re.DOTALL)
        if dict_match:
            evaluation_text = dict_match.group(0)
        
        evaluation_dict = eval(evaluation_text)
        
        if not isinstance(evaluation_dict, dict) or 'factuality' not in evaluation_dict or 'critic_explanation' not in evaluation_dict:
            raise ValueError("Invalid response format from LLM")
        
        qa_item_evaluated = qa_item.copy()
        qa_item_evaluated['factuality'] = evaluation_dict['factuality']
        qa_item_evaluated['critic_explanation'] = evaluation_dict['critic_explanation']
        
        return qa_item_evaluated
        
    except Exception as e:
        print(f"Error evaluating Q&A factuality: {e}")
        print(f"Raw LLM response: {output.content[:200] if 'output' in locals() else 'No output captured'}...")
        qa_item_error = qa_item.copy()
        qa_item_error['factuality'] = 0
        qa_item_error['critic_explanation'] = f"Error during evaluation: {e}"
        return qa_item_error

In [18]:
qa_dataset_shuffled = shuffle_options(qa_dataset, seed=42)
qa_dataset_shuffled = add_ids_to_qa(qa_dataset_shuffled)

with open("qa_list.json", "w") as f:
    json.dump(qa_dataset_shuffled, f, indent=4)

print(f"Saved {len(qa_dataset_shuffled)} Q&A pairs to qa_list.json")

print("\nDataset summary:")
clients = [qa['client'] for qa in qa_dataset_shuffled]
doc_types = [qa['document_type'] for qa in qa_dataset_shuffled]

from collections import Counter
print(f"Clients: {dict(Counter(clients))}")
print(f"Document types: {dict(Counter(doc_types))}")

print("\nSample Q&A:")
for i, qa in enumerate(qa_dataset_shuffled[:2]):
    print(f"\n{i+1}. {qa['question']}")
    for opt, text in qa['options'].items():
        marker = "✓" if opt == qa['correct_answer'] else " "
        print(f"  {marker} {opt}: {text}")

Saved 200 Q&A pairs to qa_list.json

Dataset summary:
Clients: {'Ochsner Health System': 8, 'ATRIO Health Plans': 8, 'Accenture': 7, 'Capital Health Plan, Inc.': 9, 'Crumdale Partners - Better Benefits': 12, 'Maryland Physicians Care (MPC)': 13, 'Blue Shield of California': 5, "Martin's Point Health Care": 9, 'ACA Health Benefits Fund': 7, 'Wellth, Inc.': 7, 'Advise Insurance': 5, '1199 SEIU National Benefit Funds': 2, 'Arthur J Gallagher & Co.': 1, '7-Eleven': 12, 'Abarca Health LLC': 10, 'Community First Health Plans': 5, 'Albertsons Companies, Inc.': 9, 'Blue Cross and Blue Shield of Nebraska': 9, 'Boston Benefit Partners': 3, 'AdvantMed': 9, 'Sierra Health Plan of Nevada': 6, 'Self Insured Services Co': 11, 'The Standard': 7, 'Humana': 4, 'ACT.md': 8, 'Inogen': 3, 'Horizon Blue Cross Blue Shield of New Jersey': 2, 'Aetna Life Insurance Company': 4, 'AIA Health Insurance Pty Ltd': 4, 'AxialHealthcare': 1}
Document types: {'unknown': 124, 'NDA': 45, 'MSA': 22, 'Work Order / Change Re

## Step 4: Evaluate Q&A Factuality

In [19]:
print("Evaluating relevance and factuality of generated Q&A pairs...")
evaluated_qa_dataset = []

for i, qa_item in enumerate(tqdm(qa_dataset_shuffled, desc="Evaluating Q&A")):
    # First evaluate relevance
    relevance_evaluated = evaluate_qa_relevance(qa_item)
    
    # Only evaluate factuality if relevant
    if relevance_evaluated['relevance'] == 1:
        final_qa = evaluate_qa_factuality(relevance_evaluated)
    else:
        final_qa = relevance_evaluated
        final_qa['factuality'] = 0
        final_qa['critic_explanation'] = "Skipped factuality check due to low relevance"
    
    evaluated_qa_dataset.append(final_qa)
    
    if i % 20 == 0:
        print(f"\nQ&A {i+1} Evaluation:")
        print(f"Question: {qa_item['question'][:100]}...")
        print(f"Relevance: {final_qa['relevance']}")
        print(f"Factuality: {final_qa['factuality']}")
        print(f"Critic: {final_qa.get('critic_explanation', final_qa.get('relevance_explanation', ''))[:150]}...")
    
    time.sleep(1)

print(f"\nCompleted evaluation for {len(evaluated_qa_dataset)} Q&A pairs")

relevant_count = sum(1 for qa in evaluated_qa_dataset if qa['relevance'] == 1)
factual_count = sum(1 for qa in evaluated_qa_dataset if qa['factuality'] == 1)
print(f"Relevant: {relevant_count}/{len(evaluated_qa_dataset)} ({relevant_count/len(evaluated_qa_dataset)*100:.1f}%)")
print(f"Factually correct: {factual_count}/{len(evaluated_qa_dataset)} ({factual_count/len(evaluated_qa_dataset)*100:.1f}%)")

Evaluating relevance and factuality of generated Q&A pairs...


Evaluating Q&A:   0%|          | 0/200 [00:00<?, ?it/s]


Q&A 1 Evaluation:
Question: Under what conditions can either party assign their rights and obligations under this agreement?...
Relevance: 1
Factuality: 1
Critic: The marked correct answer is factually accurate based on the provided context. The context explicitly states that either party can assign their rights...


Evaluating Q&A:  10%|█         | 20/200 [00:54<08:39,  2.89s/it]


Q&A 21 Evaluation:
Question: What is the duration of the confidentiality obligations after the expiration or termination of the a...
Relevance: 1
Factuality: 1
Critic: The question is answerable from the given context and the correct answer is supported by the context. The document states that the obligations regardi...


Evaluating Q&A:  20%|██        | 40/200 [01:51<08:06,  3.04s/it]


Q&A 41 Evaluation:
Question: What action must a user take if hardware and software requirements change for electronic disclosures...
Relevance: 0
Factuality: 0
Critic: Skipped factuality check due to low relevance...


Evaluating Q&A:  30%|███       | 60/200 [02:46<06:21,  2.73s/it]


Q&A 61 Evaluation:
Question: What is the definition of "Medically Necessary" under the provisions of this SOW?...
Relevance: 1
Factuality: 1
Critic: The correct answer A is factually accurate as it directly aligns with the definition of "Medically Necessary" provided in the context. The context spe...


Evaluating Q&A:  40%|████      | 80/200 [03:43<05:24,  2.70s/it]


Q&A 81 Evaluation:
Question: What is the consequence of withdrawing consent to receive notices and disclosures electronically?...
Relevance: 1
Factuality: 1
Critic: The question is directly answerable from the context provided, and the marked correct answer (A: Slower transaction speed) is explicitly supported by ...


Evaluating Q&A:  50%|█████     | 100/200 [04:40<04:38,  2.78s/it]


Q&A 101 Evaluation:
Question: What restrictions are imposed on the Client regarding the Verisk IP?...
Relevance: 1
Factuality: 1
Critic: The question is answerable from the given context, and the correct answer is directly supported by the context. The document clearly states that the C...


Evaluating Q&A:  60%|██████    | 120/200 [05:36<03:27,  2.59s/it]


Q&A 121 Evaluation:
Question: What is excluded from Comparative Data according to the General VH Master Services Agreement?...
Relevance: 1
Factuality: 1
Critic: The question is directly answerable from the provided context, and the marked correct answer "D - Protected Health Information" is supported by the st...


Evaluating Q&A:  70%|███████   | 140/200 [06:33<02:42,  2.71s/it]


Q&A 141 Evaluation:
Question: What is required for remote computing sessions that involve access to Customer Confidential Informat...
Relevance: 1
Factuality: 1
Critic: The question is directly answerable from the provided context, which specifies that permitted and authorized remote computing sessions involving acces...


Evaluating Q&A:  76%|███████▌  | 151/200 [07:07<02:25,  2.98s/it]

Error evaluating Q&A relevance: unterminated string literal (detected at line 1) (<string>, line 1)


Evaluating Q&A:  80%|████████  | 160/200 [07:35<01:58,  2.96s/it]


Q&A 161 Evaluation:
Question: What is the data sharing requirement for Licensee under the Agreement?...
Relevance: 1
Factuality: 1
Critic: The question is answerable from the given context and the marked correct answer is directly supported by the context. The agreement indeed requires th...


Evaluating Q&A:  90%|█████████ | 180/200 [08:29<00:57,  2.86s/it]


Q&A 181 Evaluation:
Question: What remedy may be sought if a party breaches the agreement according to the legal document text?...
Relevance: 1
Factuality: 1
Critic: The question is answerable from the context and the correct answer, C, is directly supported by the context. The document specifies that specific perf...


Evaluating Q&A: 100%|██████████| 200/200 [09:29<00:00,  2.85s/it]


Completed evaluation for 200 Q&A pairs
Relevant: 181/200 (90.5%)
Factually correct: 178/200 (89.0%)





In [20]:
# filter datasets by relevance and factuality
relevant_qa_dataset = [qa for qa in evaluated_qa_dataset if qa['relevance'] == 1]
factual_qa_dataset = [qa for qa in evaluated_qa_dataset if qa['factuality'] == 1]
relevant_and_factual_qa_dataset = [qa for qa in evaluated_qa_dataset if qa['relevance'] == 1 and qa['factuality'] == 1]

print(f"Filtered datasets:")
print(f"- Relevant: {len(relevant_qa_dataset)} Q&A pairs")
print(f"- Factually correct: {len(factual_qa_dataset)} Q&A pairs") 
print(f"- Both relevant and factual: {len(relevant_and_factual_qa_dataset)} Q&A pairs")

# save all datasets
with open("qa_list_evaluated.json", "w") as f:
    json.dump(evaluated_qa_dataset, f, indent=4)

with open("qa_list_relevant_only.json", "w") as f:
    json.dump(relevant_qa_dataset, f, indent=4)

with open("qa_list_factual_only.json", "w") as f:
    json.dump(factual_qa_dataset, f, indent=4)

with open("qa_list_relevant_and_factual.json", "w") as f:
    json.dump(relevant_and_factual_qa_dataset, f, indent=4)

print(f"\nSaved datasets:")
print(f"- qa_list_evaluated.json: {len(evaluated_qa_dataset)} Q&A pairs (with all scores)")
print(f"- qa_list_relevant_only.json: {len(relevant_qa_dataset)} relevant Q&A pairs")
print(f"- qa_list_factual_only.json: {len(factual_qa_dataset)} factually correct Q&A pairs")
print(f"- qa_list_relevant_and_factual.json: {len(relevant_and_factual_qa_dataset)} high-quality Q&A pairs")

Filtered datasets:
- Relevant: 181 Q&A pairs
- Factually correct: 178 Q&A pairs
- Both relevant and factual: 178 Q&A pairs

Saved datasets:
- qa_list_evaluated.json: 200 Q&A pairs (with all scores)
- qa_list_relevant_only.json: 181 relevant Q&A pairs
- qa_list_factual_only.json: 178 factually correct Q&A pairs
- qa_list_relevant_and_factual.json: 178 high-quality Q&A pairs
