# Legal RAG Evaluation - Ground Truth Dataset Creation

This notebook creates a ground truth Q&A dataset for evaluating the legal RAG pipeline.
Focus: Data usage questions with large context chunks (2000+ tokens)

In [1]:
import os
import sys
import json
import random
import textwrap
from typing import List, Dict, Any
from tqdm import tqdm
import time


from legal_rag import LegalRAGBackend
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

In [2]:
AZURE_OPENAI_ENDPOINT = "https://ironclad-openai-001.openai.azure.com/"
AZURE_OPENAI_API_KEY = "936856630b764210913d9a8fd6c8212b"
AZURE_DEPLOYMENT_NAME = "gpt-4o"

rag_backend = LegalRAGBackend()


azure_llm = AzureChatOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2023-05-15",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_DEPLOYMENT_NAME,
    temperature=0.7
)

In [3]:
DATA_USAGE_QUERIES = [
    "Can we use client data to develop or test new services?",
    "client data usage for development and testing",
    "PHI data usage restrictions",
    "artificial intelligence machine learning restrictions",
    "data retention requirements timelines",
    "client consent requirements data usage",
    "third-party vendor data processing",
    "human oversight AI decision making",
    "IP ownership rights client data",
    "cloud storage limitations PHI",
    "data sharing restrictions",
    "client data anonymization requirements"
]

KEY_CLIENTS = ["Aetna Life Insurance Company", "Aerotek", "1199 SEIU National Benefit Funds"]

In [4]:
def get_large_context_chunks(queries: List[str], clients: List[str], top_k: int = 10) -> List[Dict[str, Any]]:
    contexts = []
    
    for query in queries:
        for client in clients:
            try:
                response = rag_backend.query_s3_vector_store(
                    query_text=query,
                    client_account_filter=client,
                    top_k=top_k
                )
                
                if response and 'vectors' in response:
                    for vector in response['vectors']:
                        metadata = vector.get('metadata', {})
                        text = metadata.get('text', '')
                        
                        if len(text) >= 1000:
                            contexts.append({
                                'text': text,
                                'client': client,
                                'source': metadata.get('s3_path', 'unknown'),
                                'document_type': metadata.get('document_type', 'unknown'),
                                'query_used': query
                            })
                            
                time.sleep(0.5)
                
            except Exception as e:
                print(f"Error querying {query} for {client}: {e}")
                continue
    
    return contexts

In [5]:
def escape_curly_braces(text: str) -> str:
    text = str(text)
    return text.replace("{", "{{").replace("}", "}}")


def generate_qa_pair(context: Dict[str, Any]) -> Dict[str, Any]:
    system_prompt = textwrap.dedent(f"""
        You are an expert creating multiple-choice questions from legal documents.
        Create questions about data usage, privacy, AI/ML restrictions, and related legal topics.
        
        Your output should be like the following Python dictionary structure:
        
        {escape_curly_braces({
            "question": "Put here the question text",
            "options": {
                "A": "Option A text",
                "B": "Option B text",
                "C": "Option C text",
                "D": "Option D text"
            },
            "correct_answer": "Give the correct option letter (A, B, C, or D)",
            "explanation": "Give a brief explanation of why this is the correct answer"
        })}
        
        For example:
        
        {escape_curly_braces({
            "question": "What are the data retention requirements for client PHI?",
            "options": {
                "A": "1 year after contract termination",
                "B": "3 years after contract termination",
                "C": "5 years after contract termination",
                "D": "Permanent retention required"
            },
            "correct_answer": "B",
            "explanation": "The contract specifies a 3-year retention period for client PHI after contract termination."
        })}
        
        ONLY output the requested dictionary.
        """)
    
    human_prompt = textwrap.dedent(f"""
        Create a multiple-choice question based on the following legal document text:
        
        Client: {escape_curly_braces(context['client'])}
        Document Type: {escape_curly_braces(context['document_type'])}
        
        Text:
        {escape_curly_braces(context['text'][:3000])}
        """)
    
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", human_prompt),
        ]
    )
    
    try:
        # Create LangChain chain
        chain = prompt | azure_llm
        output = chain.invoke({})
        
        qa_text = output.content.strip()
        qa_dict = eval(qa_text)
        
        qa_dict['source'] = context['source']
        qa_dict['client'] = context['client']
        qa_dict['document_type'] = context['document_type']
        qa_dict['context'] = context['text'][:3000]  # Limit context to first 3000 characters
        
        return qa_dict
        
    except Exception as e:
        print(f"Error generating Q&A: {e}")
        return None

In [6]:
def shuffle_options(qa_list: List[Dict], seed: int = 42) -> List[Dict]:
    random.seed(seed)
    
    for qa in qa_list:
        options = list(qa["options"].items())
        random.shuffle(options)
        
        new_option_keys = ['A', 'B', 'C', 'D']
        new_options = {new_key: value for new_key, (_, value) in zip(new_option_keys, options)}
        
        correct_answer_value = qa["options"][qa["correct_answer"]]
        correct_answer_key = next(new_key for new_key, value in new_options.items() if value == correct_answer_value)
        
        qa["options"] = new_options
        qa["correct_answer"] = correct_answer_key
    
    return qa_list

## Step 1: Collect Large Context Chunks

In [7]:
print("Collecting large context chunks for data usage topics...")
contexts = get_large_context_chunks(DATA_USAGE_QUERIES, KEY_CLIENTS, top_k=5)

print(f"Collected {len(contexts)} context chunks")
print(f"Average length: {sum(len(c['text']) for c in contexts) / len(contexts):.0f} characters")

for i, ctx in enumerate(contexts[:1]):
    print(f"\nContext {i+1}:")
    print(f"Client: {ctx['client']}")
    print(f"Doc Type: {ctx['document_type']}")
    print(f"Length: {len(ctx['text'])} chars")
    print(f"Preview: {ctx['text'][:200]}...")

Collecting large context chunks for data usage topics...
Collected 65 context chunks
Average length: 1590 characters

Context 1:
Client: Aetna Life Insurance Company
Doc Type: Amendment
Length: 1915 chars
Preview: Initially, clinical services program functions shall include those designated below, subject 
to the limitatio ns specified:  
 
Describe Overall Service To Be Provided : Upon written approval from Ae...
Collected 65 context chunks
Average length: 1590 characters

Context 1:
Client: Aetna Life Insurance Company
Doc Type: Amendment
Length: 1915 chars
Preview: Initially, clinical services program functions shall include those designated below, subject 
to the limitatio ns specified:  
 
Describe Overall Service To Be Provided : Upon written approval from Ae...


In [8]:
print(ctx['text'])

Initially, clinical services program functions shall include those designated below, subject 
to the limitatio ns specified:  
 
Describe Overall Service To Be Provided : Upon written approval from Aetna Aetna , 
Supplier  will perform  the following services for Medicare Advantage Membership only:  
 Post service, post payment review of inpatient claims paid under a DRG 
meth odology to check that it was medically appropriate for the patient to be treated 
inpatient rather than outpatient. Supplier  will provide a written notice to the 
provider that explains the reason for change in setting for services audited and will 
provide c linical  and/or CMS payment policy  sourcing to support findings.  Post service, post payment review of Home Health Care claims for 1) qualification 
of services; 2 ) adherence  to medically appropriate protocols and procedures; and 3)  
validation of correct cod ing and billing by the provider. Supplier  will provide a 
written notice to the provider tha

## Step 2: Generate Q&A Pairs

In [19]:
TARGET_QA_COUNT = 20
qa_dataset = []

random.seed(42)
selected_contexts = random.sample(contexts, min(TARGET_QA_COUNT, len(contexts)))

print(f"Generating {TARGET_QA_COUNT} Q&A pairs...")

for i, context in enumerate(tqdm(selected_contexts, desc="Generating Q&A")):
    qa_pair = generate_qa_pair(context)
    
    if qa_pair:
        qa_dataset.append(qa_pair)
        print(f"\nQ&A {len(qa_dataset)}:")
        print(f"Q: {qa_pair['question']}")
        print(f"A: {qa_pair['correct_answer']} - {qa_pair['options'][qa_pair['correct_answer']]}")
    
    time.sleep(1)
    
    if len(qa_dataset) >= TARGET_QA_COUNT:
        break

print(f"\nGenerated {len(qa_dataset)} Q&A pairs")

Generating 20 Q&A pairs...


Generating Q&A:   0%|          | 0/20 [00:00<?, ?it/s]


Q&A 1:
Q: What must the Subcontractor provide to the Business Associate regarding insurance coverage?
A: B - A certificate of insurance evidencing coverage


Generating Q&A:   5%|▌         | 1/20 [00:02<00:40,  2.11s/it]


Q&A 2:
Q: What does the term "Processing" refer to in the context of Verscend Information Assets?
A: B - Any operation performed upon information assets, whether automatic or manual


Generating Q&A:  10%|█         | 2/20 [00:04<00:40,  2.23s/it]


Q&A 3:
Q: What is required for Cotiviti Global Assets to access Client’s Claims or Protected Health Information from outside the U.S.?
A: C - Access must be via a Virtual Private Network (VPN) Citrix tunnel and comply with DSA obligations


Generating Q&A:  15%|█▌        | 3/20 [00:06<00:39,  2.35s/it]


Q&A 4:
Q: What is required of the Subcontractor in terms of insurance coverage under the MSA with Aerotek?
A: B - Subcontractor must maintain insurance at its sole expense, consistent with industry standards.


Generating Q&A:  20%|██        | 4/20 [00:09<00:39,  2.45s/it]


Q&A 5:
Q: What is required of the Subcontractor in terms of insurance coverage according to the MSA with Aerotek?
A: B - Subcontractor must maintain insurance coverage consistent with industry standards at its sole expense.


Generating Q&A:  25%|██▌       | 5/20 [00:12<00:38,  2.59s/it]


Q&A 6:
Q: What security standards must the Vendor comply with when handling Highly Sensitive Information?
A: B - The HITRUST Security control standards or other accepted industry practices


Generating Q&A:  30%|███       | 6/20 [00:14<00:35,  2.56s/it]


Q&A 7:
Q: Under what condition can Verscend suspend the provision of Support Services to the Customer?
A: C - If the Customer has an overdue balance under this Agreement or any other agreement with Verscend.


Generating Q&A:  35%|███▌      | 7/20 [00:18<00:36,  2.82s/it]


Q&A 8:
Q: What security standards must the Vendor comply with when handling Highly Sensitive Information according to the Security Addendum?
A: D - HITRUST Security control standards or other accepted industry practices, and PCI DSS if handling payment cardholder information


Generating Q&A:  40%|████      | 8/20 [00:21<00:33,  2.82s/it]


Q&A 9:
Q: What is the time frame within which the subcontractor must report unauthorized use or disclosure of Protected Health Information?
A: B - Within 24 hours


Generating Q&A:  45%|████▌     | 9/20 [00:23<00:31,  2.83s/it]


Q&A 10:
Q: What is the time period covered by the Supplemental Criminal Record Conviction Database search as mentioned in the Security Addendum/Exhibit?
A: C - 7 years


Generating Q&A:  50%|█████     | 10/20 [00:26<00:26,  2.63s/it]


Q&A 11:
Q: What is HDI required to do with Aetna's confidential information, according to the MSA?
A: C - Keep it confidential unless required by law


Generating Q&A:  55%|█████▌    | 11/20 [00:28<00:23,  2.62s/it]


Q&A 12:
Q: How can a client withdraw their consent to receive future notices and disclosures electronically from Cotiviti?
A: B - By declining to sign a document in DocuSign and selecting the checkbox to withdraw consent


Generating Q&A:  60%|██████    | 12/20 [00:31<00:21,  2.63s/it]


Q&A 13:
Q: What is the required action for the Subcontractor in case of a breach of unsecured Protected Health Information according to the BAA?
A: C - Provide an initial report to Business Associate within 24 hours


Generating Q&A:  65%|██████▌   | 13/20 [00:33<00:17,  2.54s/it]


Q&A 14:
Q: How can a client withdraw their consent to receive electronic notices and disclosures from Cotiviti?
A: A - By declining to sign a document within their DocuSign account and selecting a checkbox


Generating Q&A:  70%|███████   | 14/20 [00:36<00:15,  2.62s/it]


Q&A 15:
Q: What is the procedure if attempts to verify an applicant’s educational credentials are unsuccessful?
A: C - Customer accepts an Approved Copy of the credential if the original was viewed in-person.


Generating Q&A:  75%|███████▌  | 15/20 [00:38<00:12,  2.55s/it]


Q&A 16:
Q: What does the legal document specify about the processing of Verscend Information Assets?
A: B - Subcontractor must process the information assets only as authorized and necessary to perform the Services.


Generating Q&A:  80%|████████  | 16/20 [00:41<00:10,  2.53s/it]


Q&A 17:
Q: What are the minimum recordkeeping requirements for initial examinations by psychologists for Medicaid/NJ FamilyCare patients?
A: D - All of the above must be included


Generating Q&A:  85%|████████▌ | 17/20 [00:43<00:07,  2.54s/it]


Q&A 18:
Q: What is one of the responsibilities of the Business Associate regarding PHI and EPHI security?
A: A - Ensure photocopiers and fax machines do not store any PHI or EPHI.


Generating Q&A:  90%|█████████ | 18/20 [00:46<00:05,  2.64s/it]


Q&A 19:
Q: What is the maximum time allowed for installing critical security patches on systems processing Company PHI?
A: B - 30 days after vendor release


Generating Q&A:  95%|█████████▌| 19/20 [00:48<00:02,  2.44s/it]


Q&A 20:
Q: What is included under the definition of Personal Information according to the MSA?
A: C - All data obtained from a consumer reporting agency and data revealing race, ethnicity, and sexual orientation


Generating Q&A:  95%|█████████▌| 19/20 [00:51<00:02,  2.69s/it]


Generated 20 Q&A pairs





## Step 3: Process and Save Dataset

In [35]:
# create a function to add an id to each of the questions
def add_ids_to_qa(qa_list: List[Dict]) -> List[Dict]:
    for i, qa in enumerate(qa_list):
        qa['id'] = f"qa_{i+1}"
    return qa_list


def evaluate_qa_factuality(qa_item: Dict[str, Any]) -> Dict[str, Any]:
    """Evaluate the factuality of a Q&A pair given its context using an LLM critic."""
    
    system_prompt = textwrap.dedent(f"""
        You are an expert legal document analyst and fact-checker.
        Your task is to evaluate whether a multiple-choice question and its correct answer are factually accurate based on the provided legal document context.
        
        Evaluation criteria:
        1. The question should be answerable from the given context
        2. The correct answer should be directly supported by the context
        3. The answer should not contain information not present in the context
        4. The question should not be ambiguous or misleading
        
        Your output should be like the following Python dictionary structure:
        
        {escape_curly_braces({
            "factuality": 1, 
            "critic_explanation": "Explanation of why the Q&A is factually correct or incorrect based on the context"
        })}
        
        ONLY output the requested dictionary.
        """)
    
    human_prompt = textwrap.dedent(f"""
        Evaluate the factuality of this Q&A pair based on the provided legal document context:
        
        **Context:**
        {escape_curly_braces(qa_item.get('context', ''))}
        
        **Question:**
        {escape_curly_braces(qa_item['question'])}
        
        **Options:**
        {escape_curly_braces(qa_item['options'])}
        
        **Marked Correct Answer:**
        {escape_curly_braces(qa_item['correct_answer'])} - {escape_curly_braces(qa_item['options'][qa_item['correct_answer']])}
        
        **Explanation:**
        {escape_curly_braces(qa_item['explanation'])}
        
        **Client:** {escape_curly_braces(qa_item.get('client', 'unknown'))}
        **Document Type:** {escape_curly_braces(qa_item.get('document_type', 'unknown'))}
        """)
    
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", human_prompt),
        ]
    )
    
    try:
        # Create LangChain chain
        chain = prompt | azure_llm
        output = chain.invoke({})
        
        evaluation_text = output.content.strip()
        
        # Clean unicode escape sequences and other problematic characters
        import re
        # Remove Unicode escape sequences
        evaluation_text = re.sub(r'\\u[0-9a-fA-F]{4}', '', evaluation_text)
        # Remove potential markdown code blocks
        evaluation_text = re.sub(r'```[a-zA-Z]*\n?', '', evaluation_text)
        # Remove extra whitespace and newlines
        evaluation_text = re.sub(r'\s+', ' ', evaluation_text).strip()
        
        # Try to extract dictionary if it's wrapped in other text
        dict_match = re.search(r'\{.*\}', evaluation_text, re.DOTALL)
        if dict_match:
            evaluation_text = dict_match.group(0)
        
        evaluation_dict = eval(evaluation_text)
        
        # Validate that we got the expected keys
        if not isinstance(evaluation_dict, dict) or 'factuality' not in evaluation_dict or 'critic_explanation' not in evaluation_dict:
            raise ValueError("Invalid response format from LLM")
        
        # Add evaluation results to the original qa_item
        qa_item_evaluated = qa_item.copy()
        qa_item_evaluated['factuality'] = evaluation_dict['factuality']
        qa_item_evaluated['critic_explanation'] = evaluation_dict['critic_explanation']
        
        return qa_item_evaluated
        
    except Exception as e:
        print(f"Error evaluating Q&A factuality: {e}")
        print(f"Raw LLM response: {output.content[:200] if 'output' in locals() else 'No output captured'}...")
        # Return original item with error indication
        qa_item_error = qa_item.copy()
        qa_item_error['factuality'] = 0
        qa_item_error['critic_explanation'] = f"Error during evaluation: {e}"
        return qa_item_error

In [36]:
qa_dataset_shuffled = shuffle_options(qa_dataset, seed=42)
qa_dataset_shuffled = add_ids_to_qa(qa_dataset_shuffled)

with open("qa_list.json", "w") as f:
    json.dump(qa_dataset_shuffled, f, indent=4)

print(f"Saved {len(qa_dataset_shuffled)} Q&A pairs to qa_list.json")

print("\nDataset summary:")
clients = [qa['client'] for qa in qa_dataset_shuffled]
doc_types = [qa['document_type'] for qa in qa_dataset_shuffled]

from collections import Counter
print(f"Clients: {dict(Counter(clients))}")
print(f"Document types: {dict(Counter(doc_types))}")

print("\nSample Q&A:")
for i, qa in enumerate(qa_dataset_shuffled[:2]):
    print(f"\n{i+1}. {qa['question']}")
    for opt, text in qa['options'].items():
        marker = "✓" if opt == qa['correct_answer'] else " "
        print(f"  {marker} {opt}: {text}")

Saved 20 Q&A pairs to qa_list.json

Dataset summary:
Clients: {'Aerotek': 12, '1199 SEIU National Benefit Funds': 5, 'Aetna Life Insurance Company': 3}
Document types: {'MSA': 8, 'SOW': 3, 'Security Addendum/Exhibit': 4, 'Amendment': 1, 'BAA': 2, 'Addendum': 2}

Sample Q&A:

1. What must the Subcontractor provide to the Business Associate regarding insurance coverage?
    A: A verbal confirmation of insurance coverage
  ✓ B: A certificate of insurance evidencing coverage
    C: A copy of the insurance policy
    D: A letter from the insurance provider

2. What does the term "Processing" refer to in the context of Verscend Information Assets?
    A: The erasure of sensitive data only
    B: The disclosure of information assets to third parties
  ✓ C: Any operation performed upon information assets, whether automatic or manual
    D: Only the storage of information assets


## Step 4: Evaluate Q&A Factuality

In [37]:
qa_dataset_shuffled

[{'question': 'What must the Subcontractor provide to the Business Associate regarding insurance coverage?',
  'options': {'A': 'A verbal confirmation of insurance coverage',
   'B': 'A certificate of insurance evidencing coverage',
   'C': 'A copy of the insurance policy',
   'D': 'A letter from the insurance provider'},
  'correct_answer': 'B',
  'explanation': 'The document specifies that the Subcontractor must provide a certificate of insurance evidencing coverage to the Business Associate upon request.',
  'source': 's3://ml-legal-restricted/contract-docs/51804/Aerotek-Verscend Master Staffing Agency Agreement v1 03-29-18 client redlined (SBC 06.04.18).docx',
  'client': 'Aerotek',
  'document_type': 'MSA',
  'context': 'Subcontractor shall indemnify and hold harmless Business Associate against, and reimburse Business Associate for, any expense, loss, damages, fees (including reasonable attorney’s fees), costs, claims or liabilities of any kind arising out of or related to any act

In [38]:
print("Evaluating factuality of generated Q&A pairs...")
evaluated_qa_dataset = []

for i, qa_item in enumerate(tqdm(qa_dataset_shuffled, desc="Evaluating factuality")):
    evaluated_qa = evaluate_qa_factuality(qa_item)
    evaluated_qa_dataset.append(evaluated_qa)
    
    print(f"\nQ&A {i+1} Evaluation:")
    print(f"Question: {qa_item['question'][:100]}...")
    print(f"Factuality: {evaluated_qa['factuality']}")
    print(f"Critic: {evaluated_qa['critic_explanation'][:150]}...")
    
    time.sleep(1)  # Rate limiting

print(f"\nCompleted factuality evaluation for {len(evaluated_qa_dataset)} Q&A pairs")

# Show factuality statistics
factual_count = sum(1 for qa in evaluated_qa_dataset if qa['factuality'] == 1)
print(f"Factually correct: {factual_count}/{len(evaluated_qa_dataset)} ({factual_count/len(evaluated_qa_dataset)*100:.1f}%)")

Evaluating factuality of generated Q&A pairs...


Evaluating factuality:   0%|          | 0/20 [00:00<?, ?it/s]


Q&A 1 Evaluation:
Question: What must the Subcontractor provide to the Business Associate regarding insurance coverage?...
Factuality: 1
Critic: The question is directly answerable from the context provided, which specifies that the Subcontractor must provide a certificate of insurance evidenci...


Evaluating factuality:   5%|▌         | 1/20 [00:02<00:42,  2.21s/it]


Q&A 2 Evaluation:
Question: What does the term "Processing" refer to in the context of Verscend Information Assets?...
Factuality: 1
Critic: The question is answerable from the given context, and the correct answer is directly supported by the context. The definition of "Processing" in the ...


Evaluating factuality:  10%|█         | 2/20 [00:04<00:37,  2.06s/it]


Q&A 3 Evaluation:
Question: What is required for Cotiviti Global Assets to access Client’s Claims or Protected Health Informatio...
Factuality: 1
Critic: The question is answerable from the given context, and the correct answer is directly supported by the context provided. The document specifies that C...


Evaluating factuality:  15%|█▌        | 3/20 [00:06<00:37,  2.19s/it]


Q&A 4 Evaluation:
Question: What is required of the Subcontractor in terms of insurance coverage under the MSA with Aerotek?...
Factuality: 1
Critic: The question is answerable from the given context, and the marked correct answer is directly supported by the context. The context explicitly states t...


Evaluating factuality:  20%|██        | 4/20 [00:08<00:34,  2.15s/it]


Q&A 5 Evaluation:
Question: What is required of the Subcontractor in terms of insurance coverage according to the MSA with Aerot...
Factuality: 1
Critic: The question is answerable from the given context, and the correct answer is directly supported by the context. The context explicitly states that the...


Evaluating factuality:  25%|██▌       | 5/20 [00:11<00:33,  2.26s/it]


Q&A 6 Evaluation:
Question: What security standards must the Vendor comply with when handling Highly Sensitive Information?...
Factuality: 1
Critic: The question is directly answerable from the given context. The context specifies that the Vendor must implement safeguards no less rigorous than acce...


Evaluating factuality:  30%|███       | 6/20 [00:13<00:31,  2.28s/it]


Q&A 7 Evaluation:
Question: Under what condition can Verscend suspend the provision of Support Services to the Customer?...
Factuality: 1
Critic: The question is directly answerable from the context, which states that Verscend may suspend Support Services if the Customer has an overdue balance u...


Evaluating factuality:  35%|███▌      | 7/20 [00:15<00:28,  2.20s/it]


Q&A 8 Evaluation:
Question: What security standards must the Vendor comply with when handling Highly Sensitive Information accor...
Factuality: 1
Critic: The question is answerable from the given context, and the correct answer is directly supported by the context. The context clearly states that the Ve...


Evaluating factuality:  40%|████      | 8/20 [00:17<00:26,  2.21s/it]


Q&A 9 Evaluation:
Question: What is the time frame within which the subcontractor must report unauthorized use or disclosure of ...
Factuality: 1
Critic: The question is directly answerable from the context, which states that the subcontractor must report any unauthorized use or disclosure of Protected ...


Evaluating factuality:  45%|████▌     | 9/20 [00:19<00:23,  2.17s/it]


Q&A 10 Evaluation:
Question: What is the time period covered by the Supplemental Criminal Record Conviction Database search as me...
Factuality: 1
Critic: Explanation of why the Q&A is factually correct or incorrect based on the context. The context states that the Supplemental Criminal Record Conviction...


Evaluating factuality:  50%|█████     | 10/20 [00:21<00:21,  2.20s/it]


Q&A 11 Evaluation:
Question: What is HDI required to do with Aetna's confidential information, according to the MSA?...
Factuality: 1
Critic: The question is answerable from the context provided, and the correct answer is directly supported by the context. The context explicitly states that ...


Evaluating factuality:  55%|█████▌    | 11/20 [00:24<00:19,  2.19s/it]


Q&A 12 Evaluation:
Question: How can a client withdraw their consent to receive future notices and disclosures electronically fro...
Factuality: 1
Critic: The question is answerable from the context, and the correct answer C is directly supported by the context. The context specifies that consent can be ...


Evaluating factuality:  60%|██████    | 12/20 [00:26<00:17,  2.21s/it]


Q&A 13 Evaluation:
Question: What is the required action for the Subcontractor in case of a breach of unsecured Protected Health ...
Factuality: 1
Critic: The question is answerable from the given context, and the correct answer is directly supported by the context. The context states that the Subcontrac...


Evaluating factuality:  65%|██████▌   | 13/20 [00:29<00:16,  2.37s/it]


Q&A 14 Evaluation:
Question: How can a client withdraw their consent to receive electronic notices and disclosures from Cotiviti?...
Factuality: 1
Critic: The correct answer is factually accurate according to the context provided. The context specifies that a client can withdraw consent by declining to s...


Evaluating factuality:  70%|███████   | 14/20 [00:32<00:15,  2.66s/it]


Q&A 15 Evaluation:
Question: What is the procedure if attempts to verify an applicant’s educational credentials are unsuccessful?...
Factuality: 1
Critic: The correct answer "D" is factually accurate based on the context. The document specifies that if verification attempts are unsuccessful, the Customer...


Evaluating factuality:  75%|███████▌  | 15/20 [00:34<00:12,  2.43s/it]


Q&A 16 Evaluation:
Question: What does the legal document specify about the processing of Verscend Information Assets?...
Factuality: 1
Critic: The correct answer B is factually accurate based on the context provided. The document specifies that the Subcontractor shall process Verscend Informa...


Evaluating factuality:  80%|████████  | 16/20 [00:36<00:08,  2.24s/it]


Q&A 17 Evaluation:
Question: What are the minimum recordkeeping requirements for initial examinations by psychologists for Medica...
Factuality: 1
Critic: The question is answerable from the given context, and the marked correct answer (D) is directly supported by the context. The context lists all the c...


Evaluating factuality:  85%|████████▌ | 17/20 [00:38<00:07,  2.41s/it]


Q&A 18 Evaluation:
Question: What is one of the responsibilities of the Business Associate regarding PHI and EPHI security?...
Factuality: 0
Critic: The marked correct answer is incorrect based on the context. The context specifies that the Business Associate must implement and use safeguards to pr...


Evaluating factuality:  90%|█████████ | 18/20 [00:41<00:05,  2.51s/it]


Q&A 19 Evaluation:
Question: What is the maximum time allowed for installing critical security patches on systems processing Comp...
Factuality: 1
Critic: The question is directly answerable from the context, which states that all applicable patches must be installed within thirty (30) days of vendor rel...


Evaluating factuality:  95%|█████████▌| 19/20 [00:43<00:02,  2.40s/it]


Q&A 20 Evaluation:
Question: What is included under the definition of Personal Information according to the MSA?...
Factuality: 1
Critic: The question is answerable from the given context, and the marked correct answer C is directly supported by the provided context. The context explicit...


Evaluating factuality: 100%|██████████| 20/20 [00:46<00:00,  2.30s/it]


Completed factuality evaluation for 20 Q&A pairs
Factually correct: 19/20 (95.0%)





In [39]:
# Filter only factually correct Q&A pairs
factual_qa_dataset = [qa for qa in evaluated_qa_dataset if qa['factuality'] == 1]

print(f"Filtered to {len(factual_qa_dataset)} factually correct Q&A pairs")

# Save both full and filtered datasets
with open("qa_list_evaluated.json", "w") as f:
    json.dump(evaluated_qa_dataset, f, indent=4)

with open("qa_list_factual_only.json", "w") as f:
    json.dump(factual_qa_dataset, f, indent=4)

print("\nSaved datasets:")
print(f"- qa_list_evaluated.json: {len(evaluated_qa_dataset)} Q&A pairs (with factuality scores)")
print(f"- qa_list_factual_only.json: {len(factual_qa_dataset)} factually correct Q&A pairs")

# Show examples of factual and non-factual items
print("\n=== Examples ===")
if factual_qa_dataset:
    print("\nFactually correct example:")
    example = factual_qa_dataset[0]
    print(f"Q: {example['question']}")
    print(f"A: {example['correct_answer']} - {example['options'][example['correct_answer']]}")
    print(f"Critic: {example['critic_explanation']}")

non_factual = [qa for qa in evaluated_qa_dataset if qa['factuality'] == 0]
if non_factual:
    print("\nNon-factual example:")
    example = non_factual[0]
    print(f"Q: {example['question']}")
    print(f"A: {example['correct_answer']} - {example['options'][example['correct_answer']]}")
    print(f"Critic: {example['critic_explanation']}")

Filtered to 19 factually correct Q&A pairs

Saved datasets:
- qa_list_evaluated.json: 20 Q&A pairs (with factuality scores)
- qa_list_factual_only.json: 19 factually correct Q&A pairs

=== Examples ===

Factually correct example:
Q: What must the Subcontractor provide to the Business Associate regarding insurance coverage?
A: B - A certificate of insurance evidencing coverage
Critic: The question is directly answerable from the context provided, which specifies that the Subcontractor must provide a certificate of insurance evidencing coverage to the Business Associate upon request. The marked correct answer, option B, is supported by the context, and no additional information is included in the answer beyond what is stated in the document.

Non-factual example:
Q: What is one of the responsibilities of the Business Associate regarding PHI and EPHI security?
A: B - Ensure photocopiers and fax machines do not store any PHI or EPHI.
Critic: The marked correct answer is incorrect based on 