In [4]:
import json
import pandas as pd


# --- Run configuration ---
# NOTE: both paths are raw strings, so the '\\' inside them is kept as two
# literal backslashes (a doubled path separator), not collapsed to one.

# Input: the locally downloaded TriviaQA JSON file.
input_filepath = r'D:\LLM-Hallucination\data\TriviaQA\\triviaqa-rc\qa\web-dev.json'

# Output: the JSON file that receives the sampled prompts.
output_filepath = r'D:\LLM-Hallucination\data\prompts\\triviaqa_prompts.json'

# Sample size: how many rows to draw in this run.
num_samples_to_add = 200




def create_robust_context(search_results, max_results=None):
    """Build a plain-text context string from a list of search results.

    Every usable result contributes one part made of a ``Title:`` line
    followed by a ``Description:`` line; parts are joined with a blank line.

    Args:
        search_results: Expected to be a list of dicts that may carry
            ``'Title'`` and ``'Description'`` keys. ``None``, an empty list,
            or any non-list value produces an empty string.
        max_results: Number of leading results to consider. When ``None``
            (the default) the module-level ``NUM_SEARCH_RESULTS_FOR_CONTEXT``
            is used, which matches the previous hard-wired behaviour.

    Returns:
        The assembled context, or ``""`` when no result has a non-blank
        title or description.
    """
    if not search_results or not isinstance(search_results, list):
        return ""  # Nothing to process

    # Resolve the limit only after the early return, so empty input never
    # depends on the global being defined.
    if max_results is None:
        max_results = NUM_SEARCH_RESULTS_FOR_CONTEXT

    context_parts = []
    for result in search_results[:max_results]:
        # Entries that are not dictionaries carry no usable fields.
        if not isinstance(result, dict):
            continue

        # A key may be present with a None (or otherwise non-string) value;
        # treat that the same as a missing field instead of calling
        # .strip() on it and raising AttributeError.
        title, description = (
            value.strip() if isinstance(value, str) else ""
            for value in (result.get('Title'), result.get('Description'))
        )

        # Only keep the part if the title OR the description has content.
        if title or description:
            context_parts.append(f"Title: {title}\nDescription: {description}")

    return "\n\n".join(context_parts)







def sample_triviaqa_from_local_file(input_file, output_file, num_samples):
    """Sample TriviaQA questions that have usable context and append them to a JSON file.

    Steps:
      1. Load ``input_file`` and read the list stored under its ``'Data'`` key.
      2. Keep only the items for which ``create_robust_context`` returns a
         non-empty string, recording id, question, answer text and context.
      3. Randomly sample ``num_samples`` of them (all of them if fewer are
         available) with a fixed ``random_state`` of 42.
      4. Append the sample to the JSON list in ``output_file`` (created if
         missing) and rewrite that file.

    Args:
        input_file: Path to the local TriviaQA JSON file.
        output_file: Path to the JSON file holding a list of prompt records.
        num_samples: Number of questions to sample.

    Returns:
        None. Progress and problems are reported with ``print``; on an
        unreadable input, an empty filtered set, or an output file whose
        content is not a JSON list, the function returns early without
        writing anything.
    """
    import os  # local import: only needed to create the output directory

    print(f"Loading local TriviaQA data from '{input_file}'...")
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            # The data is inside the 'Data' key
            trivia_data = json.load(f)['Data']
    except (FileNotFoundError, KeyError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
        # TypeError covers a top-level JSON value that is not an object.
        print(f"Error: The file '{input_file}' was not found or has an unexpected format. Details: {e}")
        return

    if not isinstance(trivia_data, list):
        print(f"Error: The file '{input_file}' was not found or has an unexpected format. "
              f"Details: 'Data' is a {type(trivia_data).__name__}, expected a list")
        return

    processed_questions = []
    skipped_malformed = 0
    for item in trivia_data:
        if not isinstance(item, dict):
            skipped_malformed += 1
            continue

        # Step 1: Always try to generate a context from the search results.
        generated_context = create_robust_context(item.get('SearchResults'))

        # Step 2: Only proceed if the generated context is a non-empty string.
        if not generated_context:
            continue

        try:
            record = {
                'id': item['QuestionId'],
                'question': item['Question'],
                'answer_text': item['Answer']['Value'],
                'context': generated_context,
            }
        except (KeyError, TypeError):
            # One incomplete entry should not abort the whole run.
            skipped_malformed += 1
            continue
        processed_questions.append(record)

    print(f"Successfully filtered for quality. Found {len(processed_questions)} questions with usable context.")
    if skipped_malformed:
        print(f"Warning: Skipped {skipped_malformed} malformed entries that were missing required fields.")

    # --- Sampling Logic ---
    if not processed_questions:
        print("Error: After filtering, no valid questions with context were found. Cannot proceed.")
        return

    # A pandas DataFrame gives reproducible sampling via random_state.
    df = pd.DataFrame(processed_questions)

    if len(df) < num_samples:
        print(f"Warning: Requested {num_samples} samples, but only {len(df)} are available after quality filtering.")
        sampled_df = df
    else:
        print(f"Randomly sampling {num_samples} questions...")
        sampled_df = df.sample(n=num_samples, random_state=42)  # random_state for reproducibility

    samples_to_append = sampled_df.to_dict(orient='records')

    # --- Appending Logic ---
    # NOTE: because random_state is fixed, re-running with the same input
    # appends the same sample again; nothing here de-duplicates by id.
    try:
        with open(output_file, 'r', encoding='utf-8') as f:
            existing_data = json.load(f)
    except FileNotFoundError:
        existing_data = []
    except json.JSONDecodeError:
        # Keep the original best-effort behaviour (start from an empty list),
        # but make the replacement of the old content visible.
        print(f"Warning: '{output_file}' did not contain valid JSON; its contents will be replaced.")
        existing_data = []

    if not isinstance(existing_data, list):
        # Extending a non-list would fail; refuse rather than clobber the file.
        print(f"Error: '{output_file}' does not contain a JSON list. Nothing was written.")
        return

    existing_data.extend(samples_to_append)

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(existing_data, f, indent=4)

    print("-" * 20)
    print("Process complete!")
    print(f"The file '{output_file}' now contains a total of {len(existing_data)} entries.")


# Run the sampling step with the paths and sample size configured above.
sample_triviaqa_from_local_file(
    input_file=input_filepath,
    output_file=output_filepath,
    num_samples=num_samples_to_add,
)

Loading local TriviaQA data from 'D:\LLM-Hallucination\data\TriviaQA\\triviaqa-rc\qa\web-dev.json'...
Successfully filtered for quality. Found 9533 questions with usable context.
Randomly sampling 200 questions...
--------------------
Process complete!
The file 'D:\LLM-Hallucination\data\prompts\\triviaqa_prompts.json' now contains a total of 200 entries.
