In [12]:
import json
import csv

# Candidate key names, in priority order, for the question and the reference answer.
_QUESTION_FIELDS = ("user_input", "question", "query", "input", "q")
_ANSWER_FIELDS = ("reference", "answer", "ground_truth", "target", "a")


def _first_present(entry, candidates):
    """Return the value of the first key from *candidates* found in *entry*, else ""."""
    return next((entry[name] for name in candidates if name in entry), "")


def convert_and_save_csv(input_json_path, output_json_path):
    """
    Convert a JSON list of records into a normalized JSON file.

    Each input record is mapped to:
    { "user_input": "...", "reference": "...", "response": "", "retrieved_contexts": "" }

    Despite the name, the output is written as JSON, not CSV.

    Args:
        input_json_path: Path of the JSON file to read. Its top level must be
            a list of objects.
        output_json_path: Path of the JSON file to write (UTF-8, indent=2,
            non-ASCII characters kept as-is).

    Returns:
        None. Progress and errors are reported with ``print``.

    Notes:
        "user_input" is taken from the first key present among user_input,
        question, query, input, q; "reference" from the first key present
        among reference, answer, ground_truth, target, a. A missing key
        yields an empty string. Any failure is printed rather than raised,
        and the output file is not written when the input is malformed.
    """
    try:
        # Read input JSON
        with open(input_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fail with a readable message instead of an opaque KeyError/TypeError
        # when the file does not hold a list of records.
        if not isinstance(data, list):
            raise TypeError(
                "expected a top-level JSON array of objects, "
                f"got {type(data).__name__}"
            )

        # Print first entry to understand its structure
        if data and isinstance(data[0], dict):
            print("First entry keys:", list(data[0].keys()))
            print("Sample data:", data[0])

        # Convert to required format
        formatted_data = []
        for index, entry in enumerate(data):
            # `in` on a str/list entry would do substring/element matching and
            # produce garbage rows, so reject anything that is not an object.
            if not isinstance(entry, dict):
                raise TypeError(
                    f"entry {index} is not a JSON object "
                    f"(got {type(entry).__name__})"
                )
            formatted_data.append({
                "user_input": _first_present(entry, _QUESTION_FIELDS),
                "reference": _first_present(entry, _ANSWER_FIELDS),
                "response": "",
                "retrieved_contexts": "",
            })

        # Print sample of formatted data
        if formatted_data:
            print("First entry after formatting:", formatted_data[0])

        # Write output JSON
        with open(output_json_path, 'w', encoding='utf-8') as f:
            json.dump(formatted_data, f, indent=2, ensure_ascii=False)

        print(f"Successfully converted and saved to {output_json_path}")

    except Exception as e:
        # Deliberate best-effort boundary: callers rely on this function
        # reporting problems instead of raising.
        print(f"Error processing file: {e}")

# Script entry point: run the conversion on the example dataset paths.
if __name__ == "__main__":
    convert_and_save_csv(
        input_json_path="given_dataset/MultiHopRAG.json",
        output_json_path="ragas_ready_data.json",
    )


First entry keys: ['query', 'answer', 'question_type', 'evidence_list']
Sample data: {'query': 'Who is the individual associated with the cryptocurrency industry facing a criminal trial on fraud and conspiracy charges, as reported by both The Verge and TechCrunch, and is accused by prosecutors of committing fraud for personal gain?', 'answer': 'Sam Bankman-Fried', 'question_type': 'inference_query', 'evidence_list': [{'title': 'The FTX trial is bigger than Sam Bankman-Fried', 'author': 'Elizabeth Lopatto', 'url': 'https://www.theverge.com/2023/9/28/23893269/ftx-sam-bankman-fried-trial-evidence-crypto', 'source': 'The Verge', 'category': 'technology', 'published_at': '2023-09-28T12:00:00+00:00', 'fact': 'Before his fall, Bankman-Fried made himself out to be the Good Boy of crypto — the trustworthy face of a sometimes-shady industry.'}, {'title': 'SBF’s trial starts soon, but how did he — and FTX — get here?', 'author': 'Jacquelyn Melinek', 'url': 'https://techcrunch.com/2023/10/01/ftx-l