In [14]:
import json
# Read the cached short-answer ratings from disk.
with open("./shortanswer_ratings_cache.json", 'r', encoding='utf-8') as f:
    sa_cache = json.load(f)

# Reduce the cache to one record per question id. The first entry seen for a
# given qid supplies its question text and correct answer; later entries with
# the same qid are ignored.
sa_qdict = {}
for qaid, qdata in sa_cache.items():
    qid = qdata['qid']
    if qid in sa_qdict:
        continue
    sa_qdict[qid] = {
        "question": qdata['question'],
        "correct_answer": qdata['correct_answer'],
    }

In [4]:
# NOTE: the variable is named `claude_results`, but the file loaded here is
# the grok-3-latest phase-1 compilation.
with open("./compiled_results_sqa/grok-3-latest_phase1_compiled.json", 'r', encoding='utf-8') as f:
    claude_results = json.load(f)

# Sanity check: report every compiled result whose id is absent from sa_qdict.
for qid in claude_results["results"].keys():
    if qid in sa_qdict:
        continue
    print(f"Warning: {qid} not found in sa_qdict")


In [None]:
import re
from base_game_class import BaseGameClass

judge_model = "claude-opus-4-20250514"
sa_to_mc_file = "./SimpleMC.jsonl"
# Upper bound on generation attempts per question, so a judge that keeps
# returning malformed output cannot stall the run forever.
MAX_DISTRACTOR_ATTEMPTS = 10

judge = BaseGameClass(subject_id=None, subject_name=judge_model, is_human_player=False, log_dir=None)
sysprompt=""
prompt = """I need your help turning a short-answer quiz into a multiple-choice quiz. I'm going to show you a question and its correct answer, and I want you to generate three distractors. 
Each distractor should be a plausible answer that is NOT the correct answer. Each should be the same \"type\" of answer as the correct answer (e.g., date, person name, number, etc), and follow the format of the correct answer.
Output each distractor a separate line, and do not include any other text. Your entire response should be just the distractors, one per line.
Here is the question and correct answer:

Question: {question}

Correct Answer: {correct_answer}

Distractors:
"""


def parse_distractors(resp, correct_answer):
    """Turn the judge's raw reply into a list of exactly three distractors.

    Args:
        resp: Raw text returned by the judge (may be None or empty).
        correct_answer: The correct answer the distractors must differ from.

    Returns:
        A list of three whitespace-stripped distractor strings, or None when
        the reply is unusable: it is empty, does not contain exactly three
        non-blank lines, repeats a distractor, or repeats the correct answer
        (comparisons are case-insensitive).
    """
    if not resp:
        return None
    # Strip the whole reply first so a trailing newline does not produce a
    # spurious empty entry, then drop any blank lines that remain.
    lines = [line.strip() for line in re.split(r'\n+', resp.strip())]
    lines = [line for line in lines if line]
    if len(lines) != 3:
        return None
    # Three distractors plus the correct answer must give four distinct values.
    distinct = {line.upper() for line in lines} | {correct_answer.upper()}
    if len(distinct) != 4:
        return None
    return lines


# The context manager guarantees the output file is closed even if the judge
# call raises part-way through the run. Append mode is kept from the original.
with open(sa_to_mc_file, 'a', encoding='utf-8') as fout:
    for ctr, (qid, qdata) in enumerate(sa_qdict.items()):
        question = qdata['question']
        correct_answer = qdata['correct_answer'].strip()
        print(f"Processing question {ctr+1}, ID: {qid}")

        ans_list = None
        for attempt in range(MAX_DISTRACTOR_ATTEMPTS):
            resp, _, _ = judge._get_llm_answer(options=None, q_text=prompt.format(question=question, correct_answer=correct_answer), message_history=[], keep_appending=False, setup_text=sysprompt, MAX_TOKENS=None, temp=1.0)
            ans_list = parse_distractors(resp, correct_answer)
            if ans_list is not None:
                break
            print(f"Invalid response format for question {qid}. Retrying...")

        if ans_list is None:
            # Skip rather than loop forever; the message identifies the question.
            print(f"Giving up on question {qid} after {MAX_DISTRACTOR_ATTEMPTS} attempts; skipping.")
            continue

        fout.write(json.dumps({"qid": qid, "question": question, "correct_answer": correct_answer, "distractors": ans_list}, ensure_ascii=False) + "\n")
        # Flush after every record so completed work survives an interruption.
        fout.flush()


Provider: Anthropic
Processing question 499, ID: sqa_test_2469
In model_call, provider=Anthropic, attempt=1
Processing question 500, ID: sqa_test_1573
In model_call, provider=Anthropic, attempt=1


In [27]:
from load_and_format_datasets import load_and_format_dataset
# Load the formatted SimpleMC question list.
qs = load_and_format_dataset("SimpleMC")


Attempting to load SimpleMC...
Dataset loaded successfully.
Attempting to load SimpleQA (test split)...
Dataset loaded successfully.
Formatting 4326 questions...
Successfully formatted 4326 unique questions from SimpleQA.
Formatting 500 questions...
Successfully formatted 500 unique questions from SimpleMC.


In [28]:
# Find the first SimpleMC entry with this exact question text and print its
# index and full record; nothing is printed when no entry matches.
for i, q in enumerate(qs):
    if q['question'] != 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?':
        continue
    print(f"Found question at index {i}: {q}")
    break

Found question at index 36: {'id': 'sqa_test_479', 'question': 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?', 'options': {'A': 'Sabre', 'B': 'Foil', 'C': 'Rapier', 'D': 'Épée'}, 'correct_answer': 'A', 'answer_type': 'Person', 'topic': 'Politics'}


In [30]:
# Load the formatted SimpleQA question list.
sqa = load_and_format_dataset("SimpleQA")

Attempting to load SimpleQA (test split)...
Dataset loaded successfully.
Formatting 4326 questions...
Successfully formatted 4326 unique questions from SimpleQA.


In [31]:
# Find the first SimpleQA entry with this exact question text and print its
# index and full record; nothing is printed when no entry matches.
for i, q in enumerate(sqa):
    if q['question'] != 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?':
        continue
    print(f"Found question at index {i}: {q}")
    break

Found question at index 1500: {'id': 'sqa_test_789', 'question': 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?', 'correct_answer': 'Sabre', 'answer_type': 'Other', 'topic': 'Sports'}


In [None]:
import hashlib
import os
import json
def text_to_id(text):
    """Derive a deterministic id from a question's text.

    Args:
        text: The question text (a str).

    Returns:
        The prefix "sqa_test_" followed by the hexadecimal SHA-256 digest of
        the UTF-8 encoded text.
    """
    encoded = text.encode('utf-8')
    digest_hex = hashlib.sha256(encoded).hexdigest()
    return "sqa_test_" + digest_hex

## Re-key the delegate game logs in place: every "id" in "phase1_questions"
## and "phase2_questions" becomes an id derived from that entry's "question"
## text, and every "question_id" in "results" becomes an id derived from that
## entry's "question_text".
for filename in os.listdir("delegate_game_logs"):
    # Only touch game-data files whose name contains "_Simple".
    if not ("_Simple" in filename and filename.endswith("_game_data.json")):
        continue

    fname = os.path.join("delegate_game_logs", filename)
    with open(fname, 'r', encoding='utf-8') as f:
        game_data = json.load(f)

    for phase_key in ('phase1_questions', 'phase2_questions'):
        for q in game_data[phase_key]:
            q['id'] = text_to_id(q['question'])

    for q in game_data['results']:
        q['question_id'] = text_to_id(q['question_text'])

    # Overwrite the original file with the re-keyed data.
    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(game_data, f, ensure_ascii=False, indent=2)

In [None]:
# Re-key the "results" mapping of the selected compiled file so each entry is
# keyed by an id derived from its "question" text, then rewrite the file.
targ_dir = "compiled_results_sqa"
for filename in os.listdir(targ_dir):
    # Only the claude-3-5-sonnet phase-1 compilation is processed by this cell.
    if "claude-3-5-sonnet-20241022_phase1_compiled.json" not in filename:
        continue
    if not filename.endswith(".json"):
        continue

    fname = os.path.join(targ_dir, filename)
    with open(fname, 'r', encoding='utf-8') as f:
        game_data = json.load(f)

    # Build the re-keyed mapping, recording any old ids whose question text
    # hashes to a new id that is already taken. Without this check the later
    # entry would silently replace the earlier one and the file would be
    # rewritten with fewer results than it started with.
    new_results = {}
    first_old_id = {}
    collisions = {}
    for old_id, result_data in game_data['results'].items():
        new_id = text_to_id(result_data['question'])
        if new_id in new_results:
            collisions.setdefault(new_id, [first_old_id[new_id]]).append(old_id)
            continue
        new_results[new_id] = result_data
        first_old_id[new_id] = old_id

    if collisions:
        # Fail before touching the file so no result is lost on disk.
        raise ValueError(
            f"{fname}: multiple results share the same question text and would "
            f"collapse to one id; file left unchanged. Colliding old ids: {collisions}"
        )

    # Replace the results dict
    game_data['results'] = new_results

    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(game_data, f, ensure_ascii=False, indent=2)

In [41]:
## check for duplicate ids
from collections import Counter


def load_with_duplicate_result_ids(path):
    """Load a compiled-results JSON file and report repeated result ids.

    A plain ``json.load`` keeps only the last value for a repeated key, so the
    keys of the loaded dict are always unique and counting them can never
    reveal a duplicate. This loader inspects the raw key/value pairs of each
    JSON object as it is parsed instead.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A ``(data, duplicates)`` tuple: ``data`` is the parsed document (plain
        dicts, last value wins for repeated keys, as with ``json.load``) and
        ``duplicates`` is the list of keys that occur more than once inside
        the top-level "results" object.
    """
    flagged = {}

    def build_object(pairs):
        obj = dict(pairs)
        if len(obj) != len(pairs):
            counts = Counter(key for key, _ in pairs)
            repeated = [key for key, n in counts.items() if n > 1]
            # Store obj alongside the keys so it stays alive and its id()
            # cannot be reused by a different object during parsing.
            flagged[id(obj)] = (obj, repeated)
        return obj

    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f, object_pairs_hook=build_object)
    _, duplicates = flagged.get(id(data['results']), (None, []))
    return data, duplicates


targ_dir = "compiled_results_sqa"
for filename in os.listdir(targ_dir):
    if not filename.endswith(".json"):
        continue
    fname = os.path.join(targ_dir, filename)
    game_data, duplicates = load_with_duplicate_result_ids(fname)
    if duplicates:
        print(f"Duplicate IDs found in {filename}: {duplicates}")
    else:
        print(f"No duplicate IDs found in {filename}")

No duplicate IDs found in gemini-2.0-flash-001_phase1_compiled.json
No duplicate IDs found in deepseek-chat_phase1_compiled.json
No duplicate IDs found in gemini-2.5-flash-preview-04-17_phase1_compiled.json
No duplicate IDs found in grok-3-latest_phase1_compiled.json
No duplicate IDs found in gpt-4o-2024-08-06_phase1_compiled.json
No duplicate IDs found in claude-sonnet-4-20250514_phase1_compiled.json
No duplicate IDs found in claude-3-5-sonnet-20241022_phase1_compiled.json
