In [14]:
import json
# Read the cached short-answer ratings from disk.
with open("./shortanswer_ratings_cache.json", mode='r', encoding='utf-8') as f:
    sa_cache = json.load(f)

# Collapse the cache to one record per question id:
#   qid -> {"question": ..., "correct_answer": ...}
# The first cache entry seen for a given qid wins; later ones are ignored.
sa_qdict = {}
for qdata in sa_cache.values():
    qid = qdata['qid']
    if qid in sa_qdict:
        continue
    sa_qdict[qid] = {
        "question": qdata['question'],
        "correct_answer": qdata['correct_answer'],
    }

In [4]:
# NOTE(review): the variable is named `claude_results`, but the file loaded here is the
# grok-3-latest compilation. The name is kept unchanged in case other cells rely on it.
with open("./compiled_results_sqa/grok-3-latest_phase1_compiled.json", mode='r', encoding='utf-8') as f:
    claude_results = json.load(f)

# Flag every question id in the compiled results that has no entry in sa_qdict.
for qid in claude_results["results"]:
    if qid in sa_qdict:
        continue
    print(f"Warning: {qid} not found in sa_qdict")


In [None]:
import re
from base_game_class import BaseGameClass

judge_model = "claude-opus-4-20250514"
sa_to_mc_file = "./SimpleMC.jsonl"
MAX_DISTRACTOR_ATTEMPTS = 10  # upper bound on LLM calls per question before giving up

judge = BaseGameClass(subject_id=None, subject_name=judge_model, is_human_player=False, log_dir=None)
sysprompt=""
prompt = """I need your help turning a short-answer quiz into a multiple-choice quiz. I'm going to show you a question and its correct answer, and I want you to generate three distractors. 
Each distractor should be a plausible answer that is NOT the correct answer. Each should be the same \"type\" of answer as the correct answer (e.g., date, person name, number, etc), and follow the format of the correct answer.
Output each distractor a separate line, and do not include any other text. Your entire response should be just the distractors, one per line.
Here is the question and correct answer:

Question: {question}

Correct Answer: {correct_answer}

Distractors:
"""


def _parse_distractors(resp, correct_answer):
    """Turn a raw judge response into a clean list of three distractors.

    Lines are stripped and blank lines are dropped, so a trailing newline or an
    empty line between answers does not invalidate an otherwise good response.

    Returns the list of three stripped distractors, or None when the response is
    empty, does not contain exactly three non-blank lines, or when the distractors
    are not (case-insensitively) distinct from each other and from the correct answer.
    """
    if not resp:
        return None
    distractors = [line.strip() for line in re.split(r'[\r\n]+', resp) if line.strip()]
    if len(distractors) != 3:
        return None
    distinct = {d.upper() for d in distractors} | {correct_answer.upper()}
    return distractors if len(distinct) == 4 else None


# The output file is opened in append mode, so collect the ids that were already
# written by a previous (possibly interrupted) run and skip them instead of
# paying for the LLM calls again and appending duplicate records.
done_qids = set()
try:
    with open(sa_to_mc_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            if line.strip():
                done_qids.add(json.loads(line)["qid"])
except FileNotFoundError:
    pass  # first run: nothing to resume from

# `with` guarantees the handle is closed even if an LLM call raises midway.
with open(sa_to_mc_file, 'a', encoding='utf-8') as fout:
    for ctr, (qid, qdata) in enumerate(sa_qdict.items()):
        if qid in done_qids:
            continue
        question = qdata['question']
        correct_answer = qdata['correct_answer'].strip()
        print(f"Processing question {ctr+1}, ID: {qid}")

        ans_list = None
        for _attempt in range(MAX_DISTRACTOR_ATTEMPTS):
            resp, _, _ = judge._get_llm_answer(options=None, q_text=prompt.format(question=question, correct_answer=correct_answer), message_history=[], keep_appending=False, setup_text=sysprompt, MAX_TOKENS=None, temp=1.0)
            ans_list = _parse_distractors(resp, correct_answer)
            if ans_list is not None:
                break
            print(f"Invalid response format for question {qid}. Retrying...")

        if ans_list is None:
            # Bounded retries: do not spin forever on a question the judge cannot handle.
            print(f"Giving up on question {qid} after {MAX_DISTRACTOR_ATTEMPTS} invalid responses; skipping.")
            continue

        fout.write(json.dumps({"qid": qid, "question": question, "correct_answer": correct_answer, "distractors": ans_list}, ensure_ascii=False) + "\n")
        fout.flush()  # persist each record immediately so an interruption loses at most one question


Provider: Anthropic
Processing question 499, ID: sqa_test_2469
In model_call, provider=Anthropic, attempt=1
Processing question 500, ID: sqa_test_1573
In model_call, provider=Anthropic, attempt=1


In [27]:
from load_and_format_datasets import load_and_format_dataset
# Load the "SimpleMC" dataset through the project's load_and_format_dataset helper.
qs=load_and_format_dataset("SimpleMC")


Attempting to load SimpleMC...
Dataset loaded successfully.
Attempting to load SimpleQA (test split)...
Dataset loaded successfully.
Formatting 4326 questions...
Successfully formatted 4326 unique questions from SimpleQA.
Formatting 500 questions...
Successfully formatted 500 unique questions from SimpleMC.


In [28]:
# Find the first entry of `qs` whose question text is the Olympic-fencing question
# and print its index together with the full record.
target_question = 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?'
for i, q in enumerate(qs):
    if q['question'] != target_question:
        continue
    print(f"Found question at index {i}: {q}")
    break

Found question at index 36: {'id': 'sqa_test_479', 'question': 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?', 'options': {'A': 'Sabre', 'B': 'Foil', 'C': 'Rapier', 'D': 'Épée'}, 'correct_answer': 'A', 'answer_type': 'Person', 'topic': 'Politics'}


In [30]:
# Load the "SimpleQA" dataset through the project's load_and_format_dataset helper.
sqa=load_and_format_dataset("SimpleQA")

Attempting to load SimpleQA (test split)...
Dataset loaded successfully.
Formatting 4326 questions...
Successfully formatted 4326 unique questions from SimpleQA.


In [31]:
# Find the first entry of `sqa` whose question text is the Olympic-fencing question
# and print its index together with the full record.
target_question = 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?'
for i, q in enumerate(sqa):
    if q['question'] != target_question:
        continue
    print(f"Found question at index {i}: {q}")
    break

Found question at index 1500: {'id': 'sqa_test_789', 'question': 'Which of the three Olympic fencing weapons was the last one to transition to using electrical equipment?', 'correct_answer': 'Sabre', 'answer_type': 'Other', 'topic': 'Sports'}


In [None]:
import hashlib
import os
import json
def text_to_id(text):
    """Return a deterministic id for `text`.

    The id is the literal prefix "sqa_test_" followed by the hexadecimal SHA-256
    digest of the UTF-8 encoding of `text`, so equal texts always map to equal ids.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode('utf-8'))
    return f"sqa_test_{hasher.hexdigest()}"

## Re-key the "_Simple" game-data logs in place: every "id" in "phase1_questions" and
## "phase2_questions" becomes text_to_id(question), and every "question_id" in "results"
## becomes text_to_id(question_text). Each matching file is overwritten with the result.
log_dir = "delegate_game_logs"
for filename in os.listdir(log_dir):
    # Only touch game-data files of the "_Simple" datasets.
    if "_Simple" not in filename or not filename.endswith("_game_data.json"):
        continue

    fname = os.path.join(log_dir, filename)
    with open(fname, 'r', encoding='utf-8') as f:
        game_data = json.load(f)

    for phase_key in ('phase1_questions', 'phase2_questions'):
        for q in game_data[phase_key]:
            q['id'] = text_to_id(q['question'])
    for q in game_data['results']:
        q['question_id'] = text_to_id(q['question_text'])

    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(game_data, f, ensure_ascii=False, indent=2)

In [43]:
# Re-key the "results" dict of the selected compiled-results file so that each key is
# text_to_id(question text) instead of the old id, then overwrite the file.
targ_dir = "compiled_results_sqa"
for filename in os.listdir(targ_dir):
    if "claude-3-5-sonnet-20241022_phase1_compiled.json" not in filename:
        continue
    if not filename.endswith(".json"):
        continue

    fname = os.path.join(targ_dir, filename)
    with open(fname, 'r', encoding='utf-8') as f:
        game_data = json.load(f)

    # Build the re-keyed dict, watching for two entries that hash to the same new id
    # (i.e. identical question text). Without this check the later entry would
    # silently replace the earlier one and a result would be lost on disk.
    new_results = {}
    collisions = []
    for old_id, result_data in game_data['results'].items():
        new_id = text_to_id(result_data['question'])
        if new_id in new_results:
            collisions.append((old_id, new_id))
            continue
        new_results[new_id] = result_data

    if collisions:
        # Leave the file untouched rather than writing a truncated results dict.
        print(f"Skipping {filename}: {len(collisions)} id collision(s) after re-keying: {collisions}")
        continue

    # Replace the results dict
    game_data['results'] = new_results

    with open(fname, 'w', encoding='utf-8') as f:
        json.dump(game_data, f, ensure_ascii=False, indent=2)

In [41]:
## check for duplicate ids
from collections import Counter


class _DupTrackingDict(dict):
    """dict built from JSON (key, value) pairs that remembers repeated keys.

    A plain dict (what json.load returns by default) keeps only the last value for
    a repeated key, so duplicates can never be observed after loading. Used as
    `object_pairs_hook`, this class records them in `duplicate_keys` at parse time.
    """

    def __init__(self, pairs):
        super().__init__(pairs)
        key_counts = Counter(key for key, _ in pairs)
        self.duplicate_keys = [key for key, count in key_counts.items() if count > 1]


targ_dir = "compiled_results_sqa"
for filename in os.listdir(targ_dir):
    if not filename.endswith(".json"):
        continue
    fname = os.path.join(targ_dir, filename)
    with open(fname, 'r', encoding='utf-8') as f:
        # Every JSON object is parsed into a _DupTrackingDict, including "results".
        game_data = json.load(f, object_pairs_hook=_DupTrackingDict)
    duplicates = game_data['results'].duplicate_keys
    if duplicates:
        print(f"Duplicate IDs found in {filename}: {duplicates}")
    else:
        print(f"No duplicate IDs found in {filename}")

No duplicate IDs found in gemini-2.0-flash-001_phase1_compiled.json
No duplicate IDs found in deepseek-chat_phase1_compiled.json
No duplicate IDs found in gemini-2.5-flash-preview-04-17_phase1_compiled.json
No duplicate IDs found in grok-3-latest_phase1_compiled.json
No duplicate IDs found in gpt-4o-2024-08-06_phase1_compiled.json
No duplicate IDs found in claude-sonnet-4-20250514_phase1_compiled.json
No duplicate IDs found in claude-3-5-sonnet-20241022_phase1_compiled.json


In [44]:
import json
# Compare two evaluated game logs trial-by-trial, matching trials on question_id.
# NOTE(review): besides history vs. "nohistory_summary", the two filenames also differ
# in their "team" value (team0.7 vs team0.5) — confirm this is the intended comparison.
f1 = "./delegate_game_logs/claude-3-5-sonnet-20241022_GPSA_50_450_team0.7_temp0.0_1749479584_game_data_evaluated.json"
f2 = "./delegate_game_logs/claude-3-5-sonnet-20241022_GPSA_50_450_nohistory_summary_team0.5_temp0.0_1749559243_game_data_evaluated.json"
with open(f1, 'r', encoding='utf-8') as f:
    game_data_hist = json.load(f)
with open(f2, 'r', encoding='utf-8') as f:
    game_data_nohist = json.load(f)

# Index the no-history trials by question id once, instead of rescanning the whole
# list for every history trial. setdefault keeps the FIRST trial for a repeated id,
# which is the same trial a first-match linear scan would have found.
nohist_by_qid = {}
for trial_nohist in game_data_nohist['results']:
    nohist_by_qid.setdefault(trial_nohist['question_id'], trial_nohist)

diff_del_list = []      # (hist trial, nohist trial) pairs whose delegation_choice differs
diff_choice_list = []   # pairs that both chose "Self" but differ in subject_correct
for trial in game_data_hist['results']:
    qid = trial['question_id']
    trial_nohist = nohist_by_qid.get(qid)
    if trial_nohist is None:
        continue  # question not present in the no-history run
    if trial['delegation_choice'] != trial_nohist['delegation_choice']:
        diff_del_list.append((trial, trial_nohist))
    elif (trial['delegation_choice']=="Self" and trial['subject_correct'] != trial_nohist['subject_correct']):
        diff_choice_list.append((trial, trial_nohist))
print(f"Found {len(diff_del_list)} trials with delegation differences between history and no history versions.")
print(f"Found {len(diff_choice_list)} trials with choice differences between history and no history versions.")


Found 158 trials with delegation differences between history and no history versions.
Found 19 trials with choice differences between history and no history versions.


In [51]:
# Display one (history trial, no-history trial) pair from diff_choice_list;
# index 6 appears to be a hand-picked example rather than a meaningful constant.
diff_choice_list[6]

({'subject_id': 'claude-3-5-sonnet-20241022_GPSA_50_450_team0.7_temp0.0',
  'phase': 2,
  'trial_in_phase': 193,
  'question_id': 'gpqa_train_recjgMJaMxz4ESDF2',
  'question_text': "Compounds that have the same molecular formula but are different in their structural arrangement are known as isomers. Isomers have two types, constitutional isomers and stereoisomers. Constitutional isomers have the same molecular formula but differ in their structures. In stereoisomers, molecules are connected in the same way, but their arrangements in space are different.\nWhich of the following organic moieties show optical isomerism?\n\n1. dimethyl 6,6'-dinitro-[1,1'-biphenyl]-2,2'-dicarboxylate\n2. methyl 2-hydroxypropanoate\n3. benzophenone\n4. dimethyl fumarate",
  'correct_answer': '1 and 2',
  'timestamp': 1749480176.218984,
  'subject_answer': '2 (methyl 2-hydroxypropanoate)\n\nThis compound has a chiral carbon with four different substituents: -H, -OH, -CH3, and -COOCH3.',
  'subject_correct': F

In [None]:
def contingency(delegate: np.ndarray, correct: np.ndarray):
    """
    Count the four cells of the delegate-vs-correct 2x2 table.

    delegate : array-like[N]   truthy -> model delegated
    correct  : array-like[N]   truthy -> model would be correct on its own
    returns  : TP, FN, FP, TN as integer counts, where
                   TP = delegate & wrong
                   FN = keep     & wrong
                   FP = delegate & right
                   TN = keep     & right

    Both inputs are coerced to boolean arrays first. This matters because `~` on an
    integer 0/1 array is a bitwise NOT (yielding -1/-2), not a logical negation, so
    integer inputs could otherwise produce wrong or even negative counts.
    """
    delegate = np.asarray(delegate, dtype=bool)
    correct = np.asarray(correct, dtype=bool)
    kept = ~delegate
    wrong = ~correct

    TP = np.sum(delegate & wrong)     # delegate & wrong
    FN = np.sum(kept     & wrong)     # keep     & wrong
    FP = np.sum(delegate & correct)   # delegate & right
    TN = np.sum(kept     & correct)   # keep     & right
    return TP, FN, FP, TN

def lift_mcc_stats(tp, fn, fp, tn, p0, n_boot=2000, seed=0):
    """
    Point estimates, bootstrap CIs and p-values for the kept-item accuracy lift
    and the Matthews correlation of a delegate-vs-correct 2x2 table.

    Parameters
    ----------
    tp, fn, fp, tn : int
        Contingency counts on Phase-2 items  
            tp = delegate & wrong  
            fn = keep & wrong  
            fp = delegate & right  
            tn = keep & right
    p0 : float
        Baseline accuracy to test against (global for RAW, hybrid value for HYBRID)
    n_boot : int
        Number of bootstrap resamples used for the percentile CIs.
    seed : int
        Seed for the bootstrap random generator.

    Returns
    -------
    dict with point estimates, CIs, and p-values for
        lift   = acc_kept - p0
        mcc    = Matthews correlation

    Degenerate inputs yield NaN instead of raising: `lift`/`p_lift` are NaN when
    no item was kept (or `p0` is NaN), `mcc` is NaN when a table margin is zero,
    and both CIs are (NaN, NaN) when the table is empty.
    """
    rng = np.random.default_rng(seed)

    # Work with plain Python ints: the four-way product below can silently wrap
    # around when the counts arrive as fixed-width numpy integers.
    tp, fn, fp, tn = int(tp), int(fn), int(fp), int(tn)

    # ---------- point estimates --------------------------------------------
    k         = fn + tn                       # kept items
    kept_acc  = tn / k if k else np.nan
    lift      = kept_acc - p0

    denom = math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    mcc   = (tp*tn - fp*fn) / denom if denom else np.nan

    # ---------- p-values ----------------------------------------------------
    # binomtest rejects n == 0 and a NaN probability, so mirror the NaN
    # convention of the point estimate instead of raising.
    if k and not math.isnan(p0):
        p_lift = binomtest(tn, k, p0, alternative='two-sided').pvalue
    else:
        p_lift = np.nan
    # NOTE(review): McNemar's test only compares the off-diagonal cells (fp vs fn),
    # i.e. it tests marginal homogeneity, not whether MCC differs from 0. An
    # association test (e.g. Fisher's exact test) may be what is intended — confirm.
    p_mcc  = mcnemar([[tn, fp],
                      [fn, tp]], exact=True).pvalue   # two-sided by default

    # ---------- bootstrap CIs ----------------------------------------------
    N        = tp + fn + fp + tn
    if N == 0:
        # Nothing to resample from (cell probabilities would be 0/0).
        ci_lift = np.array([np.nan, np.nan])
        ci_mcc  = np.array([np.nan, np.nan])
    else:
        counts   = np.array([tp, fn, fp, tn], int)
        probs    = counts / N

        lifts, mccs = [], []
        for _ in range(n_boot):
            sample = rng.choice(4, size=N, replace=True, p=probs)
            btp, bfn, bfp, btn = (int(c) for c in np.bincount(sample, minlength=4))

            # NOTE: degenerate resamples fall back to 0.0 here (not NaN as in the
            # point estimates) so that the percentiles below stay defined.
            bk     = bfn + btn
            b_acc  = btn / bk if bk else 0.0
            lifts.append(b_acc - p0)

            bden = math.sqrt((btp+bfp)*(btp+bfn)*(btn+bfp)*(btn+bfn))
            bmcc = (btp*btn - bfp*bfn) / bden if bden else 0.0
            mccs.append(bmcc)

        ci_lift = np.percentile(lifts, [2.5, 97.5])
        ci_mcc  = np.percentile(mccs,  [2.5, 97.5])

    return dict(
        lift       = lift,
        lift_ci    = tuple(ci_lift),
        p_lift     = p_lift,
        mcc        = mcc,
        mcc_ci     = tuple(ci_mcc),
        p_mcc      = p_mcc
    )

...

delegated = np.array(df_model['delegate_choice'], bool)
kept_mask = ~delegated                       # True where model answered itself
cap_corr = np.array(df_model['s_i_capability'], int)   # Baseline correctness from capabilities file
team_corr = np.where(df_model['delegate_choice'] == 0, df_model['team_correct'].fillna(0).astype(int), 0).astype(int) #Real in-game self correctness (only defined when kept)
# Hybrid correctness label 
#    – use real game correctness when the model kept
#    – fallback to baseline correctness when it delegated
true_label = np.where(kept_mask, team_corr, cap_corr)   # 1 = model would be correct

# ---- RAW: delegation vs. baseline (capabilities-file) correctness ----------------
TP, FN, FP, TN = contingency(delegated, cap_corr)
raw_stats = lift_mcc_stats(TP, FN, FP, TN, cap_corr.mean())
log_output(f"Introspection score = {raw_stats['mcc']:.3f} [{raw_stats['mcc_ci'][0]:.3f}, {raw_stats['mcc_ci'][1]:.3f}], p={raw_stats['p_mcc']:.4g}")
delta_d, ci_low, ci_high, p_val = delegate_gap_stats(TP=TP, FN=FN, FP=FP, TN=TN)
# NOTE(review): the closing bracket in the two "gap" messages sits after the p-value,
# unlike the other messages; left unchanged in case the log format is parsed elsewhere.
log_output(f"Delegate Gap = {delta_d:.3f} [{ci_low:.3f}, {ci_high:.3f}, p={p_val:.4g}]")

# ---- ADJUSTED: delegation vs. the hybrid correctness label -----------------------
TP, FN, FP, TN = contingency(delegated, true_label)
N = (TP+FP+TN+FN)
k   = FN + TN                                # number of kept items
acc_kept   = TN / k if k else np.nan         # undefined when nothing was kept
acc_deleg  = cap_corr[delegated].mean() if delegated.any() else np.nan   # undefined when nothing was delegated
# Hybrid baseline = (k/N)*acc_kept + (1-k/N)*acc_deleg. Written in its pooled form
# (correct kept items + baseline-correct delegated items) / N, it is the same value
# but stays finite when one of the two groups is empty, where the weighted form
# would evaluate 0 * NaN and hand a NaN baseline to the binomial test.
p0_hyb     = (TN + cap_corr[delegated].sum()) / N if N else np.nan
adj_stats = lift_mcc_stats(TP, FN, FP, TN, p0_hyb)

log_output(f"Adjusted introspection score = {adj_stats['mcc']:.3f} [{adj_stats['mcc_ci'][0]:.3f}, {adj_stats['mcc_ci'][1]:.3f}], p={adj_stats['p_mcc']:.4g}")
delta_d, ci_low, ci_high, p_val = delegate_gap_stats(TP=TP, FN=FN, FP=FP, TN=TN)
log_output(f"Adjusted delegate gap = {delta_d:.3f} [{ci_low:.3f}, {ci_high:.3f}, p={p_val:.4g}]")

log_output(f"Self-acc lift = {raw_stats['lift']:.3f} [{raw_stats['lift_ci'][0]:.3f}, {raw_stats['lift_ci'][1]:.3f}], p={raw_stats['p_lift']:.4g}")

log_output(f"Adjusted self-acc lift = {adj_stats['lift']:.3f} [{adj_stats['lift_ci'][0]:.3f}, {adj_stats['lift_ci'][1]:.3f}], p={adj_stats['p_lift']:.4g}")
