In [1]:
import pandas as pd    
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the annotated GPQA-diamond records; the file is JSON Lines (one record per line).
data = pd.read_json('gpqa_diamond/nikhil_gpqa.jsonl', lines=True)

In [3]:
# Keep only rows whose "rating_osq" and "rating_multians" ratings are both at least 4.
osq_ok = data["rating_osq"] >= 4
multians_ok = data["rating_multians"] >= 4
data = data[osq_ok & multians_ok]

In [4]:
# Take the model list from the first remaining row *by position*. The filtered
# frame keeps its original index labels, so the label-based `data["model"][0]`
# raises KeyError as soon as the row labelled 0 is removed by the rating filter.
models = data["model"].iloc[0]

In [5]:
# Per-model collections keyed by model name:
#   responses[name][k] -> that model's answer to the k-th kept question
#   scores[name][k]    -> 1 if its "rating_match" for that question is >= 4, else 0
# questions[k] holds the matching question text.
responses = {name: [] for name in models}
scores = {name: [] for name in models}
questions = []

# Each row stores parallel lists ("model", "rating_match", "response") whose model
# order differs from row to row, so entries are routed by model name, not position.
# The loop variable is named `model_name` (not `model`) so it cannot collide with
# the name used for the loaded judge network elsewhere in the notebook.
for row_models, row_ratings, row_responses, question_text in zip(
    data["model"], data["rating_match"], data["response"], data["question_text"]
):
    for pos, model_name in enumerate(row_models):
        scores[model_name].append(int(row_ratings[pos] >= 4))
        responses[model_name].append(row_responses[pos])
    questions.append(question_text)
        
        
        
    

In [6]:
# Dump the collected question texts for a quick visual check.
# NOTE(review): this prints every question in one list; consider slicing
# (e.g. questions[:5]) to keep the cell output small.
print(questions)

['A spin-half particle is in a linear superposition 0.5|\\uparrow\\rangle+sqrt(3)/2|\\downarrow\\rangle of its spin-up and spin-down states. If |\\uparrow\\rangle and |\\downarrow\\rangle are the eigenstates of \\sigma{z} , then what is the expectation value up to one decimal place, of the operator 10\\sigma{z}+5\\sigma_{x} ? Here, symbols have their usual meanings', '7-(tert-butoxy)bicyclo[2.2.1]hepta-2,5-diene is combined with 2 equivalents of 5,6-bis(dibromomethyl)cyclohexa-1,3-diene and sodium iodide at elevated temperature, forming product 1.\n\n1 is treated with aqueous sulfuric acid, forming 2\n\n2 is treated with SO3 and pyridine in DMSO, forming 3.\n\n3 is heated at 150C, forming final product 4. how many chemically distinct hydrogen atoms are there on 4?', 'toluene is treated with nitric acid and sulfuric acid, forming product 1.\n\n1 is treated with MnO2 and H2SO4, forming product 2.\n\n2 is treated with acetone and aqueous sodium hydroxide, forming product 3.\n\nwhat is the

In [7]:
# Report each model's accuracy (mean of its 0/1 correctness flags) together with
# its name, so the numbers can be read without knowing the dict's insertion order.
for key, model_scores in scores.items():
    print(f"{key}: {np.mean(model_scores)}")

0.4418604651162791
0.3178294573643411
0.43410852713178294
0.4263565891472868


In [8]:
# Every unordered pair of models, emitted as (models[i], models[j]) with j < i.
# The tuple order is kept as-is because result keys are built as
# "<first>_vs_<second>". The bound follows len(models) instead of a literal 4 so
# the cell keeps working if the number of evaluated models changes.
model_pairs = [
    (models[i], models[j])
    for i in range(len(models))
    for j in range(i)
]

In [9]:
# Display model_pairs as the cell's output (bare last expression).
model_pairs

[('openai/gpt-4o', 'deepseek/deepseek-chat-v3-0324'),
 ('meta-llama/llama-4-maverick', 'deepseek/deepseek-chat-v3-0324'),
 ('meta-llama/llama-4-maverick', 'openai/gpt-4o'),
 ('qwen/qwen3-32b', 'deepseek/deepseek-chat-v3-0324'),
 ('qwen/qwen3-32b', 'openai/gpt-4o'),
 ('qwen/qwen3-32b', 'meta-llama/llama-4-maverick')]

In [10]:
# Print the two responses of every model pair side by side, one question per line.
for first_model, second_model in model_pairs:
    first_responses = responses[first_model]
    second_responses = responses[second_model]
    for row in range(len(data)):
        print(first_responses[row], second_responses[row])

-0.7 -0.7
8 2
C_s Cₛ
m_{H_2}^2 \sim \frac{y_i^2}{16\pi^2} \Lambda^2 \( m_{H_2} \approx \frac{y_i^2 x}{4\pi} \)
C6H10O(C6H5)F 2-(fluoro(phenyl)methyl)cyclohexanone
Ethanethiolate > Hydroxide > 4-methylcyclohexan-1-olate > Propionate > Methanol Ethanethiolate > Hydroxide > 4-methylcyclohexan-1-olate > Propionate > Methanol
Enhancers or regulatory regions where IKAROS is indirectly associated with DNA through protein-protein interactions. Disappearing peaks are most likely found at binding sites of other transcription factors that recruit IKAROS indirectly via protein-protein interactions.
D3h D₅h
2 2
\frac{9.08}{k} 36.0 fm
Cyclohexene derivative with a butyl group 5-ethyl-6-propylnona-1,5-diene
The unknown substance likely acted as a base, neutralizing H+ ions, increasing the pH, and slowing the reaction rate. The unknown substance was likely a base or buffer that neutralized H⁺ ions, reducing their concentration and slowing the reaction, while releasing heat from the neutralization proc

In [11]:
def get_judge_prompt_with_gt(question, target, response, incorrect_options=None, cot=False):
    """
    Build the prompt asking a judge model whether a response matches a ground truth.

    The prompt states the matching rules, lists the two possible judgments
    ("0" = no match, "1" = match), embeds the question, ground truth and
    response in double quotes, and ends with " Answer: " so that the judge's
    next generated token is the verdict.

    Args:
        question: The question being asked.
        target: The ground truth answer the response is compared against.
        response: The response to judge.
        incorrect_options: Optional string containing incorrect options. When
            given (and non-empty) it is inserted, preceded by a blank line,
            between the ground truth and the response.
        cot: Accepted for interface compatibility; it currently has no effect
            on the generated prompt.

    Returns:
        A formatted prompt string for the judge.
    """
    # Lines are listed explicitly (rather than in a triple-quoted block) so the
    # trailing spaces that are part of the prompt text stay visible and intact.
    instruction_lines = [
        'Your task is to judge whether the given response to a question matches a given ground truth answer or not. You are provided with a question, a ground truth response, and the response you need to judge.',
        'For a response to "match", it must have at least as much information as the ground-truth. ',
        'The response can have more information than the ground-truth. It can be more specific (for example, "Labrador" is more specific than "dog"), or have additional possible correct answers. But it must cover everything mentioned in the ground-truth. It is okay if it covers it in different words, i.e. paraphrased. ',
        "For numeric answers, the relative error, defined as |response - ground truth| / mean(response, ground truth), must be less than 1% for the response to be judged as a correct match. Here, if the ground truth is a specific numeric quantity but the response is a range, then they don't match (even if the range contains the ground truth).",
        '',
        'Possible judgments:',
        '',
        '"0": The response does not match the ground-truth answer.',
        '"1": The response matches the ground-truth.',
        '',
        f'Question: "{question}"',
        f'Ground truth: "{target}"',
    ]
    prompt = "\n".join(instruction_lines) + "\n"

    if incorrect_options:
        prompt += f"\n{incorrect_options}"
        # Make sure "Response:" starts on its own line; previously it was glued
        # directly onto the end of the options text.
        if not prompt.endswith("\n"):
            prompt += "\n"

    prompt += (
        f'Response: "{response}"\n'
        '\n'
        'Your job is to ONLY check whether the given response matches the ground truth answer or not in the context of the question. You DO NOT NEED to assess the correctness of the response. This is part of an automated evaluation process, therefore you MUST OUTPUT your final answer as "0" or "1".'
    )

    # Cue for single-token decoding. A chain-of-thought variant (asking the judge
    # to end with <answer>0</answer> / <answer>1</answer> tags) is not implemented.
    prompt += " Answer: "

    return prompt

In [12]:
# Local checkpoint used as the judge; defined once so the tokenizer and the
# model are always loaded from the same location.
JUDGE_MODEL_PATH = "/fast/groups/sf/huggingface-models/meta-llama--Llama-4-Scout-17B-16E"

tokenizer = AutoTokenizer.from_pretrained(JUDGE_MODEL_PATH)
# `dtype` replaces the deprecated `torch_dtype` keyword (the installed
# transformers version reports "`torch_dtype` is deprecated! Use `dtype` instead!").
# device_map="auto" lets the library place the weights on the available devices.
model = AutoModelForCausalLM.from_pretrained(JUDGE_MODEL_PATH, dtype=torch.bfloat16, device_map="auto")

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|████████████████| 49/49 [02:52<00:00,  3.53s/it]


In [13]:
# Pairwise agreement judging. For every pair (first, second) in model_pairs the
# judge is shown the first model's response as the "ground truth" and the second
# model's response as the answer to grade, and must emit a single token: "0" or "1".
# results["<first>_vs_<second>"][k] is the verdict for the k-th question.
results = {m[0]+"_vs_"+m[1]: [] for m in model_pairs}

for m in model_pairs:
    pair_key = m[0]+"_vs_"+m[1]
    for i in tqdm(range(len(data))):
        gt = responses[m[0]][i]
        answer = responses[m[1]][i]
        question = questions[i]
        prompt = get_judge_prompt_with_gt(question, gt, answer)
        inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
        # Greedy decoding: the verdict must not depend on sampling defaults that
        # the checkpoint's generation config may enable.
        output_ids = model.generate(**inputs, max_new_tokens=1, do_sample=False)
        # Decode only the newly generated token. Reading the last character of
        # the full decoded sequence (prompt included) breaks on EOS/special or
        # whitespace tokens and re-decodes the whole prompt for nothing.
        new_token_ids = output_ids[0][inputs["input_ids"].shape[1]:]
        verdict = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
        if verdict not in ("0", "1"):
            raise ValueError(
                f"Judge returned {verdict!r} instead of '0' or '1' for pair {pair_key!r}, question index {i}"
            )
        results[pair_key].append(int(verdict))

100%|█████████████████████████████████████████| 129/129 [01:32<00:00,  1.39it/s]
100%|█████████████████████████████████████████| 129/129 [01:30<00:00,  1.43it/s]
100%|█████████████████████████████████████████| 129/129 [01:29<00:00,  1.44it/s]
100%|█████████████████████████████████████████| 129/129 [01:31<00:00,  1.42it/s]
100%|█████████████████████████████████████████| 129/129 [01:31<00:00,  1.42it/s]
100%|█████████████████████████████████████████| 129/129 [01:30<00:00,  1.43it/s]


In [16]:
# Persist each pair's judge verdicts under gpqa_diamond/ with a "scout_" prefix;
# "/" in the model names is replaced by "_" so the key is a valid file name.
for key, verdicts in results.items():
    filename = key.replace("/", "_")
    np.save("gpqa_diamond/scout_" + filename, verdicts)

In [14]:
# For each question, print the judge's verdict for the gpt-4o/deepseek pair next
# to the correctness gap between the two models (gpt-4o score minus deepseek score).
judge_verdicts = results['openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324']
gpt4o_scores = scores["openai/gpt-4o"]
deepseek_scores = scores['deepseek/deepseek-chat-v3-0324']
for row in range(len(data)):
    print(judge_verdicts[row], gpt4o_scores[row] - deepseek_scores[row])

1 0
0 0
1 0
0 0
1 0
1 0
1 1
0 0
1 0
0 0
0 0
1 0
1 0
1 0
1 0
1 0
1 0
1 0
1 -1
0 0
0 0
0 0
0 -1
1 0
1 -1
0 0
1 0
1 0
1 -1
1 1
0 0
1 0
1 0
1 0
0 0
1 0
1 -1
0 0
1 0
0 1
1 0
1 0
1 0
0 0
0 0
0 0
1 -1
1 0
1 0
0 -1
1 0
1 0
0 0
0 -1
1 0
1 0
1 0
1 0
0 0
1 0
1 0
1 0
0 0
1 0
0 0
1 0
0 0
1 -1
1 0
0 0
0 0
1 0
0 -1
0 0
0 1
1 0
1 0
1 0
1 0
1 1
1 0
1 0
0 1
1 0
0 0
1 0
1 0
0 -1
1 0
0 0
1 0
0 0
1 0
0 0
1 0
0 -1
1 0
0 -1
1 -1
1 0
0 0
1 0
1 0
1 0
1 0
0 -1
0 -1
1 0
1 -1
1 0
0 -1
0 0
1 0
1 0
0 0
0 0
0 0
0 0
1 0
0 -1
1 0
1 0
0 -1
0 0
0 0
0 0
0 -1
1 0
0 -1


In [15]:
# For each question, print the judge's verdict for the gpt-4o/deepseek pair
# followed by the two responses that were compared.
judge_verdicts = results['openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324']
gpt4o_responses = responses["openai/gpt-4o"]
deepseek_responses = responses['deepseek/deepseek-chat-v3-0324']
for row in range(len(data)):
    print(judge_verdicts[row], gpt4o_responses[row], deepseek_responses[row])

1 -0.7 -0.7
0 8 2
1 C_s Cₛ
0 m_{H_2}^2 \sim \frac{y_i^2}{16\pi^2} \Lambda^2 \( m_{H_2} \approx \frac{y_i^2 x}{4\pi} \)
1 C6H10O(C6H5)F 2-(fluoro(phenyl)methyl)cyclohexanone
1 Ethanethiolate > Hydroxide > 4-methylcyclohexan-1-olate > Propionate > Methanol Ethanethiolate > Hydroxide > 4-methylcyclohexan-1-olate > Propionate > Methanol
1 Enhancers or regulatory regions where IKAROS is indirectly associated with DNA through protein-protein interactions. Disappearing peaks are most likely found at binding sites of other transcription factors that recruit IKAROS indirectly via protein-protein interactions.
0 D3h D₅h
1 2 2
0 \frac{9.08}{k} 36.0 fm
0 Cyclohexene derivative with a butyl group 5-ethyl-6-propylnona-1,5-diene
1 The unknown substance likely acted as a base, neutralizing H+ ions, increasing the pH, and slowing the reaction rate. The unknown substance was likely a base or buffer that neutralized H⁺ ions, reducing their concentration and slowing the reaction, while releasing heat from

In [56]:
# Per-question correctness gap: gpt-4o score minus deepseek score (each score is 0 or 1).
gpt4o_scores = scores["openai/gpt-4o"]
deepseek_scores = scores['deepseek/deepseek-chat-v3-0324']
diffs = np.array([gpt4o_scores[k] - deepseek_scores[k] for k in range(len(data))])

In [57]:
# Mean correctness gap over the questions where the judge verdict is 1 (responses match).
judged_match = np.array(results['openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324'], dtype="bool")
diffs[judged_match].mean()

np.float64(-0.06481481481481481)

In [58]:
# Standard deviation (NumPy default, ddof=0) of the correctness gap over the
# questions where the judge verdict is 1 (responses match).
judged_match = np.array(results['openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324'], dtype="bool")
diffs[judged_match].std()

np.float64(0.4143971111222267)

In [59]:
# Mean correctness gap over the questions where the judge verdict is 0 (responses differ).
judged_match = np.array(results['openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324'], dtype="bool")
diffs[np.logical_not(judged_match)].mean()

np.float64(-0.42857142857142855)

In [60]:
# Standard deviation (NumPy default, ddof=0) of the correctness gap over the
# questions where the judge verdict is 0 (responses differ).
judged_match = np.array(results['openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324'], dtype="bool")
diffs[np.logical_not(judged_match)].std()

np.float64(0.49487165930539345)

In [69]:
# Save each model's per-question correctness flags under gpqa_diamond/
# ("/" in the model name is replaced by "_").
# The loop variable is deliberately not called `model`: that name holds the
# loaded judge network, and rebinding it to a string would break any later
# model.generate() call in the same kernel.
for model_name in models:
    np.save("gpqa_diamond/"+model_name.replace("/","_"),scores[model_name])

In [66]:
# Raw string: "\/" is an invalid escape sequence and triggers a SyntaxWarning.
# The characters stripped are still backslash and forward slash.
# NOTE: str.strip only trims leading/trailing characters, so slashes inside the
# key are left untouched (use str.replace to substitute them).
key.strip(r"\/")

  key.strip("\/")


'openai/gpt-4o_vs_deepseek/deepseek-chat-v3-0324'

In [27]:
# Inspect the current value of `prompt`.
# NOTE(review): relies on `prompt` left over from another cell (presumably the
# last iteration of the judging loop) — confirm before re-running out of order.
print(prompt)

Your task is to judge whether the given response to a question matches a given ground truth answer or not. You are provided with a question, a ground truth response, and the response you need to judge.
For a response to "match", it must have at least as much information as the ground-truth. 
The response can have more information than the ground-truth. It can be more specific (for example, "Labrador" is more specific than "dog"), or have additional possible correct answers. But it must cover everything mentioned in the ground-truth. It is okay if it covers it in different words, i.e. paraphrased. 
For numeric answers, the relative error, defined as |response - ground truth| / mean(response, ground truth), must be less than 1% for the response to be judged as a correct match. Here, if the ground truth is a specific numeric quantity but the response is a range, then they don't match (even if the range contains the ground truth).

Possible judgments:

"0": The response does not match the 

In [53]:
# Display the filtered DataFrame as the cell's output (bare last expression).
data

Unnamed: 0,question_id,model,thinking,question_text,answer,response,rating_match,rating_osq,rating_multians,comments,full_response
0,rec0wZvZgiz320KRs,"[deepseek/deepseek-chat-v3-0324, openai/gpt-4o...","[, , , ]",A spin-half particle is in a linear superposit...,-0.7,"[-0.7, -0.7, -0.7, -0.7]","[5, 5, 5, 5]",5,5,I am confident answer has unique as I found th...,[To find the expectation value of the operator...
1,recRgabRzMaEoBRcM,"[deepseek/deepseek-chat-v3-0324, meta-llama/ll...","[, , , ]","7-(tert-butoxy)bicyclo[2.2.1]hepta-2,5-diene i...",4,"[2, 6, 3, 8]","[1, 1, 1, 1]",5,5,Answer has to be unique?,[To determine the number of chemically distinc...
2,recDj2Y2BbtV02Wv5,"[qwen/qwen3-32b, deepseek/deepseek-chat-v3-032...","[, , , ]",toluene is treated with nitric acid and sulfur...,c2h,"[Cs, Cₛ, C_s, C2v]","[1, 1, 1, 1]",5,4,,[The question involves a sequence of three che...
4,rec6sE2CRtD4drtHg,"[qwen/qwen3-32b, openai/gpt-4o, deepseek/deeps...","[, , , ]",Consider the extension of the Standard Model g...,M_{h_{2}}^{2}=\frac{1}{8\pi^{2}\left(x^{2}+v^{...,"[\boxed{\frac{g y x}{4\sqrt{2}\pi}}, m_{H_2}^2...","[1, 1, 1, 1]",4,4,,[To determine the approximation of the mass of...
8,recZWeueB7lSPR6wN,"[meta-llama/llama-4-maverick, deepseek/deepsee...","[, , , ]",cyclohexanone is treated with LDA at low tempe...,"((R)-((R)-2,2-difluorocyclohexyl)fluoromethyl)...","[2-(fluoro(phenyl)methyl)cyclohexanone, 2-(flu...","[1, 1, 1, 1]",4,4,Kind of leaning yes? see explanation,[## Step 1: Understand the given reaction sequ...
...,...,...,...,...,...,...,...,...,...,...,...
191,reczjcMtrB1YGS2fO,"[deepseek/deepseek-chat-v3-0324, openai/gpt-4o...","[, , , ]","Consider an aperture, which shapes like a N-si...",0.506 \lambda / a,"[\frac{\lambda}{a}, \(\frac{1.22 \lambda}{2a}\...","[1, 1, 5, 1]",5,4,,"[To solve this problem, let's break it down st..."
192,recPSTGXK3P39yNYT,"[openai/gpt-4o, deepseek/deepseek-chat-v3-0324...","[, , , ]",Consider two electrons are in p orbital angula...,0,"[0, 0, 0, 0]","[5, 5, 5, 5]",5,5,,"[To solve this problem, we need to determine t..."
193,recIj8lR4tuDgrHou,"[deepseek/deepseek-chat-v3-0324, qwen/qwen3-32...","[, , , ]",Four identical spin-1/2 particles are placed i...,"10E, 15E, 18E\n","[Ground state: 10E, First excited state: 15E, ...","[5, 1, 1, 1]",4,5,,"[To solve this problem, we need to determine t..."
195,recINSFEYLCyyd08m,"[meta-llama/llama-4-maverick, qwen/qwen3-32b, ...","[, , , ]",A benzene ring has a carboxylic acid a carbald...,3-cyano-6-(dimethylamino)-5-formyl-2-hydroxy-4...,[5-Cyano-2-dimethylamino-3-formyl-6-hydroxy-4-...,"[1, 5, 1, 1]",4,4,,"[To solve this, let's break down the informati..."
