In [None]:
import pandas as pd
import random
import json

# Load both GPQA CSV exports; the "diamond" frame is used further down
# to exclude its questions from the few-shot candidate pool.
diamond_df = pd.read_csv('../data/gpqa/gpqa_diamond.csv')
main_df = pd.read_csv('../data/gpqa/gpqa_main.csv')

In [None]:
# Few-shot candidates are the main-set rows whose question text does not
# appear (exact string match) among the diamond-set questions.
diamond_ids = set(diamond_df['Question'])
in_diamond = main_df['Question'].isin(diamond_ids)
candidates = main_df.loc[~in_diamond].reset_index(drop=True)


def make_few_shot_sets(df, seeds=(505, 1115), k=5):
    """Build one reproducible few-shot question set per seed.

    For every seed, ``k`` rows are sampled from ``df`` (without replacement,
    using the seed as ``random_state``) and each row is converted into a
    multiple-choice question whose four answers are shuffled into the
    letters A-D.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Question'``, ``'Correct Answer'`` and
        ``'Incorrect Answer 1'`` .. ``'Incorrect Answer 3'``. An
        ``'Explanation'`` column is optional.
    seeds : iterable of int
        One few-shot set is produced per seed. The same seed drives both the
        row sampling and the answer shuffling.
    k : int
        Number of questions per set.

    Returns
    -------
    list of dict
        One ``{"seed": seed, "questions": [...]}`` entry per seed, in the
        order of ``seeds``. Each question is a dict with the keys
        ``"question"``, ``"choices"``, ``"explanation"`` and
        ``"correct_answer"``.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``k`` exceeds ``len(df)``.
    KeyError
        If a required answer/question column is missing.
    """
    all_sets = []
    for seed in seeds:
        sample_df = df.sample(n=k, random_state=seed).reset_index(drop=True)

        # A private generator yields the same shuffle sequence as
        # random.seed(seed) + random.shuffle, but does not reseed the
        # process-wide RNG behind the caller's back.
        rng = random.Random(seed)

        questions = [_build_question(row, rng) for _, row in sample_df.iterrows()]
        all_sets.append({
            "seed":      seed,
            "questions": questions
        })
    return all_sets


def _build_question(row, rng):
    """Convert one sampled row into a question dict with shuffled A-D choices."""
    options = [
        (row['Incorrect Answer 1'], False),
        (row['Incorrect Answer 2'], False),
        (row['Incorrect Answer 3'], False),
        (row['Correct Answer'],    True),
    ]
    rng.shuffle(options)

    choices = {}
    correct_letter = None
    for letter, (text, is_correct) in zip('ABCD', options):
        choices[letter] = text
        if is_correct:
            correct_letter = letter

    # An empty CSV cell is read as float NaN; json.dump would emit a bare
    # `NaN` token for it, which is not valid JSON. Normalise to "" instead.
    explanation = row.get("Explanation", "")
    if pd.isna(explanation):
        explanation = ""

    return {
        "question":       row["Question"],
        "choices":        choices,
        "explanation":    explanation,
        "correct_answer": correct_letter
    }


In [None]:
# Build the few-shot sets from the non-diamond candidate pool.
few_shot_sets = make_few_shot_sets(candidates)

# Persist each set to its own JSON file in the working directory; the seed
# is encoded in the file name only, not in the file contents.
for fs in few_shot_sets:
    fname = f'few_shot_seed_{fs["seed"]}.json'
    payload = {"questions": fs["questions"]}
    with open(fname, 'w', encoding='utf-8') as fp:
        json.dump(payload, fp, ensure_ascii=False, indent=2)
    print(f'Saved {fname}')


Saved few_shot_seed_505.json
Saved few_shot_seed_1115.json
