In [None]:
import os
import json
import jsonlines
from collections import defaultdict
import time
import random
random.seed(2025)

In [None]:
# Wall-clock start for the whole notebook; the final cell subtracts this from
# its own timestamp to report the total elapsed time.
all_start = time.time()

In [None]:
# CommonsenseQA -- task 1: binary judgement.
# Each sample presents a question together with ONE candidate answer; the
# expected model output is "Yes" or "No".
out_name = 'sub_task'
# Instruction template; placeholders: {question} (question stem) and {answer}
# (candidate answer text). The template ends with a trailing newline.
inp_prompt = (
    'You are currently a senior expert in commonsense true-or-false questions.\n'
    'Your task is to determine whether a given question and candidate answer are correct.\n'
    'The output format of the task is: Yes or No.\n'
    'Given question: "{question}"\n'
    'Candidate answer: "{answer}"\n'
)
# Output template. NOTE(review): not referenced by the generation cell for this
# task, which writes the literal 'Yes'/'No' -- confirm whether it is still needed.
oup_prompt = '{answer_text}'

In [None]:
# Input/output file names, paired by position: the i-th raw split file is
# converted into the i-th output file.
data_file_list = ['{}_rand_split.jsonl'.format(split) for split in ('train', 'dev')]
out_file_list = ['{}.json'.format(split) for split in ('train', 'dev')]

In [None]:
# Raw CommonsenseQA splits are read from read_dir; this task's converted files
# are written to ./CommonsenseQA/<out_name>/.
read_dir = './ori/CommonsenseQA/'
out_dir = os.path.join('./CommonsenseQA', out_name)
# exist_ok=True creates the directory (and any missing parents) only when
# needed, so the cell can be re-run safely and there is no window between an
# existence check and the creation.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Build the "sub_task" dataset: one Yes/No judgement sample per
# (question, candidate answer) pair.
#   * the candidate whose label equals answerKey is always written ('Yes');
#   * every other candidate is written ('No') only with probability
#     neg_keep_prob, i.e. the negatives are randomly down-sampled.
#
# Re-seed with the same value the notebook uses at the top. On a fresh
# "Restart & Run All" no random draw happens before this cell, so the output is
# unchanged; re-running this cell on its own now reproduces the same negative
# sample instead of drawing a different one each time.
random.seed(2025)
neg_keep_prob = 0.25

start = time.time()
print('out_name:{}'.format(out_name))
for data_file, out_file in zip(data_file_list, out_file_list):
    read_path = os.path.join(read_dir, data_file)
    out_path = os.path.join(out_dir, out_file)
    # Read the whole split before opening the output file.
    with jsonlines.open(read_path, 'r') as f:
        datas = list(f)
    with jsonlines.open(out_path, 'w') as fw:
        for data in datas:
            ans = data['answerKey']
            question_info = data['question']
            question = question_info['stem']
            for choice in question_info['choices']:
                is_correct = choice['label'] == ans
                # Short-circuit: only wrong candidates consume a random draw,
                # exactly one each, in choice order.
                if not is_correct and random.random() > neg_keep_prob:
                    continue
                fw.write({
                    'instruction': inp_prompt.format(question=question, answer=choice['text']),
                    'input': '',
                    'output': 'Yes' if is_correct else 'No',
                })
end = time.time()
print('cost:{}秒'.format(round(end-start, 2)))

In [None]:
# CommonsenseQA -- task 2: multiple choice over all five options.
# Each sample presents a question with its lettered options; the expected model
# output is the letter of the correct option.
out_name = 'option_task'
# Instruction template; placeholders: {question} (question stem) and {option}
# (the rendered option list). The template ends with a trailing newline.
inp_prompt = (
    'You are currently a senior expert in commonsense Q&A.\n'
    'Your task is to choose the correct answer option based on the given question and five options. '
    'The input format of option is: "option number. option content".\n'
    'The output format of the task is: correct option number.\n'
    'Given question: "{question}"\n'
    'Given options: {option}\n'
)
# Output template. NOTE(review): not referenced by the generation cell for this
# task, which writes answerKey directly -- confirm whether it is still needed.
oup_prompt = '{answer_text}'

In [None]:
# Input/output file names, paired by position: the i-th raw split file is
# converted into the i-th output file.
data_file_list = ['{}_rand_split.jsonl'.format(split) for split in ('train', 'dev')]
out_file_list = ['{}.json'.format(split) for split in ('train', 'dev')]

In [None]:
# Raw CommonsenseQA splits are read from read_dir; this task's converted files
# are written to ./CommonsenseQA/<out_name>/.
read_dir = './ori/CommonsenseQA/'
out_dir = os.path.join('./CommonsenseQA', out_name)
# exist_ok=True creates the directory (and any missing parents) only when
# needed, so the cell can be re-run safely and there is no window between an
# existence check and the creation.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Build the "option_task" dataset: one multiple-choice sample per question,
# listing every choice as "<label>. <text>;" separated by tabs, with the
# answerKey letter as the expected output.
start = time.time()
print('out_name:{}'.format(out_name))
for src_name, dst_name in zip(data_file_list, out_file_list):
    src_path = os.path.join(read_dir, src_name)
    dst_path = os.path.join(out_dir, dst_name)
    with jsonlines.open(src_path, 'r') as reader:
        records = list(reader)
    with jsonlines.open(dst_path, 'w') as writer:
        for record in records:
            answer_key = record['answerKey']
            body = record['question']
            stem = body['stem']
            rendered = []
            for idx, choice in enumerate(body['choices']):
                # Choices are expected to be labelled A, B, C, ... in order.
                assert chr(ord('A') + idx) == choice['label']
                rendered.append('{}. {};\t'.format(choice['label'], choice['text']))
            # Concatenate, then strip the surrounding whitespace (drops the
            # trailing tab left by the last option).
            option_line = ''.join(rendered).strip()
            writer.write({
                'instruction': inp_prompt.format(question=stem, option=option_line),
                'input': '',
                'output': answer_key,
            })
end = time.time()
print('cost:{}秒'.format(round(end-start, 2)))

In [None]:
# CommonsenseQA -- task 3: multiple choice over a reduced option set.
# Same as the five-option task, but the number of options shown varies; it is
# spelled out in the prompt through the {op_nums} placeholder.
out_name = 'option_task_fewer'
# Instruction template; placeholders: {op_nums} (option count as a word),
# {question} (question stem) and {option} (the rendered option list).
# The template ends with a trailing newline.
inp_prompt = (
    'You are currently a senior expert in commonsense Q&A.\n'
    'Your task is to choose the correct answer option based on the given question and {op_nums} options. '
    'The input format of option is: "option number. option content".\n'
    'The output format of the task is: correct option number.\n'
    'Given question: "{question}"\n'
    'Given options: {option}\n'
)
# Output template. NOTE(review): not referenced by the generation cell for this
# task, which writes the re-lettered answer directly -- confirm whether it is
# still needed.
oup_prompt = '{answer_text}'

In [None]:
# Input/output file names, paired by position: the i-th raw split file is
# converted into the i-th output file.
data_file_list = ['{}_rand_split.jsonl'.format(split) for split in ('train', 'dev')]
out_file_list = ['{}.json'.format(split) for split in ('train', 'dev')]

In [None]:
# Raw CommonsenseQA splits are read from read_dir; this task's converted files
# are written to ./CommonsenseQA/<out_name>/.
read_dir = './ori/CommonsenseQA/'
out_dir = os.path.join('./CommonsenseQA', out_name)
# exist_ok=True creates the directory (and any missing parents) only when
# needed, so the cell can be re-run safely and there is no window between an
# existence check and the creation.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Build the "option_task_fewer" dataset: multiple-choice samples that show
# only 2, 3, 4 or 5 options. Each split is cut into len(op_nums) consecutive
# chunks; chunk k keeps the correct answer plus (k + 1) randomly chosen wrong
# answers, shuffles them and re-letters them A, B, C, ...
#
# NOTE: this cell draws from the module-level `random` generator, so its exact
# output depends on how many draws were made since the generator was seeded
# (the sub_task cell also draws from it). Use Restart & Run All to reproduce a
# given output.
op_nums = ['two', 'three', 'four', 'five']  # word used in the prompt for 2..5 options
start = time.time()
print('out_name:{}'.format(out_name))
for data_file, out_file in zip(data_file_list, out_file_list):
    read_path = os.path.join(read_dir, data_file)
    out_path = os.path.join(out_dir, out_file)

    with jsonlines.open(read_path, 'r') as f:
        datas = list(f)

    # One chunk per entry of op_nums; the last chunk also takes the remainder
    # when the split size is not an exact multiple.
    n_chunks = len(op_nums)
    chunk_size = len(datas) // n_chunks
    data_chunks = [datas[k * chunk_size:(k + 1) * chunk_size] for k in range(n_chunks - 1)]
    data_chunks.append(datas[(n_chunks - 1) * chunk_size:])

    with jsonlines.open(out_path, 'w') as fw:
        for chunk_idx, chunk in enumerate(data_chunks):
            n_options = chunk_idx + 2  # 2, 3, 4, 5 options
            op_num = op_nums[chunk_idx]

            for data in chunk:
                ans = data['answerKey']
                question_info = data['question']
                question = question_info['stem']

                options = []
                for i, choice in enumerate(question_info['choices']):
                    # Choices are expected to be labelled A, B, C, ... in order.
                    assert chr(ord('A') + i) == choice['label']
                    options.append((choice['label'], choice['text']))

                correct_options = [opt for opt in options if opt[0] == ans]
                incorrect_options = [opt for opt in options if opt[0] != ans]
                # Fail with a clear message instead of letting a missing
                # correct choice slip into the option list as None.
                if not correct_options:
                    raise ValueError(
                        'answerKey {!r} matches no choice for question {!r}'.format(ans, question)
                    )
                if len(incorrect_options) < n_options - 1:
                    raise ValueError(
                        'need {} wrong choices but only {} available for question {!r}'.format(
                            n_options - 1, len(incorrect_options), question
                        )
                    )
                correct_choice = correct_options[-1]

                # Draw order matters for reproducibility: sample the wrong
                # options first, then shuffle the combined list.
                selected_incorrect = random.sample(incorrect_options, n_options - 1)
                selected_options = selected_incorrect + [correct_choice]
                random.shuffle(selected_options)

                # Re-letter the shuffled options and remember which new letter
                # now carries the correct answer.
                new_correct_label = None
                option_parts = []
                for i, (label, text) in enumerate(selected_options):
                    new_label = chr(ord('A') + i)
                    if label == ans:
                        new_correct_label = new_label
                    option_parts.append('{}. {};\t'.format(new_label, text))
                option_text = ''.join(option_parts).strip()

                fw.write({
                    'instruction': inp_prompt.format(question=question, option=option_text, op_nums=op_num),
                    'input': '',
                    'output': new_correct_label,
                })
end = time.time()
print('cost:{}秒'.format(round(end-start, 2)))

In [None]:

# Total wall-clock seconds since all_start was recorded; as the last expression
# of the cell, the difference is displayed as the cell's output.
all_end = time.time()
all_end-all_start