In [1]:
import json
from http import HTTPStatus
import dashscope
from dotenv import dotenv_values
from retry import retry
# Load key/value settings from the local .env file into a dict-like object;
# the API key is looked up in it under 'qwen_key'.
config = dotenv_values('.env')

In [2]:
# Name of the DashScope generation model used for every request.
MODEL_NAME = 'qwen-max'
# The key must be a plain string. A trailing comma after the expression would
# silently turn it into a 1-tuple and break authentication.
dashscope.api_key = config['qwen_key']

@retry(delay=60, tries=3)
def call_qwen_api(MODEL_NAME, query):
    """Send one user message to a DashScope generation model and return the reply text.

    The call is wrapped by ``retry(delay=60, tries=3)``, so a raised exception
    triggers another attempt according to that decorator's settings.

    Args:
        MODEL_NAME: Model identifier passed as the first argument to
            ``dashscope.Generation.call``.
        query: Text sent as the content of the single ``user`` message.

    Returns:
        The ``content`` of the first choice's message in the response.

    Raises:
        RuntimeError: If the response status is not HTTP 200. The message
            carries the request id, status code, error code and error message
            so that retries and callers can report what went wrong.
    """
    messages = [{'role': 'user', 'content': query}]
    response = dashscope.Generation.call(
        MODEL_NAME,
        messages=messages,
        result_format='message',  # ask for the result in message format
    )
    if response.status_code != HTTPStatus.OK:
        error_detail = 'Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            response.request_id, response.status_code,
            response.code, response.message
        )
        print(error_detail)
        # Raise with the details attached; an empty exception would leave the
        # caller's error log blank.
        raise RuntimeError(error_detail)
    return response['output']['choices'][0]['message']['content']

In [3]:
def get_prompt(problem, question, options):
    """Build the single-choice logical-reasoning prompt for one question.

    Args:
        problem: Problem statement inserted under the problem heading.
        question: Question text inserted under the question heading.
        options: Iterable of option texts; each is rendered on its own line
            as "<letter>. <text>", lettered from 'A' in iteration order.

    Returns:
        The complete prompt string, which instructs the model to reason step
        by step and put the answer on the last line.
    """
    # Compute the label from the index instead of indexing a fixed 7-letter
    # string, so a question with more than seven options does not raise
    # IndexError.
    option_lines = '\n'.join(
        f"{chr(ord('A') + i)}. {o}" for i, o in enumerate(options)
    )

    prompt = f"""你是一个逻辑推理专家，擅长解决逻辑推理问题。以下是一个逻辑推理的题目，形式为单项选择题。所有的问题都是（close-world assumption）闭世界假设，即未观测事实都为假。请逐步分析问题并在最后一行输出答案，最后一行的格式为"答案是：A"。题目如下：

### 题目:
{problem}

### 问题:
{question}
{option_lines}
"""

    return prompt

In [4]:
def get_question_prompt(data):
    """Build the prompt asking the model to add three more sub-questions to a sample.

    Args:
        data: The sample to show to the model. A string is inserted verbatim;
            any other value is serialised with ``json.dumps`` (non-ASCII kept
            as-is) so the prompt really contains JSON, as its wording states,
            rather than a Python repr with single quotes. Values that cannot
            be serialised fall back to ``str(data)``.

    Returns:
        The complete prompt string, including the expected output skeleton.
    """
    if isinstance(data, str):
        payload = data
    else:
        try:
            payload = json.dumps(data, ensure_ascii=False)
        except (TypeError, ValueError):
            # Not JSON-serialisable: fall back to the plain string form.
            payload = str(data)

    prompt = f"""你是一个逻辑推理问题出题专家，以下JSON是一个problem，对应一些子问题。除了这些子问题外再生成三个其他子问题，保持格式一致并给出参考答案。answer格式为子母序号："A"
### promble如下：
{payload}
### 输出格式如下:
{{
problem:
questions:[
question:
options:[]
answer:
]
id:
}}
"""
    return prompt


In [5]:
def read_file(ifn):
    """Read a JSON Lines file and return its records as a list.

    Args:
        ifn: Path of the input file; each non-blank line must hold one JSON
            document.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(ifn, encoding='utf-8') as reader:
        for line in reader:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            data.append(json.loads(line))

    return data

In [16]:
def write_jsonl(results, filename):
    """Write each entry of `results` as one JSON line to data/<filename>.jsonl.

    The file is written as UTF-8 and non-ASCII characters are kept unescaped.
    """
    target_path = f'data/{filename}.jsonl'
    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in results
        )
            
def write_json(data, filename):
    """Serialise `data` as a single JSON document into data/<filename>.json.

    The file is written as UTF-8 and non-ASCII characters are kept unescaped.
    """
    target_path = f'data/{filename}.json'
    serialised = json.dumps(data, ensure_ascii=False)
    with open(target_path, 'w', encoding='utf-8') as sink:
        sink.write(serialised)

In [7]:
import re


def extract_json(response):
    """Extract the JSON payload from a model reply that may wrap it in a Markdown fence.

    Lookup order:
      1. the first fenced block tagged ``json``;
      2. otherwise the first fenced block of any kind (no tag, or a tag in
         another case such as ``JSON``);
      3. otherwise the reply itself, unchanged.

    Args:
        response: Raw text returned by the model.

    Returns:
        The stripped contents of the matching fenced block, or `response`
        as-is when no fenced block is present.
    """
    # Preferred form: an explicit ```json fence.
    match = re.search(r'```json\s*(.*?)\s*```', response, re.DOTALL)
    if match is None:
        # Fallback: any fence. A language tag is only skipped when it sits
        # alone on the fence line, so single-line fences keep their content.
        match = re.search(r'```(?:[A-Za-z]+\n)?\s*(.*?)\s*```', response, re.DOTALL)
    if match:
        return match.group(1).strip()
    # No fenced block found: hand back the original reply.
    return response


def process_data(data):
    """Ask the model for extra sub-questions for one sample and parse the reply.

    Returns the parsed JSON reply on success. If the API call or the parsing
    raises, the error is printed and the original `data` is returned unchanged.
    """
    prompt_text = get_question_prompt(data)
    try:
        # Pull the JSON payload out of the raw reply, then decode it.
        reply_text = extract_json(call_qwen_api(MODEL_NAME, prompt_text))
        return json.loads(reply_text)
    except Exception as err:
        print(f"Error: {str(err)}")
    return data

In [8]:
# Load the training samples (one JSON object per line) into a list.
file = read_file('data/round1_train_data.jsonl')

In [None]:
from tqdm import tqdm
import concurrent.futures
#file = file[:5]

# 并行批量处理
def batch_process_questions(file, max_workers=5):
    """Run `process_data` over every sample concurrently with a progress bar.

    Args:
        file: Iterable of samples; each one is submitted as a separate task.
        max_workers: Size of the thread pool.

    Returns:
        A list of `process_data` results in the same order as the input
        samples, so ``results[i]`` corresponds to the i-th sample.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_data, data) for data in file]
        # Advance the progress bar as tasks finish, in whatever order that is.
        # len(futures) is used for the total so `file` need not support len().
        for _ in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            pass
        # Collect in submission order; as_completed yields in completion
        # order, which would make the output order nondeterministic.
        results = [future.result() for future in futures]
    return results

# Process every loaded sample, then save the collected results as a single
# JSON document (data/output.json).
results = batch_process_questions(file)

write_json(results,"output")

In [17]:
# Save the same results in JSON Lines form (data/output.jsonl).
write_jsonl(results,"output")

In [None]:
import random

def generate_option_and_answer(correct_answer):
    """Build a shuffled 4-option multiple-choice set around a numeric answer.

    Distractors are drawn from a range chosen by the size of the answer:
    1-9 for answers below 10, 10-99 for answers below 100, and 100-999 for
    everything else.

    Args:
        correct_answer: The numeric correct answer to embed in the options.

    Returns:
        A dict with ``"options"`` (four distinct values in random order,
        including `correct_answer`) and ``"answer"`` (the letter 'A'-'D'
        marking where `correct_answer` sits in ``options``).
    """
    # Pick the distractor range; range_end is exclusive.
    if correct_answer < 10:
        range_start, range_end = 1, 10
    elif correct_answer < 100:
        range_start, range_end = 10, 100
    else:
        range_start, range_end = 100, 1000

    # Start from the correct answer and add distinct distractors until there
    # are four options.
    options = [correct_answer]
    while len(options) < 4:
        # randrange excludes range_end. randint would include it and could
        # yield 10, 100 or 1000, which falls outside the intended digit range.
        option = random.randrange(range_start, range_end)
        if option not in options:
            options.append(option)
    random.shuffle(options)

    # Map the 0-based position of the correct answer to a letter: 0 -> 'A'.
    answer_index = options.index(correct_answer)
    answer = chr(ord('A') + answer_index)

    return {
        "options": options,
        "answer": answer
    }

# Example usage: build an option set around the answer 42, then print the
# shuffled options and the letter of the correct one.
correct_answer = 42
result = generate_option_and_answer(correct_answer)
print("Options:", result["options"])
print("Answer:", result["answer"])


In [None]:
# Download the dataset
from modelscope.msdatasets import MsDataset
ds =  MsDataset.load('modelscope/gsm8k', subset_name='main', split='train')
# Configure subset_name and split as needed; see the "Quick Start" sample code.

In [3]:
def gsm8k_to_problem(data):
    """Return an options/answer dict for a sample (unfinished stub).

    NOTE(review): the body is not implemented yet. `options` and `answer` are
    never assigned inside this function, so calling it raises NameError unless
    globals with those names happen to exist. `data` is currently unused.
    Presumably this is meant to turn a GSM8K record into a multiple-choice
    item; confirm the intended behaviour before relying on it.
    """
    
    
    
    
    return {
        "options": options,
        "answer": answer
    }

7473

In [6]:
# Split the first record's question text on '.' into a list of fragments.
sentance = ds[0]['question'].split('.')


['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May',
 ' How many clips did Natalia sell altogether in April and May?']