In [1]:
from dataclasses import dataclass, field

In [2]:
@dataclass
class QuestionItem:
    question: str
    options: list[str]
    reasoning: str | None = field(default=None)
    answer: str | None = field(default=None)

@dataclass
class Entry:
    """A problem statement together with the questions asked about it.

    Attributes:
        problem: Text of the problem; defaults to the empty string.
        questions: QuestionItem objects for this problem; each instance gets
            its own fresh empty list by default.
    """

    problem: str = ""
    questions: list[QuestionItem] = field(default_factory=list)

In [3]:
import pickle
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
# Open the file in a context manager so the handle is closed once loading
# finishes instead of being left to the garbage collector.
with open("./entries_false.pkl", "rb") as entries_file:
    entries = pickle.load(entries_file)

In [4]:
import dotenv
# Load variables from the .env file one directory up into the environment;
# the call's return value is displayed as the cell output.
dotenv.load_dotenv(dotenv_path="../.env")

True

In [5]:
import re
# Keep only the questions whose reasoning text states the same answer letter
# as the stored answer, then keep only the entries that still have questions.
answer_regex = re.compile(r"答案.*?([A-Z])")
export_entries = []
for entry in entries:
    kept_questions = []
    for item in entry.questions:
        reasoning = item.reasoning
        if reasoning is None:
            continue
        if reasoning.startswith("%"):
            # Reasoning that starts with "%" is printed for inspection and skipped.
            print(item)
            continue
        found = answer_regex.findall(reasoning)
        if not found:
            # The reasoning never states an answer letter.
            continue
        # Use the last match: the first capital letter that follows an
        # occurrence of "答案" on the same line.
        stated_answer = found[-1]
        if stated_answer != item.answer:
            # Stated answer disagrees with the stored one; skip silently.
            continue
        kept_questions.append(
            QuestionItem(
                question=item.question,
                options=item.options,
                reasoning=reasoning,
                answer=stated_answer,
            )
        )
    if kept_questions:
        export_entries.append(Entry(problem=entry.problem, questions=kept_questions))

In [6]:
# Deep copy: rebuild every Entry and QuestionItem and copy each options list,
# so that translating the copies in place cannot modify the objects held by
# export_entries. The strings themselves are immutable and can be shared.
export_entries_translated = [
    Entry(
        problem=entry.problem,
        questions=[
            QuestionItem(
                question=question.question,
                # Copy the list itself; passing it through would alias the
                # original question's options.
                options=list(question.options),
                reasoning=question.reasoning,
                answer=question.answer,
            )
            for question in entry.questions
        ],
    )
    for entry in export_entries
]

In [7]:
# Drop every entry whose problem text mentions "翻译". The list is rebuilt
# instead of calling remove() inside a loop over the same list: removing
# during iteration shifts the remaining items, so the element right after
# each removed one is never examined.
export_entries_translated = [
    entry for entry in export_entries_translated if "翻译" not in entry.problem
]

In [8]:
# Display the number of entries currently in export_entries_translated.
len(export_entries_translated)

416

In [9]:
from tqdm.notebook import tqdm

In [10]:
def translate(text: list[str], trans_back=False) -> list[str]:
    """Translate a batch of strings with Tencent Cloud's TextTranslateBatch API.

    Credentials are read from the TENCENT_SECRET_ID and TENCENT_SECRET_KEY
    environment variables, and a new client is created on every call.

    Args:
        text: Strings to translate; all of them are sent in one request.
        trans_back: When False, translate from "zh" to "ja"; when True,
            translate from "ja" back to "zh".

    Returns:
        The "TargetTextList" field of the API response. An empty ``text``
        yields an empty list without contacting the service.
    """
    if not text:
        # Nothing to translate: skip client setup and the network round trip.
        return []

    import json
    import os

    from tencentcloud.common import credential
    from tencentcloud.common.profile.client_profile import ClientProfile
    from tencentcloud.common.profile.http_profile import HttpProfile
    from tencentcloud.tmt.v20180321 import models, tmt_client

    cred = credential.Credential(
        os.getenv("TENCENT_SECRET_ID"),
        os.getenv("TENCENT_SECRET_KEY"),
    )
    http_profile = HttpProfile()
    http_profile.endpoint = "tmt.tencentcloudapi.com"
    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile
    client = tmt_client.TmtClient(cred, "ap-beijing", client_profile)

    # The language pair is swapped for the return trip.
    source, target = ("ja", "zh") if trans_back else ("zh", "ja")
    request = models.TextTranslateBatchRequest()
    request.from_json_string(json.dumps({
        "Source": source,
        "Target": target,
        "ProjectId": 0,
        "SourceTextList": text,
    }))
    response = client.TextTranslateBatch(request)
    return json.loads(response.to_json_string())["TargetTextList"]
    

In [11]:
def translate_entries_inplace(entries: list[Entry], trans_back=False):
    """Translate the text fields of every entry, overwriting them in place.

    Four batch requests are issued through ``translate``, in this order:
    question texts, reasoning texts, problem statements, and options.

    Args:
        entries: Entries to modify; their ``problem`` and each question's
            ``question``, ``reasoning`` and ``options`` items are replaced.
        trans_back: Passed through to ``translate`` to choose the direction.

    Returns:
        None; the entries are mutated.
    """
    # Flatten once so every batch shares the same question ordering.
    all_questions = [item for entry in entries for item in entry.questions]

    translated_questions = translate(
        [item.question for item in all_questions], trans_back
    )
    translated_reasonings = translate(
        [item.reasoning for item in all_questions], trans_back
    )
    for idx, item in enumerate(all_questions):
        item.question = translated_questions[idx]
        item.reasoning = translated_reasonings[idx]

    translated_problems = translate(
        [entry.problem for entry in entries], trans_back
    )
    for idx, entry in enumerate(entries):
        entry.problem = translated_problems[idx]

    translated_options = translate(
        [option for item in all_questions for option in item.options],
        trans_back,
    )
    # Walk the flat result with a running cursor and write each translation
    # back into its original slot of the existing options list.
    cursor = 0
    for item in all_questions:
        for slot in range(len(item.options)):
            item.options[slot] = translated_options[cursor]
            cursor += 1

In [12]:
# Optional smoke test: uncomment the next line to process only the first two entries.
# export_entries_translated = export_entries_translated[:2]

In [13]:
# Round-trip translation in small batches: each group of entries is
# translated forward and then translated back, overwriting its text in place.
batch = 2
for start in tqdm(range(0, len(export_entries_translated), batch)):
    # Slicing stops at the end of the list, so the last batch may be shorter.
    # The slice is a new list holding the same Entry objects.
    chunk = export_entries_translated[start:start + batch]
    translate_entries_inplace(chunk)
    translate_entries_inplace(chunk, True)

  0%|          | 0/208 [00:00<?, ?it/s]

In [14]:
# Preview the first two entries to spot-check the translated text.
export_entries_translated[:2]

[Entry(problem='我们有一个方程，系数是A，B和C。以下是一些已知的规则。\n\n1.若B ^2 - 4AC = 0，则方程有实数解。\n2.当B ^2 - 4AC> 0时，方程有两个不同的实数解。\n3.若B ^2 - 4AC < 0，则方程有两个不同的复解。\n\n按照上面的规则，回答下面的选择问题：', questions=[QuestionItem(question='备选办法2：\n当系数A=7，B=9，C=5时，方程的解是6。', options=['是的，长官。', '不，不。'], reasoning='首先，可以看出公式为\\(B^2-4AC=0\\)。这意味着如果B^2-4AC=0，则AC=0。这意味着\\(B^2=4AC\\)，这意味着\\(A=4\\)。\n\n然后可知，系数A=7，B=9，C=5。因此，可以使用上述规则找到相应的方程组。\n\n-第二个方程：\n\\[B^2-4AC>0\\]\n\n因为B^2=4AC为人所知，所以将\\(A=4\\)置换为\\(A=4\\)来解这个联立方程式。\n\\[B^2-4\\cdot4\\cdot9>0\\]\n\\[B^2-36>0\\]\n\\[B^2>36\\]\n\n-第三个方程：\n\\[B^2-4\\cdot4\\cdot5>0\\]\n\\[B^2-16\\cdot5>0\\]\n\\[B^2>80\\]\n\n以上，所有的方程式，第2的条件\\(B^2-4\\cdot4\\cdot9>0\\)满足，第3的方程式\\(B^2-4\\cdot4\\cdot5>0\\)对应。这意味着在满足所有条件的情况下，B^2的值大于9，因此联立方程也有两个不同的实数解。\n\n因此，选项B：“否”不正确。正确答案是A：是的。', answer='A')]),
 Entry(problem='有一组程序可以用来计算两个数字的最小公倍数LCM。这个程序使用的是帮助的最大公数GCD。具体的程序包括以下内容。\n\n1.如果两个数字相等，则GCD将是其中一个数字。\n2.如果第一个数字小于第二个数字，则从第二个数字中减去第一个数字，然后对新结果递归计算GCD。\n3.如果第一个数字大于第二个数字，则交换这两个数字，并递归计算GCD。\n\n根据GCD的计算，我们可以用下面的公式计算两个数字的LC

In [15]:
# Write the entries to disk. The context manager guarantees the file is
# flushed and closed when the dump finishes, even if pickling raises.
with open("./entries_aug_false.pkl", "wb") as output_file:
    pickle.dump(export_entries_translated, output_file)