In [28]:
from experiment_project.utils.initial.util import init_sys_env
from experiment_project.utils.files.read import read_yaml
import dspy
from dspy.teleprompt import BootstrapFewShot

from dspy.datasets.gsm8k import GSM8K, gsm8k_metric
secret_env_file = 'env_secret_config.yaml'

api_configs = read_yaml(secret_env_file)
model_config = api_configs.get('openai')
turbo = dspy.OpenAI(model=model_config.get('model'), max_tokens=1024,api_key=model_config.get('api_key'))
dspy.settings.configure(lm=turbo)


In [29]:
# 构建训练数据集
trainset = [
    {"question": "中国的首都是哪里？", "answer": "北京"},
    {"question": "《红楼梦》的作者是谁？", "answer": "曹雪芹"},
    {"question": "太阳系中最大的行星是什么？", "answer": "木星"},
    {"question": "水的沸点是多少摄氏度？", "answer": "100度"},
    {"question": "蒙娜丽莎的画家是谁？", "answer": "达芬奇"},
    {"question": "最小的质数是什么？", "answer": "2"},
    {"question": "金的化学符号是什么？", "answer": "Au"},
    {"question": "特斯拉的CEO是谁？", "answer": "埃隆·马斯克"},
]

# 构建验证数据集
devset = [
    {"question": "德国的首都是哪里？", "answer": "柏林"},
    {"question": "《1984》的作者是谁？", "answer": "乔治·奥威尔"},
    {"question": "地球上最大的海洋是什么？", "answer": "太平洋"},
    {"question": "水的冰点是多少摄氏度？", "answer": "0度"},
    {"question": "《星夜》的画家是谁？", "answer": "梵高"},
]
trainset = [dspy.Example(i).with_inputs('question') for i in trainset]
devset = [dspy.Example(i).with_inputs('question') for i in devset]

In [19]:
# # 构建训练数据集
# trainset = [
#     {
#         "question": "为什么清王朝会灭亡？",
#         "answer": "多种原因，包括内部腐败、外部侵略和农民起义。",
#         "chain_of_thought": "清王朝在19世纪面临严重的内部腐败问题，官员贪污腐败导致国家治理能力下降。同时，外部列强的侵略使得清政府签订了一系列不平等条约，国力进一步衰弱。最后，农民起义如太平天国运动进一步动摇了清政府的统治基础，最终导致清王朝的灭亡。"
#     },
#     {
#         "question": "为什么二战中德国没有胜利？",
#         "answer": "多种原因，包括战略错误、资源不足和盟军的强大反攻。",
#         "chain_of_thought": "德国在二战中犯了多次战略错误，如进攻苏联导致两线作战，严重消耗了德国的军力和资源。同时，德国的资源不足以支撑长期战争，特别是在盟军的海上封锁下，物资供应极为困难。最后，盟军的强大反攻，如诺曼底登陆和苏联的东线反攻，彻底击溃了德国的抵抗力量，导致德国的失败。"
#     },
#     {
#         "question": "美国为什么能够赢得独立战争？",
#         "answer": "多种原因，包括法国的支持、英国的战略失误和美国的顽强抵抗。",
#         "chain_of_thought": "美国在独立战争中得到了法国的军事和经济支持，大大增强了其战斗力。英国在战争中的战略失误，如未能有效控制南方战场，导致其处于被动局面。同时，美国人民的顽强抵抗和对自由的渴望，使得他们在面对强敌时依然坚持不懈，最终赢得了独立战争的胜利。"
#     }
# ]
# 
# # 构建验证数据集
# devset = [
#     {
#         "question": "为什么法国大革命会爆发？",
#         "answer": "多种原因，包括经济危机、社会不平等和启蒙思想的传播。",
#         "chain_of_thought": "法国大革命的爆发有多种原因。首先，法国在18世纪末期面临严重的经济危机，政府财政赤字巨大，国家债务高企。其次，社会不平等现象严重，第三等级（包括农民和市民）承担了沉重的税负，而贵族和教士享有特权。最后，启蒙思想的传播激发了人民对自由和平等的向往，促使他们起来反抗旧制度，最终引发了法国大革命。"
#     },
#     {
#         "question": "为什么罗马帝国会衰亡？",
#         "answer": "多种原因，包括内部腐败、外部入侵和经济衰退。",
#         "chain_of_thought": "罗马帝国的衰亡有多种原因。内部腐败是一个重要因素，官员的贪污腐败削弱了国家的治理能力。外部入侵，如日耳曼部落的入侵，加速了罗马帝国的崩溃。经济衰退也是一个重要原因，随着领土的扩张，维护庞大帝国的成本不断增加，而经济发展却停滞不前，最终导致罗马帝国的衰亡。"
#     }
# ]
# 
# trainset = [dspy.Example(i).with_inputs('question') for i in trainset]
# devset = [dspy.Example(i).with_inputs('question') for i in devset]


In [30]:
trainset[0]

Example({'question': '中国的首都是哪里？', 'answer': '北京'}) (input_keys={'question'})

In [31]:
from dspy.evaluate import Evaluate

def simple_metric(example, pred,*args,**kwargs):
    return example['answer'].lower() == pred['answer'].lower()


evaluate = Evaluate(devset=devset, metric=simple_metric, num_threads=8, display_progress=True, display_table=False)

In [32]:
# 环境设置完成后，让我们定义一个自定义程序，利用 ChainOfThought 模块执行逐步推理以生成答案：

class CoT(dspy.Module):
    def __init__(self):
        super().__init__()
        self.prog = dspy.ChainOfThought("question -> answer")
    
    def forward(self, question):
        return self.prog(question=question)


In [38]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot

# 使用 BootstrapFewShot 进行优化
config = dict(max_bootstrapped_demos=4, max_labeled_demos=4)
teleprompter = BootstrapFewShot(metric=simple_metric, **config)

# 为每个训练样本设置输入值

# 使用设置了输入值的训练集进行优化
optimized_cot = teleprompter.compile(CoT(), trainset=trainset)
optimized_cot.save('optimized_cot.json')



 62%|██████▎   | 5/8 [00:00<00:00, 3834.62it/s]


In [39]:
evaluate(optimized_cot)

Average Metric: 4 / 5  (80.0): 100%|██████████| 5/5 [00:00<00:00, 1912.41it/s]


80.0

In [40]:
question = "美国的金融中心是那个城市?"
result = optimized_cot(question=question)
print(f"问题: {question}")
print(f"最终预测答案: {result.answer}")



问题: 美国的金融中心是那个城市?
最终预测答案: 纽约市


In [43]:
turbo.history[-1]

{'prompt': 'Given the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\nAnswer: ${answer}\n\n---\n\nQuestion: 中国的首都是哪里？\nReasoning: Let\'s think step by step in order to produce the answer. We need to identify the capital city of China. The capital city is the political, cultural, and historical center of the country. The capital of China is Beijing.\nAnswer: 北京\n\n---\n\nQuestion: 《红楼梦》的作者是谁？\nReasoning: Let\'s think step by step in order to Reasoning: Let\'s think step by step in order to produce the answer. We need to identify the author of the classic Chinese novel 《红楼梦》. This novel is one of the Four Great Classical Novels of Chinese literature and was written during the Qing dynasty. The author is widely recognized as Cao Xueqin.\nAnswer: 曹雪芹\n\n---\n\nQuestion: 太阳系中最大的行星是什么？\nReasoning: Let\'s think step by step in order to produce the an

In [44]:
turbo.inspect_history(n=1)




Given the fields `question`, produce the fields `answer`.

---

Follow the following format.

Question: ${question}
Reasoning: Let's think step by step in order to ${produce the answer}. We ...
Answer: ${answer}

---

Question: 中国的首都是哪里？
Reasoning: Let's think step by step in order to produce the answer. We need to identify the capital city of China. The capital city is the political, cultural, and historical center of the country. The capital of China is Beijing.
Answer: 北京

---

Question: 《红楼梦》的作者是谁？
Reasoning: Let's think step by step in order to Reasoning: Let's think step by step in order to produce the answer. We need to identify the author of the classic Chinese novel 《红楼梦》. This novel is one of the Four Great Classical Novels of Chinese literature and was written during the Qing dynasty. The author is widely recognized as Cao Xueqin.
Answer: 曹雪芹

---

Question: 太阳系中最大的行星是什么？
Reasoning: Let's think step by step in order to produce the answer. We need to identify the largest p

'\n\n\nGiven the fields `question`, produce the fields `answer`.\n\n---\n\nFollow the following format.\n\nQuestion: ${question}\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\nAnswer: ${answer}\n\n---\n\nQuestion: 中国的首都是哪里？\nReasoning: Let\'s think step by step in order to produce the answer. We need to identify the capital city of China. The capital city is the political, cultural, and historical center of the country. The capital of China is Beijing.\nAnswer: 北京\n\n---\n\nQuestion: 《红楼梦》的作者是谁？\nReasoning: Let\'s think step by step in order to Reasoning: Let\'s think step by step in order to produce the answer. We need to identify the author of the classic Chinese novel 《红楼梦》. This novel is one of the Four Great Classical Novels of Chinese literature and was written during the Qing dynasty. The author is widely recognized as Cao Xueqin.\nAnswer: 曹雪芹\n\n---\n\nQuestion: 太阳系中最大的行星是什么？\nReasoning: Let\'s think step by step in order to produce the answer.