0 导入包

In [2]:
# Local machine only: send HTTP(S) traffic through a local proxy so that
# external hosts stay reachable.
import os
os.environ.update(
    http_proxy='http://127.0.0.1:7890',
    https_proxy='http://127.0.0.1:7890',
)

In [1]:
# 导入包
import re
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer

1 构建 输入 输出 数据 

1.1加载数据

In [None]:
# Load the GSM8K dataset ('main' configuration) from the Hugging Face Hub.
data = load_dataset(path='openai/gsm8k', name='main')

In [5]:
# Load the dataset from local parquet files (train / test splits).
data = load_dataset(
    'parquet',
    data_files=dict(
        train='./openai-gsm8k/train.parquet',
        test='./openai-gsm8k/test.parquet',
    ),
)

In [6]:
# Display the loaded splits with their columns and row counts.
data

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

1.2处理数据

In [None]:
# System prompt: asks the model to reply with a <reasoning> block followed by
# an <answer> block. It is prepended to every question as the 'system' message.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""


# 提取数据集中 #### 后的答案
def make_answer(text:str):
    """Extract the final answer that follows the ``####`` marker.

    Args:
        text: Reference solution text from the dataset's ``answer`` column.

    Returns:
        The stripped text after the last ``####`` marker, or ``None`` when the
        marker does not occur in ``text``.
    """
    # Guard on the same four-hash delimiter used for the split. Checking the
    # shorter '###' let text that only contains '###' fall through and be
    # returned whole instead of None.
    if '####' not in text:
        return None
    return text.split('####')[-1].strip()

In [7]:
# Prepare the training set: add a chat-style 'prompt' column and replace the
# 'answer' column with the final answer only.
def _to_grpo_example(example):
    """Return the columns to add/overwrite for one dataset row."""
    chat_prompt = [
        {'role': 'system', 'content': SYSTEM_PROMPT},
        {'role': 'user', 'content': example['question']},
    ]
    return {'prompt': chat_prompt, 'answer': make_answer(example['answer'])}


dataset = data['train'].map(_to_grpo_example)
dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 7473
})

1.3查看数据 的 输入 输出

In [8]:
# Print the prompt (system + user messages) of one training example.
n = 97

from pprint import pprint
# Index the row first, then the column: dataset['prompt'][n] may materialize
# the whole 'prompt' column just to read a single entry.
pprint(dataset[n]['prompt'])


[{'content': '\n'
             'Respond in the following format:\n'
             '<reasoning>\n'
             '...\n'
             '</reasoning>\n'
             '<answer>\n'
             '...\n'
             '</answer>\n',
  'role': 'system'},
 {'content': 'Nancy, the librarian, is shelving books from the cart. She '
             'shelved 12 history books, 8 romance books, and 4 poetry books '
             'from the top section of the cart. Half the books on the bottom '
             'section of the cart were mystery books, which she quickly put '
             'back into place. Then, she shelved the remaining books from the '
             'bottom of the cart, including 5 Western novels and 6 '
             'biographies. How many books did she have on the book cart when '
             'she started?',
  'role': 'user'}]


In [9]:
# Show the reference answer of example n. Row-first indexing avoids reading
# the whole 'answer' column for a single value.
dataset[n]['answer']

'46'

2 指定 打分规则

2.1 GRPO处理数据的运行逻辑

In [10]:
'''奖励函数列表的运行

huggingface的GRPO 会处理数据集

prompt 来自于数据集
[ 代表批次batch_size=1
    [
        {'role': 'system', 'content': 'SYSTEM_PROMPT内容...'},
        {'role': 'user', 'content': '小明有5个苹果，吃掉2个，还剩几个？'}
    ],
]

completions 是模型的输出
GRPO 将 prompt 输入给模型 输出 completions
[ 代表批次batch_size=1
    [
        {'content': '<reasoning>5个苹果减去2个苹果\n5-2=3</reasoning>\n<answer>3</answer>'}
    ],
]

answer 来自于数据集
[ 代表批次batch_size=1
    '3',
]

奖励规则函数要根据上面的数据作为参数，来计算出最终的奖励分数
'''


'''理论上
最好所有奖励规则 都是以try except来识别是否给奖励
否则格式不对可能有报错
这些奖励规则可以自行设定
'''

'''只要保证
奖励的方向正确即可
具体分值不太重要
'''

"奖励函数列表的运行\n\nhuggingface的GRPO 会处理数据集\n\nprompt 来自于数据集\n[ 代表批次batch_size=1\n    [\n        {'role': 'system', 'content': 'SYSTEM_PROMPT内容...'},\n        {'role': 'user', 'content': '小明有5个苹果，吃掉2个，还剩几个？'}\n    ],\n]\n\ncompletions 是模型的输出\nGRPO 将 prompt 输入给模型 输出 completions\n[ 代表批次batch_size=1\n    [\n        {'content': '<reasoning>5个苹果减去2个苹果\n5-2=3</reasoning>\n<answer>3</answer>'}\n    ],\n]\n\nanswer 来自于数据集\n[ 代表批次batch_size=1\n    '3',\n]\n"

2.2奖励规则

In [11]:
# 从输出格式中提取答案
'''
输入：
<reasoning>
这是推理过程。
</reasoning>
<answer>
这是答案。
</answer>

输出：
这是答案。
'''
def extract_xml_answer(text:str):
    """Return the text enclosed by the first ``<answer>`` ... ``</answer>`` pair.

    Args:
        text: Raw model output.

    Returns:
        The stripped content found after the first ``<answer>`` and before the
        next ``</answer>``. An empty string is returned when either tag is
        missing or when any error occurs while processing ``text``.
    """
    open_tag, close_tag = '<answer>', '</answer>'
    try:
        if open_tag in text and close_tag in text:
            # Segment that follows the first opening tag ...
            after_open = text.split(open_tag)[1]
            # ... cut at the first closing tag, without surrounding whitespace.
            return after_open.split(close_tag)[0].strip()
        # At least one tag is absent: nothing to extract.
        return ""
    except Exception:
        print(f"Error extracting answer from text: {text}")
        return ""

In [12]:
# ——————————设定 打分 规则——————————
'''打分规则只需要确保优化的方向正确即可，即正负号正确，量化并不需要非常精准'''





''' 检查正确性
responses = ['<reasoning>5个苹果减去2个苹果\n5-2=3</reasoning>\n<answer>3</answer>'] # 输出的批量
extracted_responses = ['3'] # 提取答案的批量

如果 提取答案 = 答案 则 打分 = 2.0
否则 打分 = 0.0
返回 [2.0] 一个列表
'''
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Score 2.0 for each completion whose extracted answer equals the reference.

    Args:
        prompts: Batch of prompts (not used by this function).
        completions: Batch of completions; the text is read from
            ``completion[0]['content']`` of each entry.
        answer: Batch of reference answers, paired with ``completions`` by position.
        **kwargs: Extra arguments, ignored.

    Returns:
        One score per (extracted answer, reference) pair: 2.0 when the extracted
        answer is non-empty and equal to the reference, otherwise 0.0. If any
        exception is raised, a list of 0.0 with ``len(completions)`` entries.
    """
    try:
        # Raw text of every completion in the batch.
        responses = []
        for completion in completions:
            responses.append(completion[0]['content'])

        # Answer extracted from each response ("" when no answer tags are found).
        extracted_responses = []
        for response in responses:
            extracted_responses.append(extract_xml_answer(response))

        # Debug output for the first item of the batch.
        if responses:
            print('-'*20)
            print(f"Response[0]:\n{responses[0]}")
            print(f"Extracted[0]:\n{extracted_responses[0]}")
            print(f"Answer[0]:\n{answer[0]}")

        scores = []
        for predicted, expected in zip(extracted_responses, answer):
            # An empty extraction never earns the reward.
            if predicted and predicted == expected:
                scores.append(2.0)
            else:
                scores.append(0.0)
        return scores
    except Exception as e:
        print(f"Error in correctness_reward_func: {e}")
        # Fall back to an all-zero score list of matching length.
        return [0.0] * len(completions)


''' 检查 是否是 纯数字
responses = ['<reasoning>5个苹果减去2个苹果\n5-2=3</reasoning>\n<answer>3</answer>'] # 输出的批量
extracted_responses = ['3'] # 提取答案的批量

如果 提取答案 是 纯数字 则 打分 = 0.5
否则 打分 = 0.0
返回 [0.5] 一个列表
'''
def int_reward_func(completions, **kwargs) -> list[float]:
    """Score 0.5 for each completion whose extracted answer is made of digits only.

    Args:
        completions: Batch of completions; the text is read from
            ``completion[0]['content']`` of each entry.
        **kwargs: Extra arguments, ignored.

    Returns:
        One score per completion: 0.5 when the extracted answer is non-empty and
        ``str.isdigit()`` is true for it, otherwise 0.0. If any exception is
        raised, a list of 0.0 with ``len(completions)`` entries.
    """
    try:
        # Raw text of every completion in the batch.
        responses = []
        for completion in completions:
            responses.append(completion[0]['content'])

        # Answer extracted from each response ("" when no answer tags are found).
        extracted_responses = []
        for response in responses:
            extracted_responses.append(extract_xml_answer(response))

        scores = []
        for candidate in extracted_responses:
            # An empty extraction scores 0.
            if candidate and candidate.isdigit():
                scores.append(0.5)
            else:
                scores.append(0.0)
        return scores
    except Exception as e:
        print(f"Error in int_reward_func: {e}")
        # Fall back to an all-zero score list of matching length.
        return [0.0] * len(completions)

''' 检查 是否是 每个标签后都会换行，严格等于我要求的格式
responses = ['<reasoning>\n5个苹果减去2个苹果 5-2=3\n</reasoning>\n<answer>\n3\n</answer>\n'] # 输出的批量（每个标签后都换行）
matches = [True] # 匹配的批量

如果 输出 是 严格格式 则 打分 = 0.5
否则 打分 = 0.0
返回 [0.5] 一个列表
'''
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Score 0.5 for completions that follow the exact line-by-line tag layout.

    The expected layout is::

        <reasoning>
        ...
        </reasoning>
        <answer>
        ...
        </answer>

    with every tag on its own line and nothing before ``<reasoning>``.

    Args:
        completions: Batch of completions; the text is read from
            ``completion[0]["content"]`` of each entry.
        **kwargs: Extra arguments, ignored.

    Returns:
        One score per completion: 0.5 when the layout matches, otherwise 0.0.
    """
    # The newline after </answer> is optional, so a completion that stops right
    # at the closing tag is not rejected.
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n?$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets '.' cross newlines; without it, any reasoning or answer
    # that spans more than one line could never match.
    return [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]


''' 检查 是否是 每个标签后可以有不换行
responses = ['<reasoning>5个苹果减去2个苹果\n5-2=3</reasoning>\n<answer>3</answer>'] # 输出的批量
matches = [True] # 匹配的批量

如果 输出 是 软格式 则 打分 = 0.5
否则 打分 = 0.0
返回 [0.5] 一个列表
'''
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Score 0.5 for completions that start with a reasoning block followed by an answer block.

    Unlike the strict check, no newlines are required around the tags; only
    optional whitespace is allowed between ``</reasoning>`` and ``<answer>``.
    The match is anchored at the start of the completion (``re.match``), and
    text after ``</answer>`` is not inspected.

    Args:
        completions: Batch of completions; the text is read from
            ``completion[0]["content"]`` of each entry.
        **kwargs: Extra arguments, ignored.

    Returns:
        One score per completion: 0.5 when the layout matches, otherwise 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets '.' cross newlines; without it, reasoning or answers that
    # contain a line break could never match.
    return [0.5 if re.match(pattern, r, flags=re.DOTALL) else 0.0 for r in responses]



''' 检查 标签的数量
responses = ['<reasoning>5个苹果减去2个苹果\n5-2=3</reasoning>\n<answer>3</answer>'] # 输出的批量
count = [0.5] # 标签数量的批量

每个格式正确且只出现一次的标签 打分 += 0.125（四个标签全部满足时为 0.5）
</answer> 之后如果还有多余内容，按字符数扣分
返回 [0.5] 一个列表
'''
def count_xml(text) -> float:
    """Give partial credit for each well-placed tag and penalize trailing text.

    Each of the four tag forms (``<reasoning>\\n``, ``\\n</reasoning>\\n``,
    ``\\n<answer>\\n``, ``\\n</answer>``) adds 0.125 when it occurs exactly once.
    Text that follows the closing answer tag costs 0.001 per character in each
    of the two answer-tag checks.

    Args:
        text: Raw completion text.

    Returns:
        The accumulated score; 0.5 for a fully well-formed completion with no
        trailing text.
    """
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125

    if text.count("\n<answer>\n") == 1:
        count += 0.125
        # Only measure trailing text when the closing tag (plus newline) is
        # really there. Otherwise split() returns the whole completion and its
        # full length would be charged as a penalty.
        if "\n</answer>\n" in text:
            count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        # One character after the tag (the final newline) is free. Clamp at 0
        # so that a completion ending exactly at the tag gets no bonus.
        tail = text.split("\n</answer>")[-1]
        count -= max(len(tail) - 1, 0)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to every completion in the batch.

    Args:
        completions: Batch of completions; the text is read from
            ``completion[0]["content"]`` of each entry.
        **kwargs: Extra arguments, ignored.

    Returns:
        One ``count_xml`` score per completion, in batch order.
    """
    scores = []
    for completion in completions:
        scores.append(count_xml(completion[0]["content"]))
    return scores

3 训练

In [None]:
# Select the GPU to use: only the device listed here is exposed to this process.
import os
os.environ.update(CUDA_VISIBLE_DEVICES="2")

In [13]:
# Directory the model is saved to
output_dir = "outputs/Qwen-1.5B-GRPO"

# Training arguments, grouped by concern (values unchanged)
training_args = GRPOConfig(
    # run identity, logging and checkpointing
    output_dir=output_dir,
    run_name="Qwen-1.5B-GRPO-gsm8k",
    logging_steps=1,
    save_steps=100,
    log_on_each_node=False,
    report_to="none",  # Wandb reporting is disabled
    # optimizer and learning-rate schedule
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    max_grad_norm=0.1,
    # precision, batching and run length
    bf16=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    # generation settings
    num_generations=16,
    max_prompt_length=256,
    max_completion_length=200,
    # vLLM settings (use_vllm is switched off here)
    use_vllm=False,
    vllm_gpu_memory_utilization=0.3,
    vllm_device="cuda:0",
)

In [51]:
# Load the model and its tokenizer from the Hugging Face Hub
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Left as "cuda:0" on purpose: per the original note, the GPU is already
    # chosen through the environment variable set in the earlier cell.
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [14]:
# Load the model from a local directory
# (alternative checkpoint: "./qwen2.5-0.5b")
model_path = "./qwen2.5-1.5b"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,  # original note: required for Qwen models
    # Left as "cuda:0" on purpose: per the original note, the GPU is already
    # chosen through the environment variable set in the earlier cell.
    device_map="cuda:0",
    torch_dtype=torch.bfloat16,
)

# Load the matching tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="left",  # set the padding side if needed
    trust_remote_code=True,  # original note: required for Qwen models
)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [15]:
print(next(model.parameters()).device)  # print the device of the model's first parameter

cuda:0


In [None]:
# Build the GRPO trainer, run training, then save the resulting model.
trainer = GRPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    # Per the original note: the return values of these functions are added up
    # to give each completion its rating.
    reward_funcs=[
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    # peft_config=peft_config
)
trainer.train()

trainer.save_model(output_dir)

--------------------
Response[0]:
To determine the minimum grade Ahmed needs to beat Emily, we'll first calculate the total points each of them has accumulated based on the assignments and then add the final assignment's grade to Emily's total.

### Calculation for Ahmed:

1. Ahmed has 9 assignments with a 91 in each:
\[ 9 \times 91 = 819 \]

2. Ahmed scored a 90 on the final assignment with a weight of \( \frac{1}{9} \):
\[ 90 \times \frac{1}{9} = 10 \]

3. Adding this to Ahmed's starting total:
\[ 819 + 10 = 829 \]

### Calculation for Emily:

1. Emily has 9 assignments with a 92 in each:
\[ 9 \times 92 = 828 \]

2. Emily scored a 90 on the final assignment:
\[ 90 \times
Extracted[0]:

Answer[0]:
100
--------------------
Response[0]:
Let's denote the full capacity of the tank as \( C \) gallons.

Given that the gauge shows the tank is 1/3 full, this means the amount of water already in the tank is \( \frac{1}{3}C \) gallons.

When 16 gallons of water are added to fill the tank, the t

Step,Training Loss
1,0.0
2,-0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0
10,0.0


--------------------
Response[0]:
Adult women have 20 bones, so there are 20 * 0.5 = 10 adult men. This means there are 10 * 2 = 20 children in the graveyard. Adult men have 20 + 5 = 25 bones each. Children have 20 / 2 = 10 bones each. Adult women total 20 * 20 = 400 bones. Men total 20 * 25 = 500 bones. Children total 10 * 20 = 200 bones. The total is 400 + 500 + 200 = 1100 bones.

Therefore, the answer is 1100.
Extracted[0]:

Answer[0]:
375
--------------------
Response[0]:
Reasoning:

First, we need to calculate the total cost of the books and the magazines. 

For the books about cats and the solar system, we have:
- 7 books about cats priced at $7 each.
- 2 books about the solar system also priced at $7 each.

So, the total cost for books is:
\( (7 \text{ books} \times $7 \text{ per book}) + (2 \text{ books} \times $7 \text{ per book}) = $49 + $14 = $63 \).

For the magazines, we have 3 magazines priced at $4 each. So, the total cost for magazines is:
\( 3 \text{ magazines} \times 