In [None]:
import re 

from datasets import load_dataset, load_from_disk
import torch
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import GRPOConfig, GRPOTrainer



In [None]:
# Make bfloat16 the process-wide default floating-point dtype for torch.
# NOTE(review): this is global state that affects every later cell; presumably it is
# what makes the model below load in bf16 — confirm, or pass an explicit dtype there.
torch.set_default_dtype(torch.bfloat16)

In [None]:
# Download the data.
# The two commented-out lines fetch the dataset from the hub and save it under
# 'data/gsm8k_chinese'; the active code below reads that saved copy back.

# raw_dataset = load_dataset(r'swulling/gsm8k_chinese')
# raw_dataset.save_to_disk('data/gsm8k_chinese')

# Load the locally saved dataset (path is relative to the working directory).
raw_dataset = load_from_disk('data/gsm8k_chinese')
# Bare expression so the notebook displays the dataset summary as cell output.
raw_dataset

In [None]:
# System prompt (written in Chinese) sent with every question. It tells the model to
# put its reasoning inside <think>...</think> and its final answer inside
# <answer>...</answer>, each tag on its own line. The reward functions further down
# score completions against this exact layout, so keep the two in sync.
SYS_PROMPT = '''\
你是一个输出思考过程的人工智能助手。按照以下格式输出：
<think>
在这里输出思考过程。
</think>
<answer>
在这里输出最终答案。
</answer>
'''

In [None]:
def dataset_map_to_conversational_format(item:  dict[str, list]):
    '''
    Turn one batch of raw columns into the conversational layout.

    Every question becomes a two-message prompt (system prompt followed by the
    user question) and every reference answer becomes a single assistant message.

    see: https://huggingface.co/docs/trl/v0.15.2/en/grpo_trainer#using-a-custom-reward-function

    Args:
        item: a batch holding parallel lists under 'question_zh-cn' and 'answer_only'.

    Returns:
        A dict with 'prompt' and 'response', each a list of message lists
        (one entry per row of the batch).
    '''
    # Pair the columns once so both outputs are built from the same rows.
    rows = list(zip(item['question_zh-cn'], item['answer_only']))

    prompts = [
        [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user", "content": str(question)},
        ]
        for question, _ in rows
    ]
    # Reference answers are stringified and stripped of surrounding whitespace.
    responses = [
        [{"role": "assistant", "content": str(answer).strip()}]
        for _, answer in rows
    ]

    return {'prompt': prompts, 'response': responses}


In [None]:
# Apply the conversational mapping to the train split in small batches, using two
# worker processes, and drop every original column so only the mapped ones remain.
maped_dataset = raw_dataset['train'].map(
    dataset_map_to_conversational_format,
    batched=True,
    batch_size=4,
    num_proc=2,
    remove_columns=raw_dataset['train'].column_names,
)

In [None]:
# Show the first mapped example as a sanity check of the prompt/response layout.
maped_dataset[0]

### 奖励函数设置

In [None]:
# The four tags (each followed by a newline) that a well-formed completion is
# expected to contain exactly once.
FORMAT_LABELS = ['<think>\n', '</think>\n', '<answer>\n', '</answer>\n']
# Capture the text between the answer / think tags. The leading greedy `.*` means the
# LAST tag pair in the text is the one captured; DOTALL lets the body span lines.
ANSWER_RE = re.compile(r'.*<answer>(.*?)</answer>.*', re.DOTALL)
THINK_RE = re.compile(r'.*<think>(.*?)</think>.*', re.DOTALL)
# A plain signed integer or decimal. The previous pattern (`\d+\.?\d+?`) demanded at
# least two digits, so single-digit answers such as "5" were never seen as numbers.
NUM_RE = re.compile(r'^[+-]?\d+(?:\.\d+)?$')
# The whole completion must follow the prompted layout exactly. DOTALL is required
# here too: without it `.*?` cannot cross a newline, so any multi-line reasoning or
# answer would fail the full-format check.
FORMAT_RE = re.compile(r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>\n$", re.DOTALL)

def get_part_format_reward(text: str):
    '''
    Partial-format reward: 0.25 for each format tag that occurs exactly once.

    Args:
        text: the completion text to score.

    Returns:
        A float between 0.0 and 1.0 in steps of 0.25.
    '''
    tags_seen_once = 0
    for tag in FORMAT_LABELS:
        # A tag that is missing or repeated earns nothing.
        if text.count(tag) == 1:
            tags_seen_once += 1
    return 0.25 * tags_seen_once

def extract_completion_think_or_answer(text: str, pattern: re.Pattern) -> str:
    '''
    Pull the answer or the reasoning out of a completion.

    Args:
        text: the completion text to search.
        pattern: a compiled regex with one capture group around the wanted span.

    Returns:
        The first captured span with surrounding whitespace removed, or an
        empty string when the pattern does not match.
    '''
    captured = pattern.findall(text)
    return str(captured[0].strip()) if captured else ''

# --------------------------------------------

def part_format_reward_func(completions: list[dict], **kwargs):
    '''
    Partial-format reward for each completion.

    Starts from the per-tag score of get_part_format_reward and adds 0.25 when
    the text begins with '<think>' and another 0.25 when, ignoring surrounding
    whitespace, it ends with '</answer>'.

    Args:
        completions: one entry per sample; the text is read from entry[0]["content"].
        **kwargs: ignored.

    Returns:
        A list of float rewards, one per completion.
    '''
    texts: list[str] = [sample[0]["content"] for sample in completions]

    scores = []
    for text in texts:
        bonus = 0.0
        if text.startswith('<think>'):
            bonus += 0.25
        if text.strip().endswith('</answer>'):
            bonus += 0.25
        scores.append(get_part_format_reward(text=text) + bonus)
    return scores

def format_reward_func(completions: list[dict], **kwargs):
    '''
    Full-format reward: 1.5 when the whole completion matches FORMAT_RE, else 0.0.

    Args:
        completions: one entry per sample; the text is read from entry[0]["content"].
        **kwargs: ignored.

    Returns:
        A list of float rewards, one per completion.
    '''
    texts = [sample[0]["content"] for sample in completions]

    scores = []
    for text in texts:
        scores.append(0.0 if FORMAT_RE.match(text) is None else 1.5)
    return scores

def answer_digit_reward_func(prompts: list[dict], completions: list[dict], response: list[dict], **kwargs):
    '''
    Graded reward for the extracted answer.

    Tiers, checked in this order:
        1.5  the extracted answer equals the reference answer
        1.0  the reference answer appears somewhere inside the extracted answer
        0.5  the extracted answer is at least a plain number (NUM_RE)
        0.0  none of the above

    Args:
        prompts: unused here.
        completions: model outputs; the text is read from entry[0]["content"].
        response: reference answers; the text is read from entry[0]["content"].
        **kwargs: ignored.

    Returns:
        A list of float rewards, one per (reference, completion) pair.
    '''
    def grade(expected: str, got: str) -> float:
        if got == expected:
            return 1.5
        if expected in got:
            # Partial match: the reference is contained in a longer answer.
            return 1.0
        if NUM_RE.match(got):
            # Wrong, but at least it is a number.
            return 0.5
        return 0.0

    texts = [sample[0]["content"] for sample in completions]
    expected_answers = [reference[0]["content"] for reference in response]
    extracted_answers = [extract_completion_think_or_answer(text, ANSWER_RE) for text in texts]

    return [grade(expected, got) for expected, got in zip(expected_answers, extracted_answers)]

def answer_correct_reward_func(completions: list[dict], response: list[dict], **kwargs):
    '''
    Correctness reward: 2.5 when the extracted answer equals the reference, else 0.0.

    Args:
        completions: model outputs; the text is read from entry[0]["content"].
        response: reference answers from the dataset; read from entry[0]["content"].
        **kwargs: ignored.

    Returns:
        A list of float rewards, one per (reference, completion) pair.
    '''
    texts = [sample[0]["content"] for sample in completions]
    expected_answers = [reference[0]["content"] for reference in response]
    extracted_answers = [extract_completion_think_or_answer(text, ANSWER_RE) for text in texts]

    scores = []
    for expected, got in zip(expected_answers, extracted_answers):
        scores.append(2.5 if expected == got else 0.0)
    return scores
    
def think_length_reward_func(completions: list[dict], **kwargs):
    '''
    Reward based on the character length of the extracted reasoning.

    Length n of the text inside the think tags maps to:
        n <= 20 or n >= 500  -> 0.2
        400 <= n < 500       -> 0.5
        300 <= n < 400       -> 0.8
        20 < n < 300         -> 1.5  (short reasoning is preferred)

    Args:
        completions: one entry per sample; the text is read from entry[0]["content"].
        **kwargs: ignored.

    Returns:
        A list of float rewards, one per completion.
    '''
    def score_for_length(n_chars: int) -> float:
        if n_chars <= 20 or n_chars >= 500:
            return 0.2
        if n_chars >= 400:
            return 0.5
        if n_chars >= 300:
            return 0.8
        # Favor short reasoning: under 300 characters.
        return 1.5

    texts = [sample[0]["content"] for sample in completions]
    reasonings = [extract_completion_think_or_answer(text, THINK_RE) for text in texts]

    return [score_for_length(len(reasoning)) for reasoning in reasonings]

In [None]:
# GRPO training configuration.
training_args = GRPOConfig(
    output_dir="output_models/Qwen2-0.5B-GRPO",
    save_only_model=True,
    save_steps=100,
    report_to='tensorboard',
    # Warmup is given as a step count only. warmup_ratio is deliberately not passed:
    # when both are set, warmup_steps overrides the ratio, so the ratio would be dead
    # configuration that misleads the reader.
    warmup_steps=10,
    bf16=True,
    learning_rate=5e-5,
    logging_steps=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    # Number of completions sampled per prompt.
    num_generations=8,
    max_prompt_length=384,
    max_completion_length=384,
    use_vllm=False,
    torch_empty_cache_steps=1,
    lr_scheduler_type='constant_with_warmup',
)

In [None]:
# When True, the loaded model is wrapped with get_peft_model using the LoRA config below.
use_lora = False
# NOTE(review): machine-specific absolute path — change it to wherever the model
# lives in your environment before running this cell.
model_path = r'/mnt/sdc/models_home/Qwen2___5-0___5B-Instruct'

# Load the causal LM and its tokenizer from the same local directory.
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

if use_lora:
    # LoRA settings: rank 16, alpha 32, applied to the listed projection modules.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",]
    )
    model = get_peft_model(model, peft_config=lora_config)

In [None]:
# Build the GRPO trainer from the model, tokenizer, mapped dataset and config above.
# Five reward functions are registered: two score the output format, two score the
# extracted answer, and one scores the length of the reasoning.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[
        part_format_reward_func,
        format_reward_func,
        answer_digit_reward_func,
        answer_correct_reward_func,
        think_length_reward_func,
    ],
    args=training_args,
    train_dataset=maped_dataset,
)

In [None]:
# Start training; outputs go to the output_dir configured in training_args.
trainer.train()