In [1]:
"""
Llama3.1_(8B)-GRPO 模型微调代码
使用Unsloth框架进行Llama 3.1 8B模型的GRPO(Generative Reward-Paired Optimization)微调
"""

from unsloth import FastLanguageModel
import torch

# 设置模型参数
max_seq_length = 1024  # 可以增加以获得更长的推理轨迹
lora_rank = 32  # 更大的rank会使模型更智能，但训练更慢

# 加载预训练模型
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/root/autodl-tmp/models/Qwen/Qwen2___5-7B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True,  # 使用4bit量化加载
    fast_inference = True,  # 启用vLLM快速推理
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6,  # 如果显存不足可以降低
)

# 配置LoRA参数
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,  # LoRA秩，建议使用8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],  # 如果显存不足可以移除QKVO
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",  # 启用长上下文微调
    random_state = 3407,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-30 16:23:46 [__init__.py:256] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.49.0. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.65 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Your GPU cannot handle sequence lengths of 256 due to limited GPU memory.
Unsloth: Your GPU can only handle approximately the maximum sequence length of 256.
Unsloth: vLLM loading /root/autodl-tmp/models/Qwen/Qwen2___5-7B-Instruct with actual GPU utilization = 58.93%
Unsloth: Your GPU has CUDA 

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 03-30 16:25:03 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-30 16:25:04 [model_runner.py:1146] Model loading took 5.3635 GB and 3.657617 seconds
INFO 03-30 16:25:10 [worker.py:267] Memory profiling takes 6.24 seconds
INFO 03-30 16:25:10 [worker.py:267] the current vLLM instance can use total_gpu_memory (23.65GiB) x gpu_memory_utilization (0.59) = 13.94GiB
INFO 03-30 16:25:10 [worker.py:267] model weights take 5.36GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 0.70GiB; the rest of the memory reserved for KV Cache is 7.80GiB.
INFO 03-30 16:25:10 [executor_base.py:111] # cuda blocks: 9124, # CPU blocks: 7021
INFO 03-30 16:25:10 [executor_base.py:116] Maximum concurrency for 256 tokens per request: 570.25x
INFO 03-30 16:25:13 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If o

Capturing CUDA graph shapes: 100%|██████████| 19/19 [00:24<00:00,  1.28s/it]

INFO 03-30 16:25:38 [model_runner.py:1570] Graph capturing finished in 24 secs, took 2.10 GiB
INFO 03-30 16:25:38 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 33.99 seconds



Sliding Window Attention is enabled but not implemented for `eager`; unexpected results may be encountered.
Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [3]:
# 数据准备部分
import re
from datasets import load_dataset, Dataset

# 定义系统提示词
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# 定义XML格式模板
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """从XML格式文本中提取答案部分"""
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    """从带####标记的文本中提取答案"""
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split = "train") -> Dataset:
    """加载GSM8K数据集并进行预处理"""
    #data = load_dataset('openai/gsm8k', 'main')[split]
    data = load_dataset('/root/autodl-tmp/datasets/gsm8k', 'main')[split]
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data

# 加载数据集
dataset = get_gsm8k_questions()

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [4]:
# 定义各种奖励函数
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """正确性奖励函数：检查答案是否正确"""
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    """整数奖励函数：检查答案是否为整数"""
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """严格格式奖励函数：检查是否完全符合XML格式"""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """宽松格式奖励函数：检查是否基本符合XML格式"""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    """计算XML标签的完整性得分"""
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """XML标签计数奖励函数"""
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

# 训练配置
max_prompt_length = 256

from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    learning_rate = 5e-6,  # 学习率
    adam_beta1 = 0.9,      # Adam优化器参数
    adam_beta2 = 0.99,
    weight_decay = 0.1,    # 权重衰减
    warmup_ratio = 0.1,    # 预热比例
    lr_scheduler_type = "cosine",  # 学习率调度器类型
    optim = "paged_adamw_8bit",    # 优化器类型
    logging_steps = 1,             # 日志记录步数
    per_device_train_batch_size = 1,  # 每个设备的训练批次大小
    gradient_accumulation_steps = 1,  # 梯度累积步数
    num_generations = 6,              # 生成数量
    max_prompt_length = max_prompt_length,  # 最大提示词长度
    max_completion_length = max_seq_length - max_prompt_length,  # 最大完成长度
    max_steps = 250,                 # 最大训练步数
    save_steps = 250,                # 保存步数
    max_grad_norm = 0.1,             # 最大梯度范数
    report_to = "none",              # 报告目标
    output_dir = "outputs",          # 输出目录
)

# 初始化训练器
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)

# 开始训练
trainer.train()

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 6


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 250
O^O/ \_/ \    Batch size per device = 6 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (6 x 1 x 1) = 6
 "-____-"     Trainable parameters = 80,740,352/7,000,000,000 (1.15% trained)


-------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
reasoning
To solve this, we need to calculate the cost of the tickets with the given discount. Mr. Benson bought 12 tickets in total, which means 2 tickets (12 - 10) were bought with a discount.

1. First, calculate the cost of the first 10 tickets without any discount.
2. Calculate the discount on the 2 extra tickets.
3. Add the discounted cost to the cost of the first 10 tickets.

Step 1:
Cost of 10 tickets without discount:
10 tickets * $40 per ticket = $400

Step 2:
Discount rate for extra tickets = 5%
Discounted cost for each extra ticket = $40 * 5% = $40 * 0.05 = $2

Step 3:
Cost for 2 extra tickets with 
Extracted:
reasoning
To solve this, we need to calculate the cost of the tickets with the given discount. Mr. Benson bought 12 tickets in total, which means 2 ticket

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / xmlcount_reward_func,rewards / soft_format_reward_func,rewards / strict_format_reward_func,rewards / int_reward_func,rewards / correctness_reward_func
1,-0.0,-0.012333,0.164507,179.0,0.0,-0.012333,0.0,0.0,0.0,0.0
2,-0.0,0.020833,0.051031,154.0,0.0,0.020833,0.0,0.0,0.0,0.0
3,0.0,0.083333,0.06455,171.0,0.000319,0.083333,0.0,0.0,0.0,0.0
4,0.0,-0.011333,0.162236,169.0,0.000442,-0.011333,0.0,0.0,0.0,0.0
5,0.0,0.041667,0.06455,124.0,0.000528,0.041667,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,157.0,0.00035,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,176.0,0.000403,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0625,0.068465,161.166672,0.000716,0.0625,0.0,0.0,0.0,0.0
9,0.0,0.104167,0.051031,178.0,0.000648,0.104167,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,117.0,0.000297,0.0,0.0,0.0,0.0,0.0


-------------------- Question:
Jane is trying to decide whether to buy a house or a trailer. A house costs $480,000 and a trailer costs $120,000. Each loan will be paid in monthly installments over 20 years. How much more is the monthly payment on the house compared to the trailer? 
Answer:
1500 
Response:
<reasoning>
To determine how much more Jane would pay in monthly installments for the house compared to the trailer, we need to perform the following steps:

1. Calculate the total amount paid for each over the 20-year period, assuming the interest rate and other financing terms are the same for both the house and the trailer.
2. Divide the total amount paid for each by the number of months in 20 years to determine the monthly payment for each.
3. Subtract the monthly payment for the trailer from the monthly payment for the house to find the difference.

However, the problem doesn't provide the interest rate or other details necessary to calculate the exact monthly payment. Instead, 

TrainOutput(global_step=250, training_loss=0.0005799733952827424, metrics={'train_runtime': 1864.8425, 'train_samples_per_second': 0.804, 'train_steps_per_second': 0.134, 'total_flos': 0.0, 'train_loss': 0.0005799733952827424})

In [5]:
# 保存训练好的LoRA模型
model.save_lora("grpo_saved_lora")

# 测试模型
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "Calculate pi."},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,    # 采样温度
    top_p = 0.95,        # top-p采样参数
    max_tokens = 1024,   # 最大生成token数
)

# 使用保存的LoRA进行推理
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora"),
)[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.51s/it, est. speed input: 15.14 toks/s, output: 87.26 toks/s]

<reasoning>
Calculating the exact value of pi (π) is not possible using a finite number of steps or operations because π is an irrational number. This means it has an infinite number of non-repeating digits. However, we can approximate π to any desired degree of accuracy using various algorithms and series.

One of the most famous and efficient methods to approximate π is the Chudnovsky algorithm, which is based on the following series:

\[
\frac{1}{\pi} = 12 \sum_{k=0}^{\infty} \frac{(-1)^k (6k)! (13591409 + 545140134k)}{(3k)!(k!)^3 640320^{3k + 3/2}}
\]

This series converges very quickly, and each term approximately doubles the number of correct digits. For a simpler method, we can use the Leibniz formula for π, which is an infinite series:

\[






In [None]:
# 模型保存选项
# 保存为16位浮点数
if False: 
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# 保存为4位整数
if False: 
    model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit")
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# 仅保存LoRA适配器
if False: 
    model.save_pretrained_merged("model", tokenizer, save_method = "lora")
    model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

# GGUF/llama.cpp转换选项
# 保存为8位Q8_0
if False: 
    model.save_pretrained_gguf("model", tokenizer)
    model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# 保存为16位GGUF
if False: 
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# 保存为q4_k_m GGUF
if False: 
    model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# 保存多个GGUF选项
if False:
    model.push_to_hub_gguf(
        "hf/model",
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m"],
        token = "",
    )