In [1]:
"""
使用unsloth框架微调Qwen2.5-7B-Instruct模型来解决数独问题
这个脚本包含了数据准备、模型训练和评估的完整过程
"""

import os
import json
import random
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
from transformers import TrainingArguments
import wandb

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-13 18:13:31 [__init__.py:256] Automatically detected platform cuda.


In [2]:
# Seed Python's RNG so shuffling / sampling later in the notebook is repeatable.
random.seed(42)

# Point both proxy variables at the same local endpoint.
# NOTE(review): this is a machine-specific address — confirm before running elsewhere.
os.environ.update({
    "http_proxy": "http://127.0.0.1:7897",
    "https_proxy": "http://127.0.0.1:7897",
})
# os.environ["WANDB_PROJECT"] = "sudoku_solving_qwen"

In [3]:
# Model hyper-parameters shared by the loading / training cells below
max_seq_length = 3000  # can be increased to fit longer reasoning traces
lora_rank = 16  # larger rank = more capacity, but slower

In [4]:
def load_and_prepare_data(data_path, train_ratio=0.9):
    """Load the sudoku SFT JSON file and split it into train / eval datasets.

    Every record must provide a "question" and an "answer" key. Each pair is
    rendered into one chat-formatted string (system / user / assistant turns
    delimited by <|im_start|> and <|im_end|>) stored under the "text" key.

    Args:
        data_path: Path to a UTF-8 encoded JSON file holding the records.
        train_ratio: Fraction of the shuffled records placed in the train split.

    Returns:
        A ``(train_dataset, eval_dataset)`` pair of ``Dataset`` objects.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        sudoku_dataset = json.load(f)

    print(f"加载了 {len(sudoku_dataset)} 条数独数据")

    # System prompt asking for the <think>/<answer> layout. The indentation is
    # part of the string, so it must stay exactly as written.
    SYSTEM_PROMPT = """
    用以下格式回答问题:
    <think>推理过程</think>
    <answer>答案</answer>
    """

    def as_chat_record(question, answer):
        # Qwen2.5 chat layout: prompt up to the assistant turn, then the answer.
        prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
        return {"text": prompt + answer + "<|im_end|>"}

    training_data = [
        as_chat_record(example["question"], example["answer"])
        for example in sudoku_dataset
    ]

    # Shuffle with the global `random` state, then cut at the requested ratio.
    random.shuffle(training_data)
    split_at = int(len(training_data) * train_ratio)
    train_dataset = Dataset.from_list(training_data[:split_at])
    eval_dataset = Dataset.from_list(training_data[split_at:])

    print(f"训练集大小: {len(train_dataset)}")
    print(f"验证集大小: {len(eval_dataset)}")

    return train_dataset, eval_dataset

In [5]:
def load_model(model_name="Qwen/Qwen2.5-3B-Instruct"):
    """Load a base model in 4-bit and attach LoRA adapters to it.

    Args:
        model_name: Hub id or local path handed to
            ``FastLanguageModel.from_pretrained``. Previously this argument
            was ignored and the default checkpoint was always loaded.

    Returns:
        ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model
        returned by ``FastLanguageModel.get_peft_model``.

    Reads the module-level ``max_seq_length`` and ``lora_rank`` settings.
    """
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,  # honour the caller's choice
        max_seq_length = max_seq_length,
        load_in_4bit = True, # False for LoRA 16bit
        fast_inference = True, # Enable vLLM fast inference
        max_lora_rank = lora_rank,
        gpu_memory_utilization = 0.6, # Reduce if out of memory
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ], # Remove QKVO if out of memory
        lora_alpha = lora_rank,
        use_gradient_checkpointing = "unsloth", # Enable long context finetuning
        random_state = 3407,
    )

    return model, tokenizer

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

def setup_trainer(model, tokenizer, train_dataset):
    """Create the ``SFTTrainer`` for the supervised fine-tuning stage.

    Args:
        model: Model to train (passed straight to ``SFTTrainer``).
        tokenizer: Tokenizer matching ``model``.
        train_dataset: Dataset whose "text" column holds the training strings.

    Returns:
        The configured trainer; training is not started here.
    """
    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    bf16_available = is_bfloat16_supported()

    training_args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not bf16_available,
        bf16 = bf16_available,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    )

    return SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = train_dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        args = training_args,
    )

In [5]:
def test_model(model, tokenizer, test_question, max_new_tokens=3000):
    """Generate the model's reply to one question using the chat prompt layout.

    Args:
        model: Model used for generation; switched to inference mode via
            ``FastLanguageModel.for_inference``.
        tokenizer: Tokenizer matching ``model``.
        test_question: User question inserted into the prompt.
        max_new_tokens: Upper bound on generated tokens. Defaults to 3000,
            the value that used to be hard-coded.

    Returns:
        The decoded text between the last "<|im_start|>assistant\\n" marker
        and the following "<|im_end|>" (or the end of the output).
    """
    # System prompt asking for the <think>/<answer> layout. The indentation is
    # part of the string, so it must stay exactly as written.
    SYSTEM_PROMPT = """
    用以下格式回答问题:
    <think>推理过程</think>
    <answer>答案</answer>
    """

    prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n<|im_start|>user\n{test_question}<|im_end|>\n<|im_start|>assistant\n"

    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = max_new_tokens, use_cache = True)
    generated_text = tokenizer.batch_decode(outputs)

    # The decoded sequence still contains the prompt, so keep only what
    # follows the final assistant marker and stop at the end-of-turn token.
    assistant_response = generated_text[0].split("<|im_start|>assistant\n")[-1].split("<|im_end|>")[0]
    return assistant_response

In [8]:
def evaluate_model(model, tokenizer, dataset, num_samples=5):
    """Check how often the model emits <think> and <answer> tag pairs.

    Draws up to ``num_samples`` random examples from ``dataset`` (using the
    global ``random`` state), recovers the user question from each example's
    "text" field and generates a reply with ``test_model``. Delegating to
    ``test_model`` keeps the prompt and the response extraction identical
    between single-question tests and this evaluation.

    Args:
        model: Model to evaluate.
        tokenizer: Tokenizer matching ``model``.
        dataset: Indexable dataset whose items have a chat-formatted "text" key.
        num_samples: Maximum number of examples to evaluate.

    Returns:
        ``(results, stats)``. ``results`` is a list of dicts with the keys
        "has_think_tag", "has_answer_tag" and "answer"; ``stats`` holds
        "total_samples", "samples_with_think_tag", "samples_with_answer_tag"
        and "samples_with_both_tags".
    """
    # Pick the examples to score.
    indices = random.sample(range(len(dataset)), min(num_samples, len(dataset)))

    results = []
    for idx in indices:
        text = dataset[idx]["text"]

        # The question is whatever sits inside the user turn of the stored text.
        question = text.split("<|im_start|>user\n")[1].split("<|im_end|>")[0]
        answer = test_model(model, tokenizer, question)

        # A tag only counts when both its opening and closing form appear.
        has_think_tag = "<think>" in answer and "</think>" in answer
        has_answer_tag = "<answer>" in answer and "</answer>" in answer

        results.append({
            "has_think_tag": has_think_tag,
            "has_answer_tag": has_answer_tag,
            "answer": answer,
        })

    stats = {
        "total_samples": len(results),
        "samples_with_think_tag": sum(1 for r in results if r["has_think_tag"]),
        "samples_with_answer_tag": sum(1 for r in results if r["has_answer_tag"]),
        "samples_with_both_tags": sum(1 for r in results if r["has_think_tag"] and r["has_answer_tag"]),
    }

    return results, stats

In [9]:
def compare_models(original_model, original_tokenizer, finetuned_model, finetuned_tokenizer, test_question):
    """Answer the same question with two models and return both replies.

    The original model is queried first, then the fine-tuned one.

    Returns:
        ``(original_answer, finetuned_answer)`` as produced by ``test_model``.
    """
    contenders = (
        (original_model, original_tokenizer),
        (finetuned_model, finetuned_tokenizer),
    )
    original_answer, finetuned_answer = (
        test_model(candidate, candidate_tokenizer, test_question)
        for candidate, candidate_tokenizer in contenders
    )
    return original_answer, finetuned_answer

In [6]:
# Load the SFT data and split it into train / eval datasets
train_dataset, eval_dataset = load_and_prepare_data("reasoning_sft_dataset/sudoku_reasoning_dataset.json")

加载了 50 条数独数据
训练集大小: 45
验证集大小: 5


In [None]:
# Load the base model (default checkpoint of load_model) with LoRA adapters attached
model, tokenizer = load_model()

In [None]:
# Try the base model on one evaluation example
if len(eval_dataset) > 0:
    # Prefer example #3, but fall back to the last example on smaller eval
    # sets: the previous guard (> 0) did not protect a fixed index of 3.
    sample_idx = min(3, len(eval_dataset) - 1)
    test_example = eval_dataset[sample_idx]["text"]
    # The question is the content of the user turn in the stored chat text.
    test_question = test_example.split("<|im_start|>user\n")[1].split("<|im_end|>")[0]
    original_response = test_model(model, tokenizer, test_question)
    print("问题：")
    print(test_question)
    print("原始模型回答示例:")
    print(original_response)

In [13]:
# # Set up the trainer (currently disabled)
# trainer = setup_trainer(model, tokenizer, train_dataset)

In [14]:
# # Start training (currently disabled)
# trainer.train()

In [8]:
# # Save the model (disabled); only the checkpoint directory is defined here,
# # and it is used by the loading cell that follows.
output_dir = "output/sudoku_solving_qwen3b_sft"
# trainer.save_model(output_dir)
# print(f"模型已保存到: {output_dir}")

In [None]:
# Load the fine-tuned model from `output_dir`
# NOTE(review): load_model() already requested gpu_memory_utilization = 0.6 for
# the base model — confirm both models fit on the GPU at the same time.
print("\n步骤7: 加载微调后的模型...")
finetuned_model, finetuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name=output_dir,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=True,
    gpu_memory_utilization=0.6,
)

In [None]:
# Try the fine-tuned model on one evaluation example
print("\n步骤8: 测试微调后的模型...")
if len(eval_dataset) > 0:
    # Prefer example #3, but fall back to the last example on smaller eval
    # sets: the previous guard (> 0) did not protect a fixed index of 3.
    sample_idx = min(3, len(eval_dataset) - 1)
    test_example = eval_dataset[sample_idx]["text"]
    # The question is the content of the user turn in the stored chat text.
    test_question = test_example.split("<|im_start|>user\n")[1].split("<|im_end|>")[0]
    finetuned_response = test_model(finetuned_model, finetuned_tokenizer, test_question)
    print("问题：")
    print(test_question)
    print("sft模型回答示例:")
    print(finetuned_response)

In [18]:
# Reload the untouched base model so it can be evaluated next to the fine-tuned one
print("加载原始模型进行评估...")
original_model, original_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-3B-Instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    fast_inference = True,
    gpu_memory_utilization = 0.8,
)

# evaluate_model() samples with the global `random` state. Snapshot it so the
# fine-tuned model is scored on exactly the same examples as the base model;
# otherwise the two runs would draw different samples and not be comparable.
eval_rng_state = random.getstate()

original_results, original_stats = evaluate_model(original_model, original_tokenizer, eval_dataset, num_samples=1)
print("原始模型评估结果:")
print("包含<think>标签的样本数:", original_stats["samples_with_think_tag"])
print("包含<answer>标签的样本数:", original_stats["samples_with_answer_tag"])
print("同时包含两个标签的样本数:", original_stats["samples_with_both_tags"])
if original_results:  # empty when the eval set has no examples
    print("\n原始模型回答示例:")
    print(original_results[0]["answer"])

print("\n评估微调后的模型...")
random.setstate(eval_rng_state)  # replay the same sample selection
finetuned_results, finetuned_stats = evaluate_model(finetuned_model, finetuned_tokenizer, eval_dataset, num_samples=1)
print("微调后模型评估结果:")
print("包含<think>标签的样本数:", finetuned_stats["samples_with_think_tag"])
print("包含<answer>标签的样本数:", finetuned_stats["samples_with_answer_tag"])
print("同时包含两个标签的样本数:", finetuned_stats["samples_with_both_tags"])
if finetuned_results:  # empty when the eval set has no examples
    print("\n微调后模型回答示例:")
    print(finetuned_results[0]["answer"])

In [19]:
# Step 10: compare both models on a sudoku that is not part of the dataset
new_sudoku = """以下是一个数独游戏，在9乘9的81宫格中，数字的顺序分别为：
8 6 3 | 2 0 4 | 9 1 5
1 5 9 | 6 3 8 | 7 4 2
4 2 7 | 5 9 1 | 3 8 6
------+-------+------
9 1 6 | 8 2 3 | 5 7 4
7 4 5 | 1 6 9 | 2 3 8
3 8 2 | 4 5 7 | 6 9 1
------+-------+------
6 0 8 | 3 4 2 | 1 5 7
5 7 1 | 9 8 6 | 4 2 3
2 3 4 | 7 1 5 | 8 6 9
其中0代表空缺的数字，需要你去填写，请你完成这个数独游戏，并输出相同格式的答案。"""

# Use the freshly reloaded base model as the "original" side: `model` has LoRA
# adapters attached by load_model() and is the object the SFT trainer updates.
original_new_answer, finetuned_new_answer = compare_models(
    original_model, original_tokenizer, finetuned_model, finetuned_tokenizer, new_sudoku
)

print("新的数独问题:")
print(new_sudoku)
print("\n原始模型回答:")
print(original_new_answer[:3000] + "..." if len(original_new_answer) > 3000 else original_new_answer)

# compare_models() already produced the fine-tuned answer for `new_sudoku`.
# The earlier extra test_model(...) call regenerated it for `test_question`,
# so the answer shown here belonged to a different puzzle.
print("\n微调后模型回答:")
print(finetuned_new_answer[:3000] + "..." if len(finetuned_new_answer) > 3000 else finetuned_new_answer)

In [7]:
from datasets import Dataset
import pandas as pd
import numpy as np
import re

def format_sudoku(puzzle_str):
    """Render an 81-character sudoku string as a 9x9 text grid.

    Cells are separated by spaces, 3-cell groups by " | ", and a
    "------+-------+------" rule is inserted after the 3rd and 6th rows.

    Args:
        puzzle_str: Sequence of 81 single-character cells, row by row.

    Returns:
        The 11-line grid as one string, or ``None`` (after printing a
        warning) when the input does not contain exactly 81 cells.
    """
    if len(puzzle_str) != 81:
        print(f"警告: 数独字符串长度不是81，而是{len(puzzle_str)}，无法格式化。")
        return None

    cells = list(puzzle_str)
    rule = "------+-------+------"

    rendered = []
    for row_no in range(9):
        row_cells = cells[9 * row_no: 9 * row_no + 9]
        # Three space-separated triples joined by the box separator.
        triples = (" ".join(row_cells[start:start + 3]) for start in (0, 3, 6))
        rendered.append(" | ".join(triples))
        if row_no in (2, 5):
            rendered.append(rule)

    return "\n".join(rendered)

def get_sudoku_dataset(split="train", csv_path="dataset/sudoku_cluewise.csv",
                       min_clues=78, max_samples=500, train_ratio=0.9,
                       seed=42) -> Dataset:
    """Load sudoku puzzles from CSV and convert them to the GRPO prompt format.

    The CSV must provide the columns 'clue_numbers', 'quizzes' and
    'solutions'. Rows are filtered by clue count, optionally down-sampled,
    and converted into ``{'prompt': [system, user], 'answer': grid}`` records.
    Rows whose puzzle or solution cannot be formatted are skipped; previously
    a malformed solution produced a record with ``answer=None``.

    Args:
        split: "train" selects the first ``train_ratio`` share of the records;
            any other value selects the remainder.
        csv_path: Location of the CSV file.
        min_clues: Keep only rows with at least this many clues.
        max_samples: Down-sample to this many rows when more are available;
            ``None`` disables down-sampling.
        train_ratio: Share of records assigned to the train split.
        seed: ``random_state`` used for down-sampling, so repeated calls
            (e.g. for "train" and "eval") see the same rows.

    Returns:
        The requested split as a ``Dataset``.
    """
    df = pd.read_csv(csv_path)

    # Keep only the puzzles with enough clues.
    df = df[df['clue_numbers'] >= min_clues]

    # Cap the dataset size with a reproducible sample.
    if max_samples is not None and len(df) > max_samples:
        df = df.sample(n=max_samples, random_state=seed)

    print(f"筛选后数据集大小: {len(df)}")

    def transform_data(row):
        """Turn one CSV row into a prompt/answer record, or None if malformed."""
        puzzle = row['quizzes']
        solution = row['solutions']

        formatted_puzzle = format_sudoku(puzzle)
        if formatted_puzzle is None:
            return None

        # A record without a usable reference answer is dropped as well.
        formatted_solution = format_sudoku(solution)
        if formatted_solution is None:
            return None

        question = f"以下是一个数独游戏，在9乘9的81宫格中，数字的顺序分别为：\n{formatted_puzzle}\n其中0代表空缺的数字，需要你去填写，请你完成这个数独游戏，并输出相同格式的答案。"

        # NOTE(review): this literal is indented 8 spaces, while the system
        # prompt in the SFT helpers is indented 4, so the two prompts differ
        # in whitespace — confirm whether they are meant to be identical.
        SYSTEM_PROMPT = """
        用以下格式回答问题:
        <think>推理过程</think>
        <answer>答案</answer>
        """

        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': question}
            ],
            'answer': formatted_solution
        }

    transformed_data = [transform_data(row) for _, row in df.iterrows()]
    transformed_data = [x for x in transformed_data if x is not None]

    dataset = Dataset.from_list(transformed_data)

    # Deterministic head / tail split.
    cut = int(train_ratio * len(dataset))
    if split == "train":
        dataset = dataset.select(range(cut))
    else:
        dataset = dataset.select(range(cut, len(dataset)))

    print(f"{split}集大小: {len(dataset)}")
    return dataset

# Pull out the text enclosed by the <answer> tag
def extract_xml_answer(text: str) -> str:
    """Return the stripped content following the last "<answer>" tag.

    The content runs up to the next "</answer>" (or to the end of the text
    when none follows). An empty string is returned when either tag is
    missing from ``text`` altogether.
    """
    has_both_tags = "<answer>" in text and "</answer>" in text
    if not has_both_tags:
        return ""
    after_last_open = text.rpartition("<answer>")[2]
    inner = after_last_open.partition("</answer>")[0]
    return inner.strip()

# Correctness reward
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Give 2.0 to completions whose <answer> content equals the reference.

    Args:
        prompts: Accepted for the reward-function signature; not used. The
            old code read ``prompts[0][-1]['content']`` into an unused
            variable, which could raise on empty or non-chat prompts.
        completions: One list per sample; the first message's 'content' is
            the generated text.
        answer: Reference answers, aligned with ``completions``.

    Returns:
        One float per completion: 2.0 for a non-empty exact match, else 0.0.
    """
    responses = [completion[0]['content'] for completion in completions]

    # A failed extraction yields "", which never earns the reward.
    extracted_responses = [extract_xml_answer(r) for r in responses]

    return [2.0 if r and r == a else 0.0 for r, a in zip(extracted_responses, answer)]

# Soft format reward
def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Score how closely each completion follows the <think>/<answer> layout.

    A completion that matches ``<think>…</think>`` followed by
    ``<answer>…</answer>`` from its very first character earns 1.0.
    Otherwise each of the four tags found anywhere in the text adds 0.15.
    Debug details are printed for every completion.
    """
    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    tags = ("<think>", "</think>", "<answer>", "</answer>")

    rewards = []

    print("\n=== Soft Format Reward Debug ===")
    for i, response in enumerate(responses):
        found = [tag in response for tag in tags]
        contains_think_open, contains_think_close, contains_answer_open, contains_answer_close = found

        match = re.match(pattern, response, re.DOTALL)

        if match:
            # Full layout present: maximum score.
            reward = 1.0
        else:
            # Partial credit, accumulated tag by tag in a fixed order.
            reward = sum(0.15 if present else 0 for present in found)

        rewards.append(reward)

        # Debug output
        print(f"\nResponse {i}:")
        print(f"Contains <think>: {contains_think_open}")
        print(f"Contains </think>: {contains_think_close}")
        print(f"Contains <answer>: {contains_answer_open}")
        print(f"Contains </answer>: {contains_answer_close}")
        print(f"Pattern match: {match}")
        print(f"Final Reward: {reward}")

    return rewards

# Grid-layout reward
def _find_grid_format_error(lines):
    """Return a message for the first malformed line, or None if all are fine.

    Lines at index 3 and 7 must equal the separator rule; every other line
    must split on '|' into three parts of three whitespace-separated tokens.
    """
    for line_idx, line in enumerate(lines):
        if line_idx in (3, 7):  # separator rows
            if line != "------+-------+------":
                return f"Wrong separator line at index {line_idx}: {line}"
            continue

        parts = line.split('|')
        if len(parts) != 3:
            return f"Wrong number of parts at line {line_idx}: {line}"
        for part_idx, part in enumerate(parts):
            if len(part.strip().split()) != 3:
                return f"Wrong number of numbers in part {part_idx} of line {line_idx}: {part}"
    return None


def format_correctness_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions whose <answer> block has the expected grid layout.

    The extracted answer must consist of 11 lines: 9 digit rows plus the two
    separator rules at index 3 and 7. Scanning now stops at the first
    malformed line; before, the innermost ``break`` left only the part loop
    and the line loop reused the outer index variable.

    Returns:
        One float per completion: 1.0 for a well-formed grid, else 0.0.
    """
    responses = [completion[0]["content"] for completion in completions]
    rewards = []

    print("\n=== Format Correctness Reward Debug ===")
    for resp_idx, response in enumerate(responses):
        print(f"\nResponse {resp_idx}:")
        answer = extract_xml_answer(response)

        # Nothing extractable: no reward.
        if not answer:
            print("Failed to extract answer from XML tags")
            rewards.append(0.0)
            continue

        print("Extracted answer:")
        print(answer)

        lines = answer.strip().split('\n')
        print(f"Number of lines: {len(lines)}")

        if len(lines) != 11:  # 9 digit rows + 2 separator rules
            print(f"Wrong number of lines: expected 11, got {len(lines)}")
            rewards.append(0.0)
            continue

        error = _find_grid_format_error(lines)
        if error is not None:
            print(error)
        format_correct = error is None

        print(f"Format correct: {format_correct}")
        rewards.append(1.0 if format_correct else 0.0)

    return rewards

def sudoku_validity_reward_func(completions, **kwargs) -> list[float]:
    """Score how many rows, columns and 3x3 boxes of the answer are valid.

    The <answer> content is parsed into a 9x9 integer grid (separator rules
    are skipped). Every row, column and box containing exactly the digits
    1-9 adds 0.1. Missing answers, wrong dimensions and parse errors score
    0.0. Debug details are printed throughout.
    """
    responses = [completion[0]["content"] for completion in completions]
    complete_unit = set(range(1, 10))
    rule = "------+-------+------"
    rewards = []

    print("\n=== Sudoku Validity Reward Debug ===")
    for resp_idx, response in enumerate(responses):
        print(f"\nResponse {resp_idx}:")
        answer = extract_xml_answer(response)

        # Nothing extractable: no reward.
        if not answer:
            print("Failed to extract answer from XML tags")
            rewards.append(0.0)
            continue

        try:
            # Drop the separator rules and the '|' bars, then read the digits.
            grid = [
                [int(token) for token in line.replace('|', '').split()]
                for line in answer.strip().split('\n')
                if line != rule
            ]

            has_9x9_shape = len(grid) == 9 and all(len(cells) == 9 for cells in grid)
            if not has_9x9_shape:
                print("Invalid grid dimensions")
                rewards.append(0.0)
                continue

            # Rows
            row_rewards = 0
            for row_no, cells in enumerate(grid, start=1):
                if set(cells) == complete_unit:
                    row_rewards += 0.1
                    print(f"Row {row_no} is valid")
                else:
                    print(f"Row {row_no} is invalid: {cells}")

            # Columns
            col_rewards = 0
            for col_no in range(1, 10):
                column = [cells[col_no - 1] for cells in grid]
                if set(column) == complete_unit:
                    col_rewards += 0.1
                    print(f"Column {col_no} is valid")
                else:
                    print(f"Column {col_no} is invalid: {column}")

            # 3x3 boxes, visited left-to-right, top-to-bottom
            box_rewards = 0
            for box_row in range(0, 9, 3):
                for box_col in range(0, 9, 3):
                    box = [
                        grid[box_row + d_row][box_col + d_col]
                        for d_row in range(3)
                        for d_col in range(3)
                    ]
                    if set(box) == complete_unit:
                        box_rewards += 0.1
                        print(f"Box at ({box_row}, {box_col}) is valid")
                    else:
                        print(f"Box at ({box_row}, {box_col}) is invalid: {box}")

            # The final score is the sum of the three partial scores.
            total_reward = row_rewards + col_rewards + box_rewards
            print(f"Total reward: {total_reward} (rows: {row_rewards}, columns: {col_rewards}, boxes: {box_rewards})")
            rewards.append(total_reward)

        except Exception as e:
            print(f"Error processing grid: {str(e)}")
            rewards.append(0.0)

    return rewards

# Clue-preservation / empty-cell reward
def clue_preservation_reward_func(prompts, completions, **kwargs) -> list[float]:
    """Reward answers that keep the given clues and fill the empty cells.

    The puzzle is read from the last message of each prompt and the proposed
    grid from the completion's <answer> block. If any non-zero clue differs
    in the answer the score is 0.0. Otherwise the score is the share of
    originally empty (0) cells that now hold a digit from 1 to 9, or 1.0
    when the puzzle had no empty cells. Missing answers and processing
    errors score 0.0.
    """
    def parse_grid(text):
        # Digit rows are the lines with '|' that are not separator rules.
        return [
            [int(token) for token in line.replace('|', '').split()]
            for line in text.split('\n')
            if '|' in line and not line.startswith('------')
        ]

    responses = [completion[0]["content"] for completion in completions]
    questions = [prompt[-1]["content"] for prompt in prompts]
    rewards = []

    print("\n=== Clue Preservation and Empty Cell Reward Debug ===")
    for resp_idx, (response, question) in enumerate(zip(responses, questions)):
        print(f"\nResponse {resp_idx}:")

        answer = extract_xml_answer(response)
        if not answer:
            print("Failed to extract answer from XML tags")
            rewards.append(0.0)
            continue

        try:
            original_grid = parse_grid(question)
            filled_grid = parse_grid(answer)

            clue_preserved = True
            empty_cells = 0
            correct_fills = 0

            # Walk all 81 cells; every changed clue is reported, not just the first.
            for r in range(9):
                for c in range(9):
                    given = original_grid[r][c]
                    proposed = filled_grid[r][c]
                    if given != 0:  # an original clue
                        if given != proposed:
                            clue_preserved = False
                            print(f"Original clue changed at position ({r}, {c}): {given} -> {proposed}")
                    else:  # an empty cell
                        empty_cells += 1
                        if proposed in range(1, 10):  # filled with a digit 1-9
                            correct_fills += 1

            if not clue_preserved:
                print("Original clues were not preserved")
                rewards.append(0.0)
                continue

            if empty_cells > 0:
                reward = correct_fills / empty_cells
                print(f"Empty cells: {empty_cells}, Correct fills: {correct_fills}, Reward: {reward}")
            else:
                reward = 1.0  # nothing to fill and every clue is intact
                print("No empty cells to fill")

            rewards.append(reward)

        except Exception as e:
            print(f"Error processing grids: {str(e)}")
            rewards.append(0.0)

    return rewards

In [21]:
# Load the GRPO datasets. This rebinds train_dataset / eval_dataset, which
# previously held the SFT "text" datasets.
train_dataset = get_sudoku_dataset(split="train")
eval_dataset = get_sudoku_dataset(split="eval")

# # 初始化wandb
# wandb.init(
#     project="sudoku_solving_qwen",
#     config={
#         "model_name": "Qwen2.5-0.5B-Instruct",
#         "max_seq_length": max_seq_length,
#         "lora_rank": lora_rank,
#         "learning_rate": 5e-6,
#         "batch_size": 1,
#         "gradient_accumulation_steps": 1,
#         "num_generations": 6,
#         "max_steps": 250,
#     }
# )

# from trl import GRPOConfig, GRPOTrainer

# training_args = GRPOConfig(
#     learning_rate = 5e-6,
#     adam_beta1 = 0.9,
#     adam_beta2 = 0.99,
#     weight_decay = 0.1,
#     warmup_ratio = 0.1,
#     lr_scheduler_type = "cosine",
#     optim = "paged_adamw_8bit",
#     logging_steps = 1,
#     per_device_train_batch_size = 1,
#     gradient_accumulation_steps = 1,
#     num_generations = 6,
#     max_prompt_length = max_seq_length // 2,
#     max_completion_length = max_seq_length // 2,
#     max_steps = 500,
#     save_steps = 250,
#     max_grad_norm = 0.1,
#     report_to = "wandb",  # 启用wandb报告
#     output_dir = "output/sudoku_solving_qwen3b_grpo",
# )


In [22]:
# 创建一个自定义的回调类来记录奖励函数的结果
from transformers import TrainerCallback
class RewardCallback(TrainerCallback):
    """Forward the trainer's log entries to Weights & Biases, each one once.

    The previous version re-sent the complete ``state.log_history`` on every
    step, so each entry was logged again and again, and it raised when no
    wandb run had been started. This version remembers how many entries were
    already forwarded and does nothing while ``wandb.run`` is ``None``.
    """

    def __init__(self):
        super().__init__()
        self._num_forwarded = 0  # entries of log_history already sent

    def _forward_new_logs(self, state):
        """Send the not-yet-forwarded dict entries of ``state.log_history``."""
        if wandb.run is None:
            return
        history = state.log_history or []
        # A shorter history means a new training run started: begin again.
        if len(history) < self._num_forwarded:
            self._num_forwarded = 0
        for log in history[self._num_forwarded:]:
            if isinstance(log, dict):
                wandb.log(log)
        self._num_forwarded = len(history)

    def on_step_end(self, args, state, control, **kwargs):
        self._forward_new_logs(state)

    def on_train_end(self, args, state, control, **kwargs):
        # Flush entries appended after the final on_step_end call.
        self._forward_new_logs(state)

In [23]:
# trainer = GRPOTrainer(
#     model = finetuned_model,
#     processing_class = finetuned_tokenizer,
#     reward_funcs = [
#         soft_format_reward_func,
#         correctness_reward_func,
#         format_correctness_reward_func,
#         sudoku_validity_reward_func,
#         clue_preservation_reward_func,
#     ],
#     args = training_args,
#     train_dataset = train_dataset,
#     callbacks=[RewardCallback()],  # 添加自定义回调
# )

In [24]:
# # 开始训练
# trainer.train()

# # 关闭wandb
# wandb.finish()

In [10]:
# # Save the model (disabled); only the GRPO checkpoint directory is defined here.
# # This rebinds `output_dir`, which earlier pointed at the SFT checkpoint.
output_dir = "output/sudoku_solving_qwen3b_grpo"
# trainer.save_model(output_dir)
# print(f"模型已保存到: {output_dir}")

In [11]:
# Load the fine-tuned model from `output_dir` (the GRPO checkpoint directory).
# This rebinds finetuned_model / finetuned_tokenizer.
# NOTE(review): the printed label still reads "步骤7", same as the SFT loading cell.
print("\n步骤7: 加载微调后的模型...")
finetuned_model, finetuned_tokenizer = FastLanguageModel.from_pretrained(
    model_name=output_dir,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=True,
    gpu_memory_utilization=0.6,
)


步骤7: 加载微调后的模型...
==((====))==  Unsloth 2025.3.18: Fast Qwen2 patching. Transformers: 4.50.0. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 2080 Ti. Num GPUs = 1. Max memory: 21.657 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-7b-instruct-unsloth-bnb-4bit with actual GPU utilization = 58.33%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 21.66 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 3000. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 6.5 GB. Also swap space = 4 GB.
INFO 04-13 18:17:03 [config.py:583] This model supports multiple tasks: {'reward', 'classify', 'score', 'embed', 'generate'}. Defaulting to 'generate'.
Unsl

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 04-13 18:17:09 [punica_selector.py:18] Using PunicaWrapperGPU.
INFO 04-13 18:17:09 [model_runner.py:1146] Model loading took 7.0262 GB and 4.281014 seconds
INFO 04-13 18:17:10 [worker.py:267] Memory profiling takes 1.54 seconds
INFO 04-13 18:17:10 [worker.py:267] the current vLLM instance can use total_gpu_memory (21.66GiB) x gpu_memory_utilization (0.58) = 12.63GiB
INFO 04-13 18:17:10 [worker.py:267] model weights take 7.03GiB; non_torch_memory takes 0.02GiB; PyTorch activation peak memory takes 1.06GiB; the rest of the memory reserved for KV Cache is 4.52GiB.
INFO 04-13 18:17:10 [executor_base.py:111] # cuda blocks: 5288, # CPU blocks: 4681
INFO 04-13 18:17:10 [executor_base.py:116] Maximum concurrency for 3000 tokens per request: 28.20x
INFO 04-13 18:17:12 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If o

Capturing CUDA graph shapes: 100%|█████████████████████████████████████████████████████████████████| 27/27 [00:11<00:00,  2.26it/s]

INFO 04-13 18:17:24 [model_runner.py:1570] Graph capturing finished in 12 secs, took 0.55 GiB
INFO 04-13 18:17:24 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 15.16 seconds



Unsloth 2025.3.18 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [14]:
import gc 
gc.collect()

# Try the GRPO model on one evaluation example
print("\n步骤8: 测试微调后的模型...")
if len(eval_dataset) > 0:
    # Prefer example #2, but fall back to the last example on smaller eval
    # sets: the previous guard (> 0) did not protect a fixed index of 2.
    sample_idx = min(2, len(eval_dataset) - 1)
    eval_example = eval_dataset[sample_idx]
    if "text" in eval_example:
        # SFT-style record: the question sits in the user turn of the chat text.
        test_example = eval_example["text"]
        test_question = test_example.split("<|im_start|>user\n")[1].split("<|im_end|>")[0]
    else:
        # GRPO-style record from get_sudoku_dataset(): the question is the
        # content of the last prompt message. Without this branch the cell
        # raised KeyError once eval_dataset had been rebound to that format.
        test_question = eval_example["prompt"][-1]["content"]
    finetuned_response = test_model(finetuned_model, finetuned_tokenizer, test_question)
    print("问题：")
    print(test_question)
    print("grpo模型回答示例:")
    print(finetuned_response)


步骤8: 测试微调后的模型...
问题：
以下是一个数独游戏，在9乘9的81宫格中，数字的顺序分别为：
3 7 4 | 8 5 1 | 6 2 9
1 5 8 | 6 9 2 | 4 3 7
2 9 6 | 4 7 3 | 8 1 5
------+-------+------
8 1 7 | 5 2 6 | 9 4 3
6 2 5 | 9 3 4 | 0 8 1
4 3 9 | 7 1 8 | 5 6 2
------+-------+------
9 4 2 | 3 8 7 | 1 5 6
5 6 1 | 2 4 9 | 3 7 8
7 8 3 | 1 6 5 | 2 9 4
其中0代表空缺的数字，需要你去填写，请你完成这个数独游戏，并输出相同格式的答案。
sft模型回答示例:
<think>嗯，我现在要解决这个数独问题。首先，我需要仔细观察题目给出的数独布局，找出所有0的位置，然后根据数独的规则来推断出正确的数字。数独的规则是每一行、每一列以及每个3x3的小九宫格内数字1-9不能重复。

首先，我先将题目中的数独结构整理清楚。数独的结构是9x9的，分为9个3x3的小宫格。现在，我需要找出所有0的位置，并逐一解决。

观察给出的数独，我注意到第6行的第7列有一个0，也就是坐标（6,7）的位置。其他位置都是已填好的数字。现在，我需要确定这个0应该填什么数字。

根据数独的规则，每一行、列和小宫格都必须包含1-9的数字，不能重复。所以，我需要检查第6行、第7列以及第5-7行的中间小宫格（即第5-7行，第7-9列）来确定0的位置应该填什么数字。

先看第6行，已有的数字是4,3,9,7,1,8,5,6,2，缺少的数字是0，所以这个位置应该填0，但这里可能有误，因为0的位置应该填其他数字。所以，我需要仔细检查。

现在，我需要检查第6行的数字，看看是否有重复。当前第6行的数字是4,3,9,7,1,8,0,5,6。所以，缺少的数字是2。因此，第6行第7列的0应该填2。这样，第6行就完整了。

接下来，我需要检查第7列是否有其他数字，确保没有重复。第7列的数字是6,4,8,9,0,5,1,3,2。这里0的位置在第6行，所以填2是正确的。

然后，我需要检查第5-7行的中间小宫格（第5-7行，第7-9列）是否有重复。当前这个小宫格中的数字是0（即第6行第7列）、8、1；第7