# 1. 安装依赖库
使用pip安装Unsloth及相关依赖，确保环境准备好。


In [None]:
# Install Unsloth and the libraries used by the rest of this notebook.
# The leading "!" is notebook shell-escape syntax; drop it when running the
# same command directly in a terminal. xformers is the only pinned package.
!pip install unsloth bitsandbytes accelerate xformers==0.0.29.post3 peft trl sentencepiece protobuf datasets huggingface_hub hf_transfer

# 2. 加载和配置模型
导入FastLanguageModel，设置最大序列长度、数据类型和是否使用4bit量化，加载Qwen2.5-7B预训练模型和分词器。


In [1]:
# Load the Qwen2.5-7B base model and its tokenizer through Unsloth.
# unsloth is imported before torch, as in the original cell.
from unsloth import FastLanguageModel
import torch

# Module-level settings: the trainer setup and the optional adapter reload
# further down reuse these names.
max_seq_length = 2048  # maximum sequence length; adjust to the available GPU memory
load_in_4bit = True  # 4-bit quantization to reduce GPU memory use
dtype = None  # None = auto-detect; float16 or bfloat16 recommended

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-7B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token="hf_...",  # fill in a token when accessing gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.7.11: Fast Qwen2 patching. Transformers: 4.54.1.
   \\   /|    NVIDIA RTX A6000. Num GPUs = 1. Max memory: 44.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.50s/it]


# 3. 添加LoRA适配器
为模型添加LoRA适配器，只微调部分参数以节省显存和加速训练。


In [3]:
# Attach LoRA adapters so that only a small fraction of the parameters is
# fine-tuned, which saves GPU memory.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank; a larger rank means more trainable parameters
    lora_alpha=16,
    lora_dropout=0,  # 0 is the recommended, memory-optimized value
    bias="none",
    target_modules=[
        # attention projections
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # MLP projections
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing="unsloth",  # supports long contexts
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

Unsloth 2025.7.11 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


# 4. 准备Alpaca数据集
加载Alpaca数据集（此处默认使用英文的 yahma/alpaca-cleaned），也可替换为中文数据集或自定义数据集。


In [None]:
# Load the Alpaca dataset (train split); swap in a custom dataset here if needed.
from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split="train")  # English data by default; can be replaced with Chinese or custom data

In [11]:
# Preview the dataset: its summary first, then the first record on the next line.
print(dataset, dataset[0], sep="\n")

Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 51760
})
{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.', 'text': '以下是一条描述任务的指令，配有进一步的输入信息。请根据要求完成回复。\n\n### 指令:\nGive three t

# 5. 格式化数据并添加EOS标记
定义格式化函数，将指令、输入和输出拼接为训练文本，并在末尾添加EOS标记。


In [6]:
# Prompt template with three positional slots (instruction, input, response)
# and the end-of-sequence marker appended to every training example.
alpaca_prompt = """以下是一条描述任务的指令，配有进一步的输入信息。请根据要求完成回复。\n\n### 指令:\n{}\n\n### 输入:\n{}\n\n### 回复:\n{}"""
EOS_TOKEN = tokenizer.eos_token  # the model's EOS token, taken from the tokenizer
def formatting_prompts_func(examples):
    """Build the training text for a batch of Alpaca-style examples.

    Args:
        examples: Batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        A dict with a single "text" key holding one formatted prompt per
        example, each followed by the EOS token.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # Fill the template slots in order and append the EOS marker to each text.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ]
    }
# Apply the formatter in batches; the dict it returns supplies the "text" column.
dataset = dataset.map(formatting_prompts_func, batched=True)

Map: 100%|██████████| 51760/51760 [00:00<00:00, 89056.92 examples/s]


# 6. 训练模型
使用TRL的SFTTrainer进行微调，设置训练参数如批次大小、学习率、训练步数等。


In [7]:
# Configure TRL's SFTTrainer for fine-tuning on the dataset's "text" column.
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    # Packing is disabled here.
    # NOTE(review): packing short sequences together is what can speed up
    # training, so presumably set this to True to get that benefit - confirm
    # against the TRL documentation.
    packing = False,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,  # 2 x 4 = 8 examples per optimizer step on a single GPU
        warmup_steps = 5,
        max_steps = 60,  # number of training steps; adjust as needed
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "model_train_outputs",
        report_to = "none",  # can be pointed at WandB or similar tools instead
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|██████████| 51760/51760 [00:15<00:00, 3264.68 examples/s]


# 7. 显示显存信息
通过torch.cuda获取和打印显卡显存使用情况，便于监控资源消耗。


In [8]:
# Capture the GPU's total memory and the reserved-memory baseline before
# training; the training cell reuses max_memory and start_gpu_memory.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)  # bytes -> GB
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)  # bytes -> GB
print(f"GPU型号: {gpu_stats.name}，最大显存: {max_memory} GB")
print(f"已预留显存: {start_gpu_memory} GB")

GPU型号: NVIDIA RTX A6000，最大显存: 44.988 GB
已预留显存: 7.246 GB


In [9]:
# Run the training loop, then report the runtime and GPU memory figures.
trainer_stats = trainer.train()

# Peak reserved memory after training, in GB and as a share of the device total.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_percentage = round(used_memory / max_memory * 100, 3)

# Growth over the baseline recorded before training started.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"训练耗时: {trainer_stats.metrics['train_runtime']} 秒")
print(f"训练耗时: {round(trainer_stats.metrics['train_runtime']/60, 2)} 分钟")
print(f"训练期间峰值显存: {used_memory} GB，占最大显存 {used_percentage}%")
print(f"LoRA训练显存: {used_memory_for_lora} GB，占最大显存 {lora_percentage}%")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176 of 7,655,986,688 (0.53% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.3702
2,1.7758
3,1.4507
4,1.6968
5,1.5804
6,1.3786
7,0.9655
8,1.1911
9,1.0724
10,0.9864


训练耗时: 180.6005 秒
训练耗时: 3.01 分钟
训练期间峰值显存: 8.635 GB，占最大显存 19.194%
LoRA训练显存: 1.389 GB，占最大显存 3.087%


# 8. 模型推理（生成文本）
启用推理模式，输入指令和上下文，生成模型输出，并展示如何使用TextStreamer流式输出结果。


In [10]:
# Generate text with the fine-tuned model: once in a single call, once streamed.
from transformers import TextStreamer

FastLanguageModel.for_inference(model)  # switch the model to inference mode

# Template slots: instruction, input, and an empty response for the model to fill.
inputs = tokenizer(
    [alpaca_prompt.format("请续写斐波那契数列。", "1, 1, 2, 3, 5, 8", "")],
    return_tensors="pt",
).to("cuda")

# Generate up to 64 new tokens and print the decoded batch.
outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
print(tokenizer.batch_decode(outputs))

# Same prompt again, streamed through TextStreamer, up to 128 new tokens.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

['以下是一条描述任务的指令，配有进一步的输入信息。请根据要求完成回复。\n\n### 指令:\n请续写斐波那契数列。\n\n### 输入:\n1, 1, 2, 3, 5, 8\n\n### 回复:\n13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6']
以下是一条描述任务的指令，配有进一步的输入信息。请根据要求完成回复。

### 指令:
请续写斐波那契数列。

### 输入:
1, 1, 2, 3, 5, 8

### 回复:
13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75077, 121393, 193491, 194676, 


# 9. 保存和加载微调后的模型
保存LoRA适配器和分词器到本地或上传到Hugging Face Hub，并演示如何重新加载用于推理。


In [None]:
# Save the LoRA adapter and the tokenizer to the local "lora_model" directory.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")
# Upload to the Hugging Face Hub (fill in your own repository name and token).
# model.push_to_hub("你的用户名/lora_model", token="你的token")
# tokenizer.push_to_hub("你的用户名/lora_model", token="your token")

# Reload the fine-tuned model from "lora_model" for inference.
# Disabled by default: change False to True to run it.
if False:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model)

# 10. 保存为float16或GGUF格式
展示如何将合并后的模型保存为16位（float16）或GGUF格式，以支持vLLM和llama.cpp等推理框架。


In [None]:
# Every export step below is disabled by default; change False to True on the
# ones you want to run.

# Save as merged 16-bit weights, suitable for vLLM and similar frameworks.
if False:
    model.save_pretrained_merged("model", tokenizer, save_method="merged_16bit")
if False:
    model.push_to_hub_merged("你的用户名/model", tokenizer, save_method="merged_16bit", token="你的token")

# Save in GGUF format, suitable for llama.cpp and similar frameworks.
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf("你的用户名/model", tokenizer, quantization_method="q4_k_m", token="你的token")