In [17]:
# 在 colab 中进行实验时，需要安装特定版本的 trl 和 datasets 库。
!pip install trl==0.14.0
!pip install datasets==2.14.6

Collecting datasets==2.14.6
  Downloading datasets-2.14.6-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.6)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.14.6)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.14.6)
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
  Downloading multiprocess-0.70.15-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.7-py3-none-an

# L3: SFT in practice

在这个实验中，我们将使用一个小的训练集创建一个 SFT Pipeline。


在实验中，我们将从一个基础语言模型开始，  
并准备用于聊天和指令遵循的标注数据。  
接着进行 SFT，得到一个能与用户聊天的微调模型。

In [15]:
# 过滤 warning
import warnings
warnings.filterwarnings('ignore')

## 导入相关的库

In [2]:
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig

## 定义训练中相关的函数

In [3]:
# 模型推理函数，用于生成通用回复。它的参数包括模型本身、分词器、用户消息，以及可选的 system prompt等。
def generate_responses(model, tokenizer, user_message, system_message=None,
                       max_new_tokens=100):
    # 将输入的 prompt 使用 chat message
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # 假设所有的数据都是单轮对话（Q-A）
    messages.append({"role": "user", "content": user_message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

In [4]:
# 测试模型生成效果
def test_model_with_questions(model, tokenizer, questions,
                              system_message=None, title="Model Output"):
    print(f"\n=== {title} ===")
    for i, question in enumerate(questions, 1):
        response = generate_responses(model, tokenizer, question,
                                      system_message)
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")


In [5]:
# 加载模型并定义 tokenizer
def load_model_and_tokenizer(model_name, use_gpu = False):

    # 加载基座模型和 tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    if use_gpu:
        model.to("cuda")

    # 定义默认的 chat tempalte
    # 如你所见，在使用 LLM 时，模型看到的信息是由 chat message list 转换而来的一个 token sequence，从而继续一个一个生成下一个 token。
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    # 将用于填充的 pad token 设置为用于结尾的 eos token
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

In [6]:
# 可视化数据集
def display_dataset(dataset):
    rows = []
    for i in range(3):
        example = dataset[i]
        user_msg = next(m['content'] for m in example['messages']
                        if m['role'] == 'user')
        assistant_msg = next(m['content'] for m in example['messages']
                             if m['role'] == 'assistant')
        rows.append({
            'User Prompt': user_msg,
            'Assistant Response': assistant_msg
        })

    # Display as table
    df = pd.DataFrame(rows)
    pd.set_option('display.max_colwidth', None)  # Avoid truncating long strings
    display(df)

## 加载 Qwen3-0.6B 的 Base 模型并针对简单问题进行测试

In [7]:
USE_GPU = True

questions = [
    "Give me an 1-sentence introduction of LLM.",
    "Calculate 1+1-1",
    "What's the difference between thread and process?"
]

In [8]:
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-0.6B-Base", USE_GPU)

test_model_with_questions(model, tokenizer, questions,
                          title="Base Model (Before SFT) Output")

del model, tokenizer


=== Base Model (Before SFT) Output ===

Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ ⚙ �


Model Input 2:
Calculate 1+1-1
Model Output 2:
⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ �


Model Input 3:
What's the difference between thread and process?
Model Output 3:
⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ ⚇ �



## Qwen3-0.6B 模型的 SFT 结果
在本节中，我们将回顾之前完成的 SFT 训练的结果。由于资源有限，我们不会对像 Qwen3-0.6B 这样规模较大的模型进行完整的训练。不过，在本笔记本的下一节中，您将使用较小的模型和轻量级的数据集来完成完整的训练过程。

In [9]:
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-0.6B", USE_GPU)

test_model_with_questions(model, tokenizer, questions,
                          title="Base Model (After SFT) Output")

del model, tokenizer

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



=== Base Model (After SFT) Output ===

Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
A large language model is a system designed to understand and generate human language.


Model Input 2:
Calculate 1+1-1
Model Output 2:
The expression $1 + 1 - 1$ can be evaluated step by step:

1. Add the first two 1s:  
   $1 + 1 = 2$

2. Subtract the third 1:  
   $2 - 1 = 1$

So, the final result is:  
**1**.


Model Input 3:
What's the difference between thread and process?
Model Output 3:
The difference between **thread** and **process** is important in operating systems and concurrent programming. Here's a clear breakdown:

### 1. **Process**:
- A **process** is a **thread** that is running independently.
- It is a **unit of execution** in a program.
- Processes are managed by the operating system and can be created, stopped, or terminated.
- Processes are isolated from other processes, meaning they share the same memory space and resources.
- Example:



## 在小尺寸的模型上应用 SFT

注意：我们正在使用一个小型的 HuggingFaceTB/SmolLM2-135M 模型以及一个较小的训练数据集进行 SFT（序列到序列生成）操作，以确保整个训练过程能够在有限的计算资源上顺利进行。如果您在自己的机器上运行这些笔记本，并且能够使用 GPU，那么您可以切换到更大的模型（例如 Qwen/Qwen3-0.6B-Base）来执行完整的 SFT 操作并重现上述结果。

In [8]:
model_name = "HuggingFaceTB/SmolLM2-135M"
model, tokenizer = load_model_and_tokenizer(model_name, USE_GPU)

In [10]:
train_dataset = load_dataset("banghua/DL-SFT-Dataset")["train"]
if not USE_GPU:
    train_dataset=train_dataset.select(range(100))

display_dataset(train_dataset)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/2961 [00:00<?, ? examples/s]

Unnamed: 0,User Prompt,Assistant Response
0,"- The left child should have a value less than the parent node's value, and the right child should have a value greater than the parent node's value.","This statement is correct. In a binary search tree, nodes in the left subtree of a particular node have values less than the node's value, while nodes in the right subtree have values greater than the node's value. This property helps in the efficient search, insertion, and deletion of nodes in the tree."
1,"To pass three levels must be the plan.\nThen tackle Two, when that is done.\nOf 100 that start, at the end will be 20.\nFinQuiz is a website that helps you prepare.\nUse it to be stress-free, and not lose your hair.\nThen, take the exam with a smile on your face.\nBe confident that you will gain your place.\nSo make this the goal to which you aspire. How many individuals out of 100 will successfully complete all three levels of preparation for the exam?","Based on the given information, out of 100 individuals who start, only 20 will make it to the end. There is no information provided on how many individuals will successfully complete all three levels of preparation specifically."
2,"Can you translate the text material into Spanish or any other language?: He really is, you know.\nThings a hero should show.\nHe loves me more than a zillion things.\nHe loves me when I sing my jolly folktale rhymes.\nHe's good, not just good, in fact he's great!\nBut because he's my best mate!\nWOW !!! I love it!!!!","¿Puede traducir el texto a español o a cualquier otro idioma?: \nRealmente lo es, ya sabes.\nCosas que un héroe debería demostrar.\nMe quiere más que un millón de cosas.\nMe quiere cuando canto mis alegres rimas de cuentos populares.\nEs bueno, no solo bueno, ¡de hecho es genial!\n¡Pero porque es mi mejor amigo!\n¡WOW! ¡Me encanta!"


In [11]:
# SFTTrainer 设置
sft_config = SFTConfig(
    learning_rate=8e-5,
    num_train_epochs=1,
    per_device_train_batch_size=1, # 每块 GPU 的 batch size。
    gradient_accumulation_steps=8, # 梯度累积次数。
    gradient_checkpointing=False, # 启用梯度检查点机制，以降低训练期间的内存使用量，但会以训练速度变慢为代价。
    logging_steps=2,  # 每两个 step 打印一次 log。
)

In [12]:
sft_trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
sft_trainer.train()

Map:   0%|          | 0/2961 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmoyitech[0m ([33mmoyitech-taiyuan-university-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
2,2.3958
4,2.2826
6,2.1662
8,2.1333
10,1.9429
12,2.3429
14,2.3692
16,2.1977
18,2.1593
20,2.1179


TrainOutput(global_step=371, training_loss=2.043148580587135, metrics={'train_runtime': 548.1459, 'train_samples_per_second': 5.402, 'train_steps_per_second': 0.677, 'total_flos': 306268039564416.0, 'train_loss': 2.043148580587135, 'epoch': 1.0})

## 在小型模型和小型数据集上测试训练结果

**注意:** 以下结果是基于我们用于 SFT 训练的小型模型和数据集得出的，这是由于计算资源有限所致。若要查看在更大模型上进行全量训练的结果，请参阅上方的“SFT 在 Qwen3-0.6B 模型上的结果”部分。启用梯度检查点机制，以降低训练期间的内存使用量，但会以训练速度变慢为代价。

In [13]:
if not USE_GPU:
    sft_trainer.model.to("cpu")
test_model_with_questions(sft_trainer.model, tokenizer, questions,
                          title="Base Model (After SFT) Output")


=== Base Model (After SFT) Output ===

Model Input 1:
Give me an 1-sentence introduction of LLM.
Model Output 1:
1. I am a graduate of the University of Oxford.
2. I am a graduate of the University of Oxford.
3. I am a graduate of the University of Oxford.
4. I am a graduate of the University of Oxford.
5. I am a graduate of the University of Oxford.
6. I am a graduate of the University of Oxford.
7. I am a graduate of the University of Oxford.
8. I am a graduate of the University


Model Input 2:
Calculate 1+1-1
Model Output 2:
1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+1-1 = 1+


Model Input 3:
What's the difference between thread and process?
Model Output 3:
Assistant: Thread is a single-threaded process that runs in a separate thread of execution from the main thread. It can be used to execute a piece of code in a separate thread, such as a thread for a database query or a thread for a graphical user interface. Threads