# train
Hugging Face 提供的训练流程是经过抽象的，主要分为 `TrainingArguments`（训练参数配置）和 `Trainer`（训练器）两部分。

## 1. 准备模型 & 分词器

In [None]:
from transformers import AutoTokenizer,AutoModelForCausalLM
import os
import torch

# Pick the compute device: prefer CUDA, then Apple MPS, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Local checkpoint directory ("~" is expanded to the current user's home).
model_path = os.path.expanduser('~/models/Qwen/Qwen3-0.6B')

# Load the causal-LM weights in FP32 onto the selected device, then the matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=torch.float32,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Print the loaded model object to inspect its structure.
print(model)

In [None]:
# Encode a short sample string, then print the tokenizer object itself.
print(tokenizer('你好')) # the encoding contains only input_ids and attention_mask
print(tokenizer)

## 2. 准备数据集

In [None]:
import os
from datasets import load_dataset
# Load the JSON files found in the local alpaca-gpt4-data-zh directory as the "train" split.
data_dir = os.path.expanduser("~/datasets/alpaca-gpt4-data-zh")
ds = load_dataset("json",data_dir=data_dir,split="train")
# Last expression of the cell: display the dataset summary.
ds

In [None]:
""" 
数据集的每一个item有instruction，input和output三个

"""
# Display one raw record (index 5) to inspect its fields.
ds[5]

In [None]:
def process_func(example, max_length=256):
    """
    Convert one instruction-tuning record into tokenized causal-LM features.

    The prompt is built from the instruction and the optional input, wrapped
    in "Human: " / "Assistant: " tags. The target is the output followed by
    the tokenizer's EOS token. Prompt and target are tokenized separately and
    concatenated, so the prompt length is known exactly for label masking.

    Relies on the module-level ``tokenizer``.

    Args:
        example: dict with string fields 'instruction', 'input' and 'output'.
        max_length: maximum number of tokens to keep (default 256). Longer
            sequences are cut on the right. ``None`` disables truncation.

    Returns:
        dict with three equally long lists:
            'input_ids'      - prompt tokens followed by response tokens
            'attention_mask' - the tokenizer's masks, concatenated
            'labels'         - -100 for every prompt position (ignored by the
                               loss) and the response token ids elsewhere
    """
    # The training prompt is the instruction plus the (possibly empty) input;
    # strip() removes the dangling newline left when the input is empty.
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip()
    instruction = tokenizer(prompt_text + "\n\nAssistant: ")
    # The response is the output, terminated by EOS so the model learns where to stop.
    response = tokenizer(example['output'] + tokenizer.eos_token)

    # The model is trained on prompt + response as one sequence so that, given
    # the prompt, it can generate the response autoregressively.
    input_ids = instruction['input_ids'] + response['input_ids']
    atten_mask = instruction['attention_mask'] + response['attention_mask']
    # Only the response contributes to the loss; prompt positions are masked with -100.
    labels = [-100] * len(instruction['input_ids']) + response['input_ids']

    # Right-truncate; slicing is a no-op for sequences that are already short enough.
    # NOTE(review): truncation can drop the trailing EOS, and a prompt longer than
    # max_length leaves no supervised token in the example.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": atten_mask[:max_length],
        "labels": labels[:max_length]
    }

# Run the preprocessing on the first record and show both the raw features
# and the text obtained by decoding the token ids back.
example = process_func(ds[0])
print('训练用的example:', example)
print('解码后:\n' + tokenizer.decode(example["input_ids"]))


In [None]:
# Transform every record; remove_columns drops the original columns
# (instruction, input, output), which would otherwise be kept by default.
train_ds = ds.map(process_func,remove_columns = ds.column_names)
train_ds

## 准备训练

### 训练参数配置

In [None]:
from transformers import TrainingArguments

# Hyper-parameters and bookkeeping options for the fine-tuning run.
# The evaluation-related options stay commented out: no eval dataset is
# passed to the Trainer in this notebook.
training_args = TrainingArguments(
    output_dir="model/qwen3-0.6B",  # output directory for this run (relative path)
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    # per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=1e-4,
    # eval_strategy="epoch",
    save_strategy="epoch",  # save once per epoch
    # load_best_model_at_end=True,
    # push_to_hub=True,
    logging_dir="./logs",
    logging_steps=1,  # NOTE(review): logs every step - very verbose on the full dataset; consider raising
    logging_strategy='steps',
)

In [None]:
from transformers import Trainer
from transformers import DataCollatorForSeq2Seq
# Wire the model, the training arguments and the tokenized dataset together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    processing_class=tokenizer,
    # Pads each batch dynamically. The Seq2Seq collator is used because the
    # dataset already carries a `labels` column, which it pads together with
    # the inputs (see the DataCollatorForSeq2Seq documentation).
    data_collator=DataCollatorForSeq2Seq(tokenizer,padding=True),
)

# Start fine-tuning.
trainer.train()

# 使用chat template

In [1]:
from transformers import AutoTokenizer,AutoModelForCausalLM
import os
import torch

# Pick the compute device: prefer CUDA, then Apple MPS, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Local checkpoint directory ("~" is expanded to the current user's home).
model_path = os.path.expanduser('~/models/Qwen/Qwen3-0.6B')

# Weights are loaded in FP32 here; pass torch.bfloat16 instead to use BF16.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map=device,
    torch_dtype=torch.float32,
    # attn_implementation="flash_attention_2",  # optional, if Flash Attention is supported
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

import os
from datasets import load_dataset
# Load the JSON files found in the local alpaca-gpt4-data-zh directory as the "train" split.
data_dir = os.path.expanduser("~/datasets/alpaca-gpt4-data-zh")
ds = load_dataset("json",data_dir=data_dir,split="train")


def process_func(example, max_length=256):
    """
    Convert one instruction-tuning record into tokenized features using the
    tokenizer's chat template.

    The instruction (plus the input, when it is not blank) becomes the user
    turn and the output becomes the assistant turn. The conversation is
    rendered twice with ``tokenizer.apply_chat_template``: once in full, and
    once with only the user turn plus the generation prompt. The token count
    of the second rendering gives the prefix that is masked out of the loss.

    Relies on the module-level ``tokenizer``.

    Args:
        example: dict with string fields 'instruction', 'input' and 'output'.
        max_length: maximum number of tokens to keep (default 256). Longer
            sequences are cut on the right. ``None`` disables truncation.

    Returns:
        dict with three equally long lists:
            'input_ids'      - tokens of the fully rendered conversation
            'attention_mask' - the tokenizer's mask for those tokens
            'labels'         - -100 for the user/prompt prefix (ignored by
                               the loss) and the token ids for the remainder
    """
    # Append the optional input to the instruction only when it is non-blank.
    user_content = example['instruction']
    if example['input'].strip():
        user_content += "\n" + example['input']

    user_messages = [{"role": "user", "content": user_content}]
    messages = user_messages + [{"role": "assistant", "content": example['output']}]

    # Full conversation rendered as text (no trailing generation prompt).
    full_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=False
    )
    # User turn only, ending with the assistant start marker.
    user_text = tokenizer.apply_chat_template(
        user_messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # The template already inserted its special markers, so none are added here.
    full_tokens = tokenizer(full_text, add_special_tokens=False)
    user_tokens = tokenizer(user_text, add_special_tokens=False)

    input_ids = full_tokens['input_ids']
    attention_mask = full_tokens['attention_mask']

    # Mask the prompt prefix. This assumes the user-only rendering is a token
    # prefix of the full rendering; the clamp keeps labels and input_ids the
    # same length even if a template ever violates that assumption.
    prompt_len = min(len(user_tokens['input_ids']), len(input_ids))
    labels = [-100] * prompt_len + input_ids[prompt_len:]

    # Right-truncate; slicing is a no-op for sequences that are already short enough.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length]
    }
    

# Transform every record; remove_columns drops the original columns
# (instruction, input, output), which would otherwise be kept by default.
train_ds = ds.map(process_func,remove_columns = ds.column_names)


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 48818/48818 [00:29<00:00, 1671.26 examples/s]


In [None]:
# Decode the first processed sample back to text to check the rendered chat format.
tokenizer.decode(train_ds[0]['input_ids'])

'<|im_start|>user\n保持健康的三个提示。<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n以下是保持健康的三个提示：\n\n1. 保持身体活动。每天做适当的身体运动，如散步、跑步或游泳，能促进心血管健康，增强肌肉力量，并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物，避免高糖、高脂肪和加工食品，以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要，成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力，促进身体恢复，并提高注意力和记忆力。<|im_end|>\n'

: 