# 调包

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from peft import get_peft_model, LoraConfig, TaskType
import swanlab
from swanlab.integration.transformers import SwanLabCallback
import torch

  from .autonotebook import tqdm as notebook_tqdm


# 训练前的准备

## 变量定义

In [3]:
# Local directory that holds the Step-3.5-Flash checkpoint.
model_path = '/autodl-fs/data/stepfun-ai/Step-3___5-Flash'

# Build the fast tokenizer; trust_remote_code lets custom code shipped with the
# checkpoint be executed during loading.
# NOTE(review): loading prints a warning that recommends `fix_mistral_regex=True`.
# Whether that flag is appropriate for this (non-Mistral) checkpoint is unverified;
# confirm before enabling it, and use the same setting for training and inference.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=True,
    trust_remote_code=True,
)
tokenizer  # last expression: show the tokenizer repr as the cell output

The tokenizer you are loading from '/autodl-fs/data/stepfun-ai/Step-3___5-Flash' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


LlamaTokenizerFast(name_or_path='/autodl-fs/data/stepfun-ai/Step-3___5-Flash', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<｜begin▁of▁sentence｜>', 'eos_token': '<|im_end|>', 'pad_token': '<｜end▁of▁sentence｜>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<｜begin▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<｜end▁of▁sentence｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<｜▁pad▁｜>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128000: AddedToken("<im_start>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<im_patch>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<im_end>", rstrip=False, lstrip=False, single

In [4]:

# Read the model configuration from the checkpoint directory.
config = AutoConfig.from_pretrained(
    model_path,
    trust_remote_code=True,
)
config  # last expression: show the configuration as the cell output


Step3p5Config {
  "architectures": [
    "Step3p5ForCausalLM"
  ],
  "att_impl_type": "GQA",
  "attention_dropout": 0.0,
  "attention_other_setting": {
    "attention_type": "sliding_attention",
    "head_dim": 128,
    "num_attention_groups": 8,
    "num_attention_heads": 96,
    "true_head_dim": 128
  },
  "auto_map": {
    "AutoConfig": "configuration_step3p5.Step3p5Config",
    "AutoModelForCausalLM": "modeling_step3p5.Step3p5ForCausalLM"
  },
  "bos_token_id": 0,
  "dtype": "bfloat16",
  "eos_token_id": [
    1,
    2,
    128007
  ],
  "head_dim": 128,
  "hidden_size": 4096,
  "intermediate_size": 11264,
  "layer_types": [
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",
    "sliding_attention",
    "full_attention",
    "sliding_attention",
    "sliding_attention",


In [5]:


# Load the base causal-LM in bfloat16 using the configuration read above;
# device_map="auto" delegates weight placement to the library.
base = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
base  # last expression: show the module tree as the cell output

Loading checkpoint shards: 100%|██████████| 44/44 [05:24<00:00,  7.38s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Step3p5ForCausalLM(
  (model): Step3p5Model(
    (embed_tokens): Embedding(128896, 4096, padding_idx=1)
    (layers): ModuleList(
      (0): Step3p5DecoderLayer(
        (self_attn): Step3p5Attention(
          (rotary_emb): Step3p5RotaryEmbedding()
          (q_proj): Linear(in_features=4096, out_features=8192, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=8192, out_features=4096, bias=False)
          (q_norm): Step3p5RMSNorm()
          (k_norm): Step3p5RMSNorm()
          (g_proj): Linear(in_features=4096, out_features=64, bias=False)
        )
        (mlp): Step3p5MLP(
          (gate_proj): Linear(in_features=4096, out_features=11264, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11264, bias=False)
          (down_proj): Linear(in_features=11264, out_features=4096, bias=False)
          (act_fn): SiL

## LoRA

In [6]:
# LoRA adapter configuration for causal language modelling.
lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Adapters are attached to the attention query/key projections only; the
    # original author notes these two as the required targets.
    target_modules=["q_proj", "k_proj"],
    r=16,               # rank of the low-rank update matrices
    lora_alpha=32,      # alpha parameter for LoRA scaling
    lora_dropout=0.05,  # dropout probability for the LoRA layers
    bias="none",
)
lora_cfg  # last expression: show the resolved configuration

LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, peft_version='0.18.1', base_model_name_or_path=None, revision=None, inference_mode=False, r=16, target_modules={'q_proj', 'k_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, alora_invocation_tokens=None, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None, arrow_config=None, ensure_weight_tying=False)

In [7]:

# Wrap the base model so the LoRA configuration is injected into its target modules.
peft_model = get_peft_model(model=base, peft_config=lora_cfg)
peft_model  # last expression: show the wrapped module tree

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Step3p5ForCausalLM(
      (model): Step3p5Model(
        (embed_tokens): Embedding(128896, 4096, padding_idx=1)
        (layers): ModuleList(
          (0): Step3p5DecoderLayer(
            (self_attn): Step3p5Attention(
              (rotary_emb): Step3p5RotaryEmbedding()
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=8192, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=8192, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vecto

## 训练参数配置

In [8]:
# Directory where Trainer writes checkpoints.
output_dir = "./lora_checkpoints"

# NOTE(review): learning_rate=2e-3 combined with 10 epochs drives the logged
# training loss down to ~0.05 in the recorded run; confirm this level of fitting
# is intended.
training_args = TrainingArguments(
    output_dir=output_dir,
    # --- optimisation ---
    num_train_epochs=10,             # train for 10 epochs
    per_device_train_batch_size=32,  # raise if memory allows, lower on OOM
    learning_rate=2e-3,              # learning rate
    bf16=True,                       # bf16 precision
    # --- checkpointing ---
    save_strategy="epoch",           # write a checkpoint at the end of every epoch
    save_total_limit=1,              # keep only the most recent checkpoint to save disk
    # --- logging ---
    logging_steps=100,               # logging frequency, in steps
)

## 可视化训练过程相关配置

In [9]:
# SwanLab callback used to visualise the training run.
# Change the project and experiment names to suit your own run.
swanlab_callback = SwanLabCallback(
    project="step3.5flash-Lora",
    experiment_name="step3.5flash-LoRA-experiment",
)

## 数据加载和peft_model的配置

In [10]:
import copy
import torch
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq

class DollyDataCollator:
    """Batch tokenized features, dropping the non-tensor ``ref_text`` field.

    Padding is delegated to ``DataCollatorForSeq2Seq`` (labels padded with
    ``-100``, tensors returned as PyTorch tensors).
    """

    def __init__(self, tokenizer, model):
        """Create the underlying padding collator.

        Args:
            tokenizer: Tokenizer forwarded to ``DataCollatorForSeq2Seq``.
            model: Model forwarded to ``DataCollatorForSeq2Seq``.
        """
        self.base_collator = DataCollatorForSeq2Seq(
            tokenizer=tokenizer,
            model=model,
            padding=True,
            label_pad_token_id=-100,
            return_tensors="pt",
        )

    def __call__(self, features):
        """Collate ``features`` into a padded batch.

        Args:
            features: Iterable of mapping-like examples; each may carry a
                ``ref_text`` entry, which is excluded from the batch.

        Returns:
            Whatever the underlying collator returns for the filtered features.
        """
        # Build filtered copies instead of popping from the caller's dicts, so
        # the input examples are left intact (e.g. for later evaluation use of
        # ref_text) while the raw string never reaches the padding collator.
        model_features = [
            {key: value for key, value in feature.items() if key != "ref_text"}
            for feature in features
        ]
        return self.base_collator(model_features)

class DollyProcessor:
    """Turn one Dolly-style record into tokenized causal-LM training features."""

    def __init__(self, tokenizer, max_len):
        """Store the tokenizer and the maximum sequence length.

        Args:
            tokenizer: Callable tokenizer exposing ``eos_token``; it is called
                with ``truncation``, ``max_length``, ``padding`` and
                ``add_special_tokens`` keyword arguments.
            max_len: Maximum number of tokens kept per example.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len

    def process_fn(self, example):
        """Tokenize ``example`` and mask the prompt portion of the labels.

        Args:
            example: Mapping with ``instruction`` and ``response`` entries and
                an optional ``context`` entry (missing or ``None`` is treated
                as an empty string).

        Returns:
            dict with ``input_ids``, ``attention_mask``, ``labels`` (prompt
            positions set to ``-100``) and ``ref_text`` (the raw response).
        """
        # 1. Build the prompt. A null context must render as an empty section;
        #    formatting None directly would inject the literal text "None".
        context = example.get("context") or ""
        prompt_text = (
            f"### Instruction:\n{example['instruction']}\n\n"
            f"### Input:\n{context}\n\n"
            "### Response:\n"
        )
        full_text = f"{prompt_text}{example['response']}{self.tokenizer.eos_token}"

        # 2. Encode prompt + response + EOS with special tokens enabled.
        encodings = self.tokenizer(
            full_text,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            add_special_tokens=True,
        )
        input_ids = list(encodings["input_ids"])
        attention_mask = list(encodings["attention_mask"])
        labels = list(input_ids)  # flat list of ints: a shallow copy is enough

        # 3. Measure the prompt length with the same special-token setting so
        #    it lines up with the start of the response in the full encoding.
        # NOTE(review): this assumes the prompt tokenizes to the same ids when
        # encoded alone as when it is the prefix of the full text - confirm
        # for the tokenizer in use.
        prompt_encodings = self.tokenizer(
            prompt_text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
        )
        prefix_len = min(len(prompt_encodings["input_ids"]), len(labels))

        # 4. Mask the prompt positions. If truncation leaves no response
        #    tokens, every label ends up as -100.
        labels[:prefix_len] = [-100] * prefix_len

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "ref_text": example["response"],
        }

def get_processed_dataset(tokenizer, max_len, data_files='dolly_huanhuan.jsonl', num_proc=None):
    """Load a JSON-lines dataset and tokenize it with ``DollyProcessor``.

    Args:
        tokenizer: Tokenizer handed to ``DollyProcessor``.
        max_len: Maximum sequence length handed to ``DollyProcessor``.
        data_files: Path(s) passed to ``load_dataset("json", ...)``. Defaults
            to the file the notebook originally hard-coded.
        num_proc: Number of worker processes for ``Dataset.map``; ``None``
            keeps the single-process default.

    Returns:
        The ``train`` split with the raw columns replaced by the features
        produced by ``DollyProcessor.process_fn``.
    """
    # 1. Load the raw records.
    raw_dataset = load_dataset("json", data_files=data_files)["train"]

    # 2. Build the per-example processor.
    processor = DollyProcessor(tokenizer, max_len)

    # 3. Tokenize every example; the raw columns are dropped so only the
    #    processor's outputs remain.
    return raw_dataset.map(
        processor.process_fn,
        remove_columns=raw_dataset.column_names,
        num_proc=num_proc,
        desc="Running tokenizer on dataset",
    )

In [11]:
# Tokenize the training data with a 1024-token cap per example.
train_dataset = get_processed_dataset(tokenizer, 1024)

# Padding collator bound to the LoRA-wrapped model.
collator = DollyDataCollator(tokenizer=tokenizer, model=peft_model)

# The KV cache must be switched off, otherwise it conflicts with gradient checkpointing.
peft_model.config.use_cache = False
peft_model.enable_input_require_grads()
peft_model.gradient_checkpointing_enable()

# Report how many parameters are trainable.
peft_model.print_trainable_parameters()

trainable params: 14,696,448 || all params: 196,970,826,816 || trainable%: 0.0075


# 正式开始训练

In [12]:
# Assemble the Trainer from the pieces prepared in the cells above.
trainer = Trainer(
    model=peft_model,              # LoRA-wrapped model
    args=training_args,            # training configuration defined earlier
    train_dataset=train_dataset,   # tokenized training set
    data_collator=collator,        # collator responsible for padding
    callbacks=[swanlab_callback],  # SwanLab visualisation hook
)

trainer.train()

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Step,Training Loss
100,2.8261
200,2.5109
300,2.1712
400,1.7701
500,1.3388
600,0.8903
700,0.4882
800,0.213
900,0.1113
1000,0.0501


TrainOutput(global_step=1170, training_loss=1.060844347313938, metrics={'train_runtime': 10919.3676, 'train_samples_per_second': 3.415, 'train_steps_per_second': 0.107, 'total_flos': 5.294349044001255e+18, 'train_loss': 1.060844347313938, 'epoch': 10.0})

## 测试

In [1]:
import torch
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from transformers.trainer_utils import get_last_checkpoint

mode_path = '/autodl-fs/data/stepfun-ai/Step-3___5-Flash'

# get_last_checkpoint returns None when the folder holds no checkpoint; fail
# early with a clear message instead of handing None to PeftModel.
lora_path = get_last_checkpoint('./lora_checkpoints/')
if lora_path is None:
    raise FileNotFoundError(
        "No checkpoint found under './lora_checkpoints/'; run training first."
    )

device = "cuda"

# 1. Load the tokenizer.
# NOTE(review): loading prints a warning that recommends `fix_mistral_regex=True`;
# confirm it applies to this checkpoint before enabling it, and keep the setting
# identical to the one used for training.
tokenizer = AutoTokenizer.from_pretrained(mode_path, trust_remote_code=True)

# 2. Load the base model. `dtype` replaces the deprecated `torch_dtype` keyword
#    and matches the keyword used when the model was loaded for training.
model = AutoModelForCausalLM.from_pretrained(
    mode_path,
    device_map="auto",
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

# 3. Attach the LoRA weights and switch to inference mode.
model = PeftModel.from_pretrained(model, model_id=lora_path)
model.eval()

# 4. Build a prompt in exactly the format used for training.
prompt = "你和温实初是什么关系？"

# Same Dolly-style template as the training data, with an empty Input section.
full_prompt = f"### Instruction:\n{prompt}\n\n### Input:\n\n\n### Response:\n"

# 5. Tokenize and move the tensors to the model's device.
inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)

# 6. Generate.
gen_kwargs = {
    "max_new_tokens": 512,  # bound the generated length, not the total length
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.4,
    "repetition_penalty": 1.2,
    "eos_token_id": [128007, tokenizer.eos_token_id],
    # Pass the tokenizer's pad token explicitly so generate() does not have to
    # fall back to an EOS id (and warn about it).
    "pad_token_id": tokenizer.pad_token_id,
}

with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    # Decode only the newly generated tokens (skip the prompt part).
    prompt_len = inputs['input_ids'].shape[1]
    response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
    print("回答：")
    print(response)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer you are loading from '/autodl-fs/data/stepfun-ai/Step-3___5-Flash' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 44/44 [03:34<00:00,  4.87s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128007 for open-end generation.


回答：
并无私情，皇上若不信，大可以彻查究竟。
