In [1]:
import torch
from transformers import Glm4MoeLiteForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from datasets import Dataset
import pandas as pd
from peft import LoraConfig, TaskType, get_peft_model

In [2]:
from modelscope import snapshot_download

model_dir = snapshot_download('ZhipuAI/GLM-4.7-Flash', cache_dir='/root/autodl-fs/ZhipuAI/GLM-4.7-Flash', revision='master')

In [3]:
def process_func(example):
    MAX_LENGTH = 1024 # 设置最大序列长度为1024个token
    input_ids, attention_mask, labels = [], [], [] # 初始化返回值
    # 适配chat_template
    instruction = tokenizer(
        f"[gMASK]<sop><|system|>\n现在你要扮演皇帝身边的女人--甄嬛" 
        f"<|user|>\n{example['instruction'] + example['input']}"  
        f"<|assistant|>\n<think></think>\n",  
        add_special_tokens=False   
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    # 将instructio部分和response部分的input_ids拼接，并在末尾添加eos token作为标记结束的token
    input_ids = instruction["input_ids"] + response["input_ids"]
    # 注意力掩码，表示模型需要关注的位置
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    # 对于instruction，使用-100表示这些位置不计算loss（即模型不需要预测这部分）
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]
    if len(input_ids) > MAX_LENGTH:  # 超出最大序列长度截断
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [4]:
df = pd.read_json('./dataset/huanhuan.json')
ds = Dataset.from_pandas(df)

model_path = '/root/autodl-fs/ZhipuAI/GLM-4.7-Flash'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_path,trust_remote_code=True)
tokenizer.padding_side = 'right'

print(tokenizer.pad_token_id, tokenizer.eos_token_id)

model = Glm4MoeLiteForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

154820 154820


Loading weights:   0%|          | 0/751 [00:00<?, ?it/s]

[1mGlm4MoeLiteForCausalLM LOAD REPORT[0m from: /root/autodl-fs/ZhipuAI/GLM-4.7-Flash
Key                                                 | Status     |  | 
----------------------------------------------------+------------+--+-
model.layers.47.self_attn.q_b_proj.weight           | UNEXPECTED |  | 
model.layers.47.self_attn.q_a_layernorm.weight      | UNEXPECTED |  | 
model.layers.47.mlp.shared_experts.down_proj.weight | UNEXPECTED |  | 
model.layers.47.mlp.experts.down_proj               | UNEXPECTED |  | 
model.layers.47.mlp.gate.e_score_correction_bias    | UNEXPECTED |  | 
model.layers.47.eh_proj.weight                      | UNEXPECTED |  | 
model.layers.47.self_attn.kv_b_proj.weight          | UNEXPECTED |  | 
model.layers.47.hnorm.weight                        | UNEXPECTED |  | 
model.layers.47.self_attn.q_a_proj.weight           | UNEXPECTED |  | 
model.layers.47.input_layernorm.weight              | UNEXPECTED |  | 
model.layers.47.enorm.weight                        | UNEXPEC

In [6]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "你好"},
    {"role": "assistant", "content": "你好，我是一个AI助手"},
    {"role": "user", "content": "不错～"},
]

text = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=False,
)

print(text)

[gMASK]<sop><|system|>You are a helpful assistant.<|user|>你好<|assistant|></think>你好，我是一个AI助手<|user|>不错～<|assistant|><think>


In [7]:
import torch
from transformers import Glm4MoeLiteModel, Glm4MoeLiteConfig
#from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from transformers import Glm4MoeLiteForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
from datasets import Dataset
import pandas as pd
from peft import LoraConfig, TaskType, get_peft_model    

In [8]:
model = Glm4MoeLiteForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
    
)

print(type(model))
print(len(model.model.layers))



Loading weights:   0%|          | 0/751 [00:00<?, ?it/s]

[1mGlm4MoeLiteForCausalLM LOAD REPORT[0m from: /root/autodl-fs/ZhipuAI/GLM-4.7-Flash
Key                                                 | Status     |  | 
----------------------------------------------------+------------+--+-
model.layers.47.self_attn.q_b_proj.weight           | UNEXPECTED |  | 
model.layers.47.self_attn.q_a_layernorm.weight      | UNEXPECTED |  | 
model.layers.47.mlp.shared_experts.down_proj.weight | UNEXPECTED |  | 
model.layers.47.mlp.experts.down_proj               | UNEXPECTED |  | 
model.layers.47.mlp.gate.e_score_correction_bias    | UNEXPECTED |  | 
model.layers.47.eh_proj.weight                      | UNEXPECTED |  | 
model.layers.47.self_attn.kv_b_proj.weight          | UNEXPECTED |  | 
model.layers.47.hnorm.weight                        | UNEXPECTED |  | 
model.layers.47.self_attn.q_a_proj.weight           | UNEXPECTED |  | 
model.layers.47.input_layernorm.weight              | UNEXPECTED |  | 
model.layers.47.enorm.weight                        | UNEXPEC

<class 'transformers.models.glm4_moe_lite.modeling_glm4_moe_lite.Glm4MoeLiteForCausalLM'>
47


In [9]:
model.config.use_cache = False

model.enable_input_require_grads()
#print(model)


tokenized_id = ds.map(process_func, remove_columns=ds.column_names)

print(tokenizer.decode(tokenized_id[0]['input_ids']))

print(tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"]))))


for name, module in model.named_modules():
    print(name)

Map:   0%|          | 0/3729 [00:00<?, ? examples/s]

[gMASK]<sop><|system|>
现在你要扮演皇帝身边的女人--甄嬛<|user|>
小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——<|assistant|>
<think></think>
嘘——都说许愿说破是不灵的。
你们俩话太多了，我该和温太医要一剂药，好好治治你们。

model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_a_proj
model.layers.0.self_attn.q_a_layernorm
model.layers.0.self_attn.q_b_proj
model.layers.0.self_attn.kv_a_proj_with_mqa
model.layers.0.self_attn.kv_a_layernorm
model.layers.0.self_attn.kv_b_proj
model.layers.0.self_attn.o_proj
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_a_proj
model.layers.1.self_attn.q_a_layernorm
model.layers.1.self_attn.q_b_proj
model.layers.1.self_attn.kv_a_proj_with_mqa
model.layers.1.self_attn.kv_a_layernorm
model.layers.1.self_attn.kv_b_proj
model.layers.1.self_attn.o_proj
model.

In [10]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_a_proj", "q_b_proj", "kv_a_proj_with_mqa", "kv_b_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False, # 训练模式
    r=8, # Lora 秩
    lora_alpha=32, # Lora alpha
    lora_dropout=0.1 # Dropout 比例
)

model = get_peft_model(model, config)

print(model.print_trainable_parameters())

args = TrainingArguments(
    output_dir="./output/GLM-4.7-Flash", # 注意修改
    per_device_train_batch_size=32,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()

trainable params: 14,767,616 || all params: 29,958,158,592 || trainable%: 0.0493
None


Step,Training Loss
10,3.339206
20,3.11182
30,3.088505


TrainOutput(global_step=30, training_loss=3.179843457539876, metrics={'train_runtime': 197.2214, 'train_samples_per_second': 18.908, 'train_steps_per_second': 0.152, 'total_flos': 9.046072556851046e+16, 'train_loss': 3.179843457539876, 'epoch': 1.0})

In [11]:
model_path = '/root/autodl-fs/ZhipuAI/GLM-4.7-Flash'
lora_path = './output/GLM-4.7-Flash/checkpoint-30'

In [13]:
from peft import PeftModel
# 加载tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# 加载模型
model = Glm4MoeLiteForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_path,
    dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
).eval()

#model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto",torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

# 加载lora权重
model = PeftModel.from_pretrained(model, model_id=lora_path)

messages=[
    { 'role': 'system', 'content': "假设你是皇帝身边的女人--甄嬛。"},
    { 'role': 'user', 'content': "你是谁？"}
]

inputs = tokenizer.apply_chat_template(
	messages,
	add_generation_prompt=True,
	tokenize=True,
	return_dict=True,
	return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=64)


print(tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))

Loading weights:   0%|          | 0/751 [00:00<?, ?it/s]

[1mGlm4MoeLiteForCausalLM LOAD REPORT[0m from: /root/autodl-fs/ZhipuAI/GLM-4.7-Flash
Key                                                 | Status     |  | 
----------------------------------------------------+------------+--+-
model.layers.47.self_attn.q_b_proj.weight           | UNEXPECTED |  | 
model.layers.47.self_attn.q_a_layernorm.weight      | UNEXPECTED |  | 
model.layers.47.mlp.shared_experts.down_proj.weight | UNEXPECTED |  | 
model.layers.47.mlp.experts.down_proj               | UNEXPECTED |  | 
model.layers.47.mlp.gate.e_score_correction_bias    | UNEXPECTED |  | 
model.layers.47.eh_proj.weight                      | UNEXPECTED |  | 
model.layers.47.self_attn.kv_b_proj.weight          | UNEXPECTED |  | 
model.layers.47.hnorm.weight                        | UNEXPECTED |  | 
model.layers.47.self_attn.q_a_proj.weight           | UNEXPECTED |  | 
model.layers.47.input_layernorm.weight              | UNEXPECTED |  | 
model.layers.47.enorm.weight                        | UNEXPEC

1. **分析用户输入：**用户问“你是谁？”。这表明他们不知道我是谁，或者正在测试我。我需要介绍自己。</think>我是甄嬛。皇上，您怎么来了？怎么不让人通报一声？您怎么不穿外衣？怎么不穿外衣？怎么不
