In [1]:
import json

import datasets
import pandas as pd
import torch
from peft import AdaLoraConfig, TaskType, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Data loading: read the line-delimited JSON training file into a DataFrame
# and wrap it as a Hugging Face Dataset for tokenization.
df = pd.read_json(
    "/mnt/workspace/DistillationTrainData.jsonl",
    lines=True,
)
ds = datasets.Dataset.from_pandas(df)
# Example outputs, loaded the same way (not referenced by the cells shown here).
examples = pd.read_json(
    "/mnt/workspace/20250208183027_example_output.jsonl",
    lines=True,
)


In [3]:
# Tokenizer: loaded from the local Qwen2.5-7B-Instruct checkpoint directory,
# using the slow (non-fast) implementation.
tokenizer = AutoTokenizer.from_pretrained(
    '/mnt/workspace/Qwen2.5-7B-Instruct',
    trust_remote_code=True,
    use_fast=False,
)


In [4]:
def process_func(example, max_length=1024):
    """Turn one raw record into a tokenized causal-LM training sample.

    The record is rendered as a three-turn chat (system / user / assistant)
    using the `<|im_start|>` / `<|im_end|>` markers, tokenized with the
    module-level `tokenizer`, and labelled so that only the assistant turn
    contributes to the loss.

    Args:
        example: Mapping with the keys ``feature_content`` (the medical record
            text placed in the user turn) and ``bingli_json`` (the target
            answer placed in the assistant turn).
        max_length: Maximum number of tokens kept per sample; anything beyond
            it is cut from the end. ``None`` disables truncation.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels`` lists of
        equal length. Prompt positions in ``labels`` are set to -100.
    """
    system_prompt = "<|im_start|>system\n你是一个专业医生<|im_end|>\n"
    user_input = f"<|im_start|>user\n接下来患者会给你提供病历信息，请你根据提供的信息生成\"诊断\"和\"诊断依据\"，\"诊断\"即患者所患病症的诊断，\"诊断依据\"即从病历信息中提取的做出诊断的依据。以下是信息：\n{example['feature_content']}<|im_end|>\n"

    # If the JSONL stored the answer as a nested object, the loader hands it
    # over as a dict/list; formatting that directly would yield a Python repr
    # (single quotes) instead of JSON. Serialize it explicitly, keeping
    # non-ASCII text readable. Plain strings pass through untouched.
    answer = example['bingli_json']
    if isinstance(answer, (dict, list)):
        answer = json.dumps(answer, ensure_ascii=False)
    assistant_output = f"<|im_start|>assistant\n{answer}<|im_end|>"

    # Encode prompt and answer separately so the prompt length is known for masking.
    instruction = tokenizer(system_prompt + user_input, add_special_tokens=False)
    response = tokenizer(assistant_output, add_special_tokens=False)

    # Concatenate prompt + answer; mask the prompt part out of the loss with -100.
    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]

    # Tail truncation. NOTE(review): for over-long samples this drops the end
    # of the answer (including the closing <|im_end|>), and if the prompt alone
    # reaches max_length every label is -100 — consider filtering such samples.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length]
    }
# Apply process_func to every record; the original columns are removed so the
# resulting dataset holds only what process_func returns.
tokenized_id = ds.map(
    process_func,
    remove_columns=ds.column_names,
)

Map: 100%|██████████| 4000/4000 [00:18<00:00, 217.28 examples/s]


In [5]:
# Load the base model from the local checkpoint in float16, with automatic
# device placement (device_map="auto").
model = AutoModelForCausalLM.from_pretrained(
    '/mnt/workspace/Qwen2.5-7B-Instruct',
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
# NOTE(review): presumably required so gradients can flow through the inputs
# when gradient checkpointing is enabled later on — confirm.
model.enable_input_require_grads()


Loading checkpoint shards: 100%|██████████| 4/4 [01:24<00:00, 21.12s/it]


In [6]:
# AdaLoRA adapter configuration.
#
# AdaLoRA does not use LoRA's `r`: the starting rank of every adapted matrix is
# `init_r` and the average rank the budget scheduler aims for is `target_r`.
# The previous `r=8` was therefore ignored and the library defaults applied —
# the 30,279,984 trainable parameters reported by print_trainable_parameters()
# correspond to rank 12, not 8. The defaults are now spelled out so the
# configuration says what actually runs.
config = AdaLoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    inference_mode=False,
    init_r=12,
    target_r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    # Total optimizer steps of the training run in this notebook:
    # 4000 samples / (batch 2 x accumulation 2) x 5 epochs = 5000.
    # Keep in sync with the TrainingArguments.
    # NOTE(review): recent PEFT releases appear to reject a missing total_step — confirm
    # against the installed version.
    total_step=5000,
    # beta1/beta2 and the remaining AdaLoRA options keep their default values.
)
# NOTE(review): AdaLoRA's rank re-allocation only happens when
# `model.base_model.update_and_allocate(global_step)` is called during training;
# the stock Trainer used below does not appear to call it, so ranks likely stay
# at init_r for the whole run — confirm and add a callback / custom loop if
# adaptive pruning is wanted.

model = get_peft_model(model, config)

In [7]:
# Print the trainable vs. total parameter counts of the adapter-wrapped model.
model.print_trainable_parameters()

trainable params: 30,279,984 || all params: 7,645,896,692 || trainable%: 0.3960


In [8]:
# Training setup: hyperparameters first, then the Trainer that ties together
# the adapter-wrapped model, the tokenized dataset and a padding collator.
args = TrainingArguments(
    output_dir="./output/Qwen",
    num_train_epochs=5,
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    save_steps=500,
)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    train_dataset=tokenized_id,
)

In [9]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
500,1.9323
1000,0.7672
1500,0.6564
2000,0.6116
2500,0.6046
3000,0.6098
3500,0.5835
4000,0.5779
4500,0.5622
5000,0.562


TrainOutput(global_step=5000, training_loss=0.7467437622070312, metrics={'train_runtime': 14315.8901, 'train_samples_per_second': 1.397, 'train_steps_per_second': 0.349, 'total_flos': 4.4638448698444506e+17, 'train_loss': 0.7467437622070312, 'epoch': 5.0})

In [10]:
# Save the fine-tuned model into the local "qwen" directory.
# NOTE(review): `model` is the wrapper returned by get_peft_model, so this
# presumably stores only the adapter weights/config, not the full base model — confirm.
model.save_pretrained("qwen")