# LoRA微调

In [None]:
#! One-time setup: run the snippet below once before executing the later cells; it does
#! not need to be re-run afterwards. It is kept inside a string literal so it is not
#! executed here; the names it uses (id2label / label2id) are defined in a later cell.
"""
checkpoint = "google-bert/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
unfinetune_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(id2label), label2id=label2id, id2label=id2label)  # 这个数据集是7个,主要是针对语言的

checkpoint = "output/save_model"
unfinetune_model.save_pretrained(checkpoint)  # 这里是提前保存  防止因为参数是随机生成的  不好比较
tokenizer.save_pretrained(checkpoint)
"""

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset
from transformers import pipeline
from peft import LoraConfig, get_peft_model, TaskType

In [None]:
# Load the "agentlans/chinese-classification" dataset and echo the resulting
# object as the cell output.
ds = load_dataset("agentlans/chinese-classification")
ds

In [None]:
# The full dataset is large, so the original validation split is reused as the
# training split and the test split serves as the evaluation set.
# Popping "validation" removes that key and hands its data to "train" in one step.
ds["train"] = ds.pop("validation")
ds

In [None]:
# Collect the distinct class names of the training split and build both lookup tables.
# A set has no stable iteration order for string elements across interpreter sessions
# (string hashing is randomized per process), so the names are sorted to keep the
# label <-> id mapping identical between the session that saves the model and any
# session that later reloads it.
labels = sorted(set(ds["train"]["label"]))
id2label = dict(enumerate(labels))
label2id = {label: i for i, label in enumerate(labels)}
id2label, label2id

In [None]:
# Take the first 10 training rows as a small sample for quick checks.
sample_data = ds["train"].select(range(10))
sample_data

In [None]:
# Show the first row of the sample.
sample_data[0]

In [None]:
# #! Save the model ahead of time (one-time setup, kept commented out)
# checkpoint = "google-bert/bert-base-chinese"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# unfinetune_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(id2label), label2id=label2id, id2label=id2label)  # original note: this dataset has 7 of them, mainly about languages

# Load the tokenizer and the classification model from the locally saved copy.
checkpoint = "output/save_model"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
unfinetune_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

def processFunciton(examples):
    """Tokenize a batch of examples and attach integer class ids.

    Args:
        examples: Batch mapping with a "text" column and a "label" column.

    Returns:
        The tokenizer output for the texts (truncation enabled) with an added
        "labels" entry holding ``label2id[name]`` for every value in "label".

    Raises:
        KeyError: If a label value is not a key of ``label2id``.
    """
    encoded = tokenizer(examples["text"], truncation=True)
    encoded["labels"] = [label2id[name] for name in examples["label"]]
    return encoded

# Apply the preprocessing function to the sample in batched mode; the original
# columns are kept here (no remove_columns), so "label" and "labels" coexist.
tokenizerd_sample_data = sample_data.map(processFunciton, batched=True)
tokenizerd_sample_data


In [8]:
# Sanity check: mapping each row's integer id back through id2label must
# reproduce the row's original label name.
for item in tokenizerd_sample_data:
    label, label_id = item["label"], item["labels"]
    assert label == id2label[label_id], f"错误:{label} != {id2label[label_id]} "

## 看一下还没训练的结果(实际训练中 可以忽略)

In [None]:
# Build a text-classification pipeline from the saved, not-yet-fine-tuned model
# so its predictions can be inspected before training.
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
checkpoint = "output/save_model"
unfinetune_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)  # original note: this dataset has 7 of them, mainly about languages
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
pipe = pipeline("text-classification", model=unfinetune_model, tokenizer=tokenizer)
pipe

In [None]:
# Classify one example sentence with the untrained pipeline and print the
# prediction followed by the expected label for comparison.
text = "都唔明點解成日都會有人拎呢樣嚟打飛機"
label = "yue"
result = pipe(text)
print(result)
print(label)

## 开始LoRA训练

In [None]:
# Tokenize every split in batched mode and drop all original columns (taken from
# the train split's column names), leaving only what processFunciton returns.
ds = ds.map(processFunciton, batched=True, remove_columns=ds["train"].column_names)
ds

In [None]:
# Load a fresh copy of the saved base classifier to be wrapped with LoRA.
checkpoint = "output/save_model"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
model

In [None]:
# LoRA configuration for sequence classification; everything else is left at
# the library defaults.
# Original author's note: the intent here was to leave module_to_save empty,
# but in practice that does not work.
lora_config = LoraConfig(task_type=TaskType.SEQ_CLS)
lora_config

In [None]:
# Wrap the base model with the LoRA configuration.
peft_model = get_peft_model(model, lora_config)
peft_model

In [None]:
# Display the LoRA-wrapped model again.
peft_model

In [None]:
# Display the LoRA configuration after it has been passed to get_peft_model.
lora_config

In [16]:
# Training configuration for the LoRA run; outputs go to "output/lora_seq_cls".
args = TrainingArguments(
    "output/lora_seq_cls",
    per_device_train_batch_size=24,
    gradient_accumulation_steps=2,  # 24 * 2 = 48 samples per device per optimizer step
    max_steps=3000,
    save_steps=100,
    # `save_safetensors` is a boolean flag, not a strategy name; a strategy
    # string such as "steps" belongs to `save_strategy` below.
    save_safetensors=True,
    save_total_limit=3,  # only the three most recent checkpoints are kept
    data_seed=42,
    logging_steps=10,
    logging_strategy="steps",
    save_strategy="steps",
)

In [17]:
# Build the Trainer around the LoRA-wrapped model.
trainer = Trainer(
    model=peft_model,
    args=args,
    # The preprocessing step truncates but does not pad, so padding is done per batch here.
    data_collator=DataCollatorWithPadding(tokenizer),
    train_dataset=ds["train"],
    # NOTE(review): `args` sets no evaluation strategy, so presumably this split is
    # only used by explicit trainer.evaluate() calls — confirm.
    eval_dataset=ds["test"],
    processing_class=tokenizer,
)

In [None]:
# Start the training run.
trainer.train()

# 推理部分

In [1]:
import torch
from peft import PeftModelForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
# Location of the saved base model and of the LoRA adapter checkpoint to load.
checkpoint = "output/save_model"
# NOTE(review): the training cell uses save_steps=100 with save_total_limit=3, so
# checkpoint-300 is presumably rotated away if training runs all 3000 steps —
# confirm this directory exists or point at a later checkpoint.
lora_checkpoint = "output/lora_seq_cls/checkpoint-300"

In [3]:
# Load the saved base classifier and its tokenizer for inference.
# The first GPU is used when CUDA is available; otherwise the model stays on CPU
# so this cell also runs on machines without a GPU.
# Original author's note on the model: this dataset has 7 of them, mainly about languages.
unfinetune_model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(
    "cuda:0" if torch.cuda.is_available() else "cpu"
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [None]:
# Classify one sentence with the base model (no adapter loaded yet) and print
# the top softmax score followed by the predicted label name.
text = "都唔明點解成日都會有人拎呢樣嚟打飛機"
label = "yue"
with torch.inference_mode():
    inputs = tokenizer(text, return_tensors="pt")
    # Move every input tensor to the device the model lives on.
    inputs = {name: tensor.to(unfinetune_model.device) for name, tensor in inputs.items()}
    logits = unfinetune_model(**inputs).logits
    result = torch.nn.functional.softmax(logits, -1).max().item()
    print(f"score: {result}")
    result = logits.argmax(-1).cpu().item()
    print(unfinetune_model.config.id2label[result])

In [None]:
# Print the module structure of the base model before the adapter is attached.
print(unfinetune_model)

In [None]:
# Attach the trained LoRA adapter from `lora_checkpoint` to the base model.
# Original author's note: capturing the return value is reportedly not required,
# because the base model can be used with LoRA directly after this call.
# NOTE(review): a later cell calls `unfinetune_model.disable_adapter()`, which
# presumably needs the returned PEFT wrapper — keep the reassignment.
unfinetune_model = PeftModelForSequenceClassification.from_pretrained(unfinetune_model, lora_checkpoint)
unfinetune_model

In [None]:
# Show which device the adapter-wrapped model is on.
unfinetune_model.device

In [None]:
# Same single-sentence check as before, now with the LoRA adapter attached:
# print the top softmax score and the predicted label name.
with torch.inference_mode():
    text = "都唔明點解成日都會有人拎呢樣嚟打飛機"
    label = "yue"
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {key: value.to(unfinetune_model.device) for key, value in inputs.items()}
    logits = unfinetune_model(**inputs).logits
    # Highest class probability of the (single) input.
    result = torch.nn.functional.softmax(logits, -1).max().item()
    print(f"score: {result}")
    # Index of the winning class, converted to a plain Python int.
    result = logits.argmax(-1).cpu().item()
    print(unfinetune_model.config.id2label[result])

In [None]:
# Repeat the check with the adapter temporarily disabled, so the output can be
# compared against the adapter-enabled run above.
with unfinetune_model.disable_adapter(), torch.inference_mode():
    text = "都唔明點解成日都會有人拎呢樣嚟打飛機"
    label = "yue"
    inputs = tokenizer(text, return_tensors="pt")
    inputs = {field: tensor.to(unfinetune_model.device) for field, tensor in inputs.items()}
    logits = unfinetune_model(**inputs).logits
    result = torch.nn.functional.softmax(logits, -1).max().item()
    print(f"score: {result}")
    result = logits.argmax(-1).cpu().item()
    print(unfinetune_model.config.id2label[result])