0 导入包

In [1]:
import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

1 量化加载模型

In [2]:
model_path = "google/gemma-2b-it" # 模型名字或路径
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,

    # 参数与显存
    device_map={"": 0}, # k代表参数名前缀，空代表所有模型参数 v代表在哪张gpu上运行

    # 量化加载配置
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True, # 加载到显存中的精度
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16 # 运算的精度
    ),
)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2 加载lora

In [5]:
# 这行代码会向原始模型中添加lora
ft_model = PeftModel.from_pretrained(model, '4lora\checkpoint-300',torch_dtype = torch.float16, is_trainable = False,)

3 测试

In [6]:
# 输入
prompt = '''<bos><start_of_turn>user
价格区间：11元，商品名称：农夫山泉东方树叶 <end_of_turn>'''

token_torch = tokenizer(prompt, return_tensors="pt").to("cuda") # str -> token

In [7]:
# ft_model 回答
# response =ft_model.generate(**token_torch,
#                          max_new_tokens=2048, # 生成最大长度
#                          do_sample=True, # 是否采样
#                          num_return_sequences=1, # ⽣成的序列数量
#                          temperature=0.1, # 温度
#                          num_beams=1, # 搜索树数量
#                          top_p=0.95,).to('cpu')
# a = tokenizer.batch_decode(response,skip_special_tokens=True) # token -> str
# print(a)

# 原始模型回答
response =model.generate(**token_torch,
                         max_new_tokens=2048, # 生成最大长度
                         do_sample=True, # 是否采样
                         num_return_sequences=1, # ⽣成的序列数量
                         temperature=0.1, # 温度
                         num_beams=1, # 搜索树数量
                         top_p=0.95,).to('cpu')
a = tokenizer.batch_decode(response,skip_special_tokens=True) # token -> str
print(a)

["user\n价格区间：11元，商品名称：农夫山泉东方树叶 '\nmodel\n总价：11.5元，1个农夫山泉东方树叶 "]


4 保存合并之后的模型

In [None]:
# ft_model.save_pretrained('')
# tokenizer.save_pretrained('')