In [1]:
import os

# Point the standard HTTP/HTTPS proxy environment variables at the local proxy on port 7890.
os.environ.update({
    'http_proxy': 'http://127.0.0.1:7890',
    'https_proxy': 'http://127.0.0.1:7890',
})

## 不完全量化

- load_in_8bit:
    - embed_tokens 继续是 torch.float16
    - 每个layer的内部（self attention）以及 mlp 部分是 int8
    - 每个layer的output（layernorm）部分是 float16（如果 load 时传入了 `torch_dtype=torch.bfloat16`，则这部分和 embed_tokens 都为 torch.bfloat16）
    - 同理适用于 load_in_4bit（此时量化后的权重以 torch.uint8 打包存储，每个字节存放两个 4bit 值，因此 shape 变为 `[元素数 / 2, 1]`）

    ```
    model.embed_tokens.weight torch.float16 cuda:0
    model.layers.0.self_attn.q_proj.weight torch.int8 cuda:0
    model.layers.0.self_attn.k_proj.weight torch.int8 cuda:0
    model.layers.0.self_attn.v_proj.weight torch.int8 cuda:0
    model.layers.0.self_attn.o_proj.weight torch.int8 cuda:0
    model.layers.0.mlp.gate_proj.weight torch.int8 cuda:0
    model.layers.0.mlp.up_proj.weight torch.int8 cuda:0
    model.layers.0.mlp.down_proj.weight torch.int8 cuda:0
    model.layers.0.input_layernorm.weight torch.float16 cuda:0
    model.layers.0.post_attention_layernorm.weight torch.float16 cuda:0
    ```

In [12]:
import torch
from torch import nn
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.optimization import AdamW


In [16]:
import gc  # standard-library garbage collector

# Optionally drop the existing model reference first so its memory can be reclaimed:
# del model

# Run a full Python garbage collection, then release PyTorch's cached, unused CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [3]:
# Quantization settings for bitsandbytes: 8-bit weights.
# To try 4-bit instead, replace `load_in_8bit=True` with `load_in_4bit=True`.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,  # dtype requested for the parameters that are not quantized
    device_map="auto",           # let accelerate decide where to place the weights
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [15]:
# List every parameter as "<name> <dtype> <shape> <device>", one parameter per line.
for name, para in model.named_parameters():
    print(f"{name} {para.dtype} {para.shape} {para.device}")

model.embed_tokens.weight torch.bfloat16 torch.Size([128256, 4096]) cuda:0
model.layers.0.self_attn.q_proj.weight torch.uint8 torch.Size([8388608, 1]) cuda:0
model.layers.0.self_attn.k_proj.weight torch.uint8 torch.Size([2097152, 1]) cuda:0
model.layers.0.self_attn.v_proj.weight torch.uint8 torch.Size([2097152, 1]) cuda:0
model.layers.0.self_attn.o_proj.weight torch.uint8 torch.Size([8388608, 1]) cuda:0
model.layers.0.mlp.gate_proj.weight torch.uint8 torch.Size([29360128, 1]) cuda:0
model.layers.0.mlp.up_proj.weight torch.uint8 torch.Size([29360128, 1]) cuda:0
model.layers.0.mlp.down_proj.weight torch.uint8 torch.Size([29360128, 1]) cuda:0
model.layers.0.input_layernorm.weight torch.bfloat16 torch.Size([4096]) cuda:0
model.layers.0.post_attention_layernorm.weight torch.bfloat16 torch.Size([4096]) cuda:0
model.layers.1.self_attn.q_proj.weight torch.uint8 torch.Size([8388608, 1]) cuda:0
model.layers.1.self_attn.k_proj.weight torch.uint8 torch.Size([2097152, 1]) cuda:0
model.layers.1.self

In [14]:
# Display the model's repr as the cell output to inspect the module tree and the layer classes in use.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [13]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')
# Reuse the EOS token as the padding token so the batch below can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Example training data
texts = [
    "Hello, how are you?",
    "The quick brown fox jumps over the lazy dog."
]

# Tokenize into one padded batch of PyTorch tensors
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Move the batch to the device the model already lives on. The model was placed by
# `device_map="auto"` at load time, so it is not moved here.
device = model.device
input_ids = inputs["input_ids"].to(device)
attention_mask = inputs["attention_mask"].to(device)

# Causal-LM labels are the input ids, except that padded positions must not contribute
# to the loss: -100 is the label value the loss ignores. Mask by attention_mask rather
# than by token id, because the pad token and the EOS token share the same id here.
labels = input_ids.masked_fill(attention_mask == 0, -100)

# Set up the optimizer.
# NOTE(review): transformers' AdamW is deprecated upstream in favor of torch.optim.AdamW;
# consider switching (the default eps / weight_decay differ between the two).
optimizer = AdamW(model.parameters(), lr=5e-5)
# Not used in this step: the loss is taken from `outputs.loss` below.
loss_fn = nn.CrossEntropyLoss()

# Forward pass in training mode; passing `labels` makes the model return the loss.
model.train()
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs.loss

# Backward pass and a single optimizer update
optimizer.zero_grad()
loss.backward()
optimizer.step()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
