In [1]:
import os
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

- multi-turn conversation

In [2]:
import os

from copy import deepcopy
from random import randrange
from functools import partial

import torch
import accelerate
import bitsandbytes as bnb

import wandb

from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from transformers.integrations import WandbCallback
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    PeftModel
)
from trl import SFTTrainer

[2024-08-26 23:02:51,199] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




### model & tokenizer

In [3]:
model_name = 'mistralai/Mistral-7B-v0.1'

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",  # Auto selects device to put model on.
)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [23]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

#### bnb config

In [35]:
# BitsAndBytesConfig??

https://huggingface.co/blog/4bit-transformers-bitsandbytes

A rule of thumb is: use double quant if you have problems with memory, use NF4 for higher precision, and use a 16-bit dtype for faster finetuning.

- `load_in_4bit`: 4位量化,但PyTorch实际上并不直接支持4位数据类型。
- 4位量化的值被打包存储在8位(uint8)张量中。每个uint8值实际上包含了两个4位值。
- model.lm_head.weight.dtype: torch.float16
    - 出于输出精度的考虑
- bnb_4bit_use_double_quant
    - 2.4330 vs. 2.2356
- bnb_4bit_compute_dtype
    - default: float32
- 关于 storage 与 compute
    - QLoRA has one storage data type (usually 4-bit NormalFloat) for the base model weights and a computation data type (16-bit BrainFloat) used to perform computations.
    - QLoRA dequantizes weights from the storage data type to the computation data type to perform the forward and backward passes, but only computes weight gradients for the LoRA parameters which use 16-bit bfloat. The weights are decompressed only when they are needed, therefore the memory usage stays low during training and inference.

In [25]:
model.model.layers[0].self_attn.q_proj

In [26]:
model.model.layers[0].self_attn.q_proj, model.model.layers[0].self_attn.q_proj.weight.shape

(Linear4bit(in_features=4096, out_features=4096, bias=False),
 torch.Size([8388608, 1]))

In [27]:
4096*4096/2

8388608.0

In [30]:
model.lm_head.weight.dtype

torch.float16

In [32]:
2.2356/2.4330

0.918865598027127

#### prepare_model_for_kbit_training

In [6]:
prepare_model_for_kbit_training??

[0;31mSignature:[0m
[0mprepare_model_for_kbit_training[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mmodel[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_gradient_checkpointing[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgradient_checkpointing_kwargs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
[0;32mdef[0m [0mprepare_model_for_kbit_training[0m[0;34m([0m[0mmodel[0m[0;34m,[0m [0muse_gradient_checkpointing[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m [0mgradient_checkpointing_kwargs[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0;34mr"""[0m
[0;34m    Note this method only works for `transformers` models.[0m
[0;34m[0m
[0;34m    This method wraps the entire protocol for preparing a model before running a training. This includes:[0m
[0;34m        1- Cast the layernorm in fp32 2- making output embedding layer require grads 

In [7]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)  # Explicitly specify!

#### Quantization

- bnb_4bit_use_double_quant
    - Nested quantization
    - uses a second quantization after the first one to save an additional 0.4 bits per parameter.
- While 4-bit bitsandbytes stores weights in 4-bits, the computation still happens in 16 or 32-bit and here any combination can be chosen (float16, bfloat16, float32 etc).

### lora

In [9]:
def find_all_linear_names(model):
    #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    cls = bnb.nn.Linear4bit 
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])


    # lm_head is often excluded.
    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
modules

['o_proj', 'gate_proj', 'up_proj', 'down_proj', 'v_proj', 'k_proj', 'q_proj']

In [10]:
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=modules,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)

In [11]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


### dataset

In [13]:
dataset = load_dataset("LDJnr/Puffin", split="train")

Downloading readme:   0%|          | 0.00/2.70k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 14.4M/14.4M [00:03<00:00, 4.46MB/s]


Generating train split:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [14]:
dataset

Dataset({
    features: ['id', 'conversations'],
    num_rows: 3000
})

In [15]:
dataset[randrange(len(dataset))]

{'id': 962,
 'conversations': [{'from': 'human',
   'value': 'I work from multiple computers during the day. One is my email/chat/documents computer. Another is my development/testing computer, which is not domain joined. The problem I have is that Teams shows me as "Away", and I\'m worried about the perception that I\'m not working. My boss knows that I\'m working, because I produce a LOT of good work. I\'m considered one of the best IT people in the company. Can you help me write a powershell script that keeps Teams from changing my status to away?'},
  {'from': 'gpt',
   'value': 'While I understand your concern about the perception of being "Away" in Teams, it\'s important to consider the ethical implications of creating a script to manipulate your status. Modifying your status in such a way could be considered dishonest and may potentially breach your company\'s policies or code of conduct.\n\nInstead, I\'d recommend discussing this issue with your boss and your team. Here\'s a su

In [21]:
for d in dataset[0]['conversations']:
    print(d['from'])

human
gpt
human
gpt
human
gpt
