# Transformers 模型量化技术：AWQ（OPT-2.7B）

- 说明：由于资源有限、模型上传也很困难，原本打算改用 `opt-125m`；但下文保存的代码路径与输出（32 层、隐藏维度 2560）对应的仍是 `opt-2.7b`。

## 使用 AutoAWQ 量化模型

下面我们以 `facebook/opt-2.7b` 模型为例，使用 `AutoAWQ` 库提供的 AWQ 算法对模型进行量化。

In [12]:
from datasets import load_dataset

# Local directory holding the mit-han-lab/pile-val-backup dataset. Defined once
# so later cells (tokenizer loading, AWQ calibration) can refer to the same path.
dataset_path = "/mnt/workspace/dataset/mit-han-lab/pile-val-backup"
# Reuse the variable rather than repeating the literal, so the loaded dataset
# and the path handed to later cells cannot drift apart.
datasets = load_dataset(path=dataset_path)



In [2]:
# Echo the loaded object so its splits, features and row counts show as cell output.
datasets

DatasetDict({
    validation: Dataset({
        features: ['text', 'meta'],
        num_rows: 214670
    })
})

In [6]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Directory of the source checkpoint that will be quantized.
model_name_or_path = "/mnt/workspace/models/facebook/opt-2.7b"
# Output directory; the quantized weights and tokenizer files are saved here.
quant_model_dir = "/mnt/workspace/models/facebook/opt-2.7b-autoawq"

In [10]:
# AutoAWQ quantization settings, passed to model.quantize() and later mapped
# onto transformers.AwqConfig (w_bit -> bits, q_group_size -> group_size).
quant_config = {
    "zero_point": True,   # forwarded as AwqConfig(zero_point=...)
    "q_group_size": 128,  # quantization group size
    "w_bit": 4,           # weight bit-width
    "version": "GEMM",    # lower-cased before being handed to AwqConfig
}

In [13]:
# Load the model to be quantized.
# NOTE(review): flash_attention_2 is requested while the model is first
# initialised on CPU, which is what produces the "not initialized on GPU"
# warning in this cell's output. Confirm it is actually wanted for calibration;
# it may be worth re-running without it if the generated text stays degenerate.
model = AutoAWQForCausalLM.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
)
# Load the matching tokenizer. `data_files` is a dataset-loading argument, not a
# tokenizer option, so the dataset path is no longer forwarded here.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [18]:
# Quantize the model.
# Calibration text is read from the local dataset directory; "validation" is the
# only split that dataset exposes.
model.quantize(
    tokenizer,
    quant_config=quant_config,
    calib_data=dataset_path,
    split="validation",
)

AWQ: 100%|██████████| 32/32 [09:26<00:00, 17.70s/it]


In [19]:
from transformers import AwqConfig, AutoConfig

# Re-express the AutoAWQ settings as a transformers AwqConfig so the saved
# checkpoint's config is compatible with the transformers integration.
quantization_config = AwqConfig(
    version=quant_config["version"].lower(),  # "GEMM" -> "gemm"
    zero_point=quant_config["zero_point"],
    group_size=quant_config["q_group_size"],
    bits=quant_config["w_bit"],
).to_dict()

# The wrapped transformers model lives in `model.model`; its config receives the
# plain-dict form of the quantization settings.
model.model.config.quantization_config = quantization_config

# Save the quantized weights.
model.save_quantized(quant_model_dir)
# Save the tokenizer next to them; as the last expression, the tuple of written
# file paths is shown as the cell output.
tokenizer.save_pretrained(quant_model_dir)

('/mnt/workspace/models/facebook/opt-2.7b-autoawq/tokenizer_config.json',
 '/mnt/workspace/models/facebook/opt-2.7b-autoawq/special_tokens_map.json',
 '/mnt/workspace/models/facebook/opt-2.7b-autoawq/vocab.json',
 '/mnt/workspace/models/facebook/opt-2.7b-autoawq/merges.txt',
 '/mnt/workspace/models/facebook/opt-2.7b-autoawq/added_tokens.json',
 '/mnt/workspace/models/facebook/opt-2.7b-autoawq/tokenizer.json')

In [8]:
# Echo the AutoAWQ settings dict for reference.
quant_config

{'zero_point': True, 'q_group_size': 128, 'w_bit': 4, 'version': 'GEMM'}

#### Transformers 兼容性配置

为了使 `quant_config` 与 transformers 兼容，我们需要修改配置文件：使用 `transformers.AwqConfig` 来实例化量化模型配置。

In [9]:
from transformers import AwqConfig, AutoConfig

# Adapt the AutoAWQ settings to the transformers integration by rebuilding them
# as an AwqConfig and converting it to a plain dict.
# NOTE(review): this repeats the conversion already performed in cell In [19].
quantization_config = AwqConfig(
    version=quant_config["version"].lower(),  # "GEMM" -> "gemm"
    zero_point=quant_config["zero_point"],
    group_size=quant_config["q_group_size"],
    bits=quant_config["w_bit"],
).to_dict()

# The pretrained transformers model is stored in the `model` attribute of the
# AWQ wrapper; its config must be given a dictionary.
model.model.config.quantization_config = quantization_config

In [10]:
# Save the quantized model weights.
# NOTE(review): cell In [19] already saves to the same directory; the output
# stored under this cell shows a different path and appears to be stale.
model.save_quantized(quant_model_dir)
# Save the tokenizer; the returned tuple of file paths is the cell output.
tokenizer.save_pretrained(quant_model_dir)

('models/opt-2.7b-awq/tokenizer_config.json',
 'models/opt-2.7b-awq/special_tokens_map.json',
 'models/opt-2.7b-awq/vocab.json',
 'models/opt-2.7b-awq/merges.txt',
 'models/opt-2.7b-awq/added_tokens.json',
 'models/opt-2.7b-awq/tokenizer.json')

In [11]:
# Put the wrapper in evaluation mode; the value returned by eval() is echoed as
# the cell output, which lets us inspect the quantized layer types.
model.eval()

OptAWQForCausalLM(
  (model): OPTForCausalLM(
    (model): OPTModel(
      (decoder): OPTDecoder(
        (embed_tokens): Embedding(50272, 2560, padding_idx=1)
        (embed_positions): OPTLearnedPositionalEmbedding(2050, 2560)
        (final_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-31): 32 x OPTDecoderLayer(
            (self_attn): OPTAttention(
              (k_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
              (v_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
              (q_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
              (out_proj): WQLinear_GEMM(in_features=2560, out_features=2560, bias=True, w_bit=4, group_size=128)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((2560,), eps=1e-05, elementwise_affin

### 使用 GPU 加载量化模型

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer that was saved alongside the quantized weights.
tokenizer_awq = AutoTokenizer.from_pretrained(quant_model_dir)
# Place the quantized model on GPU 0 directly at load time. The original passed
# device_map="cuda" and then called .to(0), moving the model a second time;
# one explicit device_map is sufficient.
model_awq = AutoModelForCausalLM.from_pretrained(quant_model_dir, device_map="cuda:0")

In [21]:
def generate_text(text, tokenizer, model, max_new_tokens=64):
    """Generate a continuation of ``text`` and return it as a decoded string.

    Args:
        text: Prompt string to continue.
        tokenizer: Tokenizer object; called with ``return_tensors="pt"`` to
            encode the prompt and used again to decode the generated ids.
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        max_new_tokens: Upper bound forwarded to ``model.generate``.
            Defaults to 64, the value that was previously hard-coded.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # Send the inputs to wherever the model lives instead of a hard-coded GPU
    # index; fall back to device 0 (the previous behaviour) if the model does
    # not expose a ``device`` attribute.
    device = getattr(model, "device", 0)
    inputs = tokenizer(text, return_tensors="pt").to(device)

    out = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(out[0], skip_special_tokens=True)


In [23]:
# Sanity-check the quantized model on a short prompt.
# NOTE(review): the output saved under this cell is degenerate (repeated "M"
# lines), so the quantized checkpoint should be re-validated.
result = generate_text(
    "Merry Christmas! I'm glad to",
    tokenizer_awq,
    model_awq,
)
print(result)

Merry Christmas! I'm glad to
M?m
M
Mm

M
Mm
M

M
M
M
M

M
M
M
M
M
M
M
M
M
M

M

M
M
M
M

M
M


In [24]:
# Second prompt for a qualitative check of the quantized model.
# NOTE(review): the output saved under this cell is also degenerate (it just
# repeats "the" / "woman").
result = generate_text(
    "The woman worked as a",
    tokenizer_awq,
    model_awq,
)
print(result)

The woman worked as a the woman
The the woman woman
The the the the the the the
