In [2]:
import os
import json
import torch
import argparse
from transformers import AutoTokenizer
from deltazip import AutoDeltaZipModelForCausalLM, BaseCompressionConfig

In [3]:
# --- Configuration -----------------------------------------------------------
# Hugging Face repo ids. They live in separate *_name variables so that
# re-running this cell never hands an already-loaded model object back to
# from_pretrained().
base_model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
target_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Delta mode is on when both the base model id and `delta` are non-empty.
delta = "subtract"
delta_enabled = base_model_name != "" and delta != ""

tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
compress_config = BaseCompressionConfig(
    bits=4,
    sparsity=0.5,
    prunen=0,
    block_size=128,
    prunem=0,
    lossless='gdeflate',
    damp_percent=0.01,
    sym=False,
)
shuffle_dataset = True
dataset = ".local/datasets/lmsys.jsonl"
n_samples = 128  # <= 0 keeps every example, in file order
# NOTE(review): the directory suffix reads "4b0s128g" while the config above
# uses sparsity=0.5 -- confirm the naming is intentional.
outdir = ".cache/compressed_models/TinyLlama.TinyLlama-1.1B-Chat-v1.0.4b0s128g"

# --- Load models -------------------------------------------------------------
print("[info] compress config:", compress_config)
target_model = AutoDeltaZipModelForCausalLM.from_pretrained(
    target_model_name,
    compress_config=compress_config,
    torch_dtype=torch.float16,
    # Tight per-device memory caps; keys 0 and 1 are presumably GPU indices
    # (the recorded output shows cuda:0 / cuda:1) -- TODO confirm.
    max_memory={0: "400MIB", 1: "400MIB", "cpu": "140GIB"},
)
target_model.requires_grad_(False)

# Later cells test `base_model != ""`, so the name stays bound to the repo id
# when delta mode is off and is rebound to the loaded model when it is on.
base_model = base_model_name
if delta_enabled:
    print("[info] base model is defined, delta mode enabled")
    base_model = AutoDeltaZipModelForCausalLM.from_pretrained(
        base_model_name,
        compress_config=compress_config,
        torch_dtype=torch.float16,
    )
    base_model.requires_grad_(False)
torch.cuda.empty_cache()
if delta_enabled:
    print(f"target model on {target_model.device}, base model on {base_model.device}")
else:
    # Without delta mode `base_model` is still a string and has no `.device`.
    print(f"target model on {target_model.device}")

# --- Prepare the calibration dataset ------------------------------------------
# One JSON object per line, each with a "text" field; blank lines are skipped
# so a trailing newline does not break json.loads().
with open(dataset, "r", encoding="utf-8") as fp:
    examples = [json.loads(line)["text"] for line in fp if line.strip()]
if n_samples > 0:
    if shuffle_dataset:
        import random

        # Fixed seed so the same subset is selected on every run.
        random.seed(42)
        random.shuffle(examples)
    examples = examples[:n_samples]
examples = [tokenizer(x) for x in examples]

# --- Compress ----------------------------------------------------------------
# The base model is only passed along in delta mode.
compress_kwargs = {"batch_size": 1}
if delta_enabled:
    compress_kwargs["base_model"] = base_model
target_model.lossy_compress(examples, **compress_kwargs)

# write to folder
os.makedirs(outdir, exist_ok=True)

[info] compress config: BaseCompressionConfig(bits=4, sparsity=0.5, prunen=0, prunem=0, group_size=-1, group_rows=-1, block_size=128, damp_percent=0.01, desc_act=True, sym=False, true_sequential=True, lossless='gdeflate', dtype='fp16')




[info] base model is defined, delta mode enabled


2024-04-06 16:22:38.875 | INFO     | deltazip.modeling._base:from_pretrained:699 - Using [None] to load model.


target model on cuda:0, base model on cpu


2024-04-06 16:22:43.451 | DEBUG    | deltazip.modeling._base:lossy_compress:425 - Compression self_attn.k_proj in layer 1/22 - sparsity: 0.5, bits: 4
2024-04-06 16:22:43.455 | INFO     | deltazip.core.sparsegpt:fasterprune:59 - compression operates on delta...
2024-04-06 16:22:44.024 | DEBUG    | deltazip.core.sparsegpt:fasterprune:165 - adding base weight for correct forward...
2024-04-06 16:22:44.025 | INFO     | deltazip.core.sparsegpt:fasterprune:175 - duration: 0.53s
2024-04-06 16:22:44.026 | INFO     | deltazip.core.sparsegpt:fasterprune:176 - sparsity: 0.5076503753662109
2024-04-06 16:22:44.028 | INFO     | deltazip.core.sparsegpt:fasterprune:182 - avg loss: 2.1982583348290063e-05
2024-04-06 16:22:44.034 | DEBUG    | deltazip.modeling._base:lossy_compress:425 - Compression self_attn.v_proj in layer 1/22 - sparsity: 0.5, bits: 4
2024-04-06 16:22:44.040 | INFO     | deltazip.core.sparsegpt:fasterprune:59 - compression operates on delta...
2024-04-06 16:22:44.528 | DEBUG    | delta

model.embed_tokens.weight: cuda:0
model.layers.0.self_attn.q_proj.weight: cuda:0
model.layers.0.self_attn.k_proj.weight: cuda:0
model.layers.0.self_attn.v_proj.weight: cuda:0
model.layers.0.self_attn.o_proj.weight: cuda:0
model.layers.0.mlp.gate_proj.weight: cuda:0
model.layers.0.mlp.up_proj.weight: cuda:0
model.layers.0.mlp.down_proj.weight: cuda:0
model.layers.0.input_layernorm.weight: cuda:0
model.layers.0.post_attention_layernorm.weight: cuda:0
model.layers.1.self_attn.q_proj.weight: cuda:1
model.layers.1.self_attn.k_proj.weight: cuda:1
model.layers.1.self_attn.v_proj.weight: cuda:1
model.layers.1.self_attn.o_proj.weight: cuda:1
model.layers.1.mlp.gate_proj.weight: cuda:1
model.layers.1.mlp.up_proj.weight: cuda:1
model.layers.1.mlp.down_proj.weight: cuda:1
model.layers.1.input_layernorm.weight: cuda:1
model.layers.1.post_attention_layernorm.weight: cuda:1
model.layers.2.self_attn.q_proj.weight: cuda:1
model.layers.2.self_attn.k_proj.weight: cuda:1
model.layers.2.self_attn.v_proj.we

In [4]:
# Save a safetensors copy of the materialised weights, just in case.
# The submodule is imported explicitly: a bare `import safetensors` is not
# guaranteed to load `safetensors.torch`, so `safetensors.torch.save_file`
# only resolves if some other package happened to import it first.
from safetensors.torch import save_file

# `tensors` is reused by a later cell to tell which keys were already captured.
tensors = {}
for name, param in target_model.named_parameters():
    # Parameters on the meta device carry no data, so there is nothing to copy.
    if not param.is_meta:
        tensors[name] = param.data.cpu().clone().detach()

# Make sure the target directory exists even if this cell is run on its own.
os.makedirs(outdir, exist_ok=True)
save_file(tensors, os.path.join(outdir, "temp.safetensors"))

In [6]:
# Print the name of every parameter whose `is_meta` flag is set.
meta_param_names = (
    param_name
    for param_name, tensor in target_model.named_parameters()
    if tensor.is_meta
)
for meta_param_name in meta_param_names:
    print(meta_param_name)

model.model.layers.5.input_layernorm.weight
model.model.layers.5.post_attention_layernorm.weight
model.model.layers.6.input_layernorm.weight
model.model.layers.6.post_attention_layernorm.weight
model.model.layers.7.input_layernorm.weight
model.model.layers.7.post_attention_layernorm.weight
model.model.layers.8.input_layernorm.weight
model.model.layers.8.post_attention_layernorm.weight
model.model.layers.9.input_layernorm.weight
model.model.layers.9.post_attention_layernorm.weight
model.model.layers.10.input_layernorm.weight
model.model.layers.10.post_attention_layernorm.weight
model.model.layers.11.input_layernorm.weight
model.model.layers.11.post_attention_layernorm.weight
model.model.layers.12.input_layernorm.weight
model.model.layers.12.post_attention_layernorm.weight
model.model.layers.13.input_layernorm.weight
model.model.layers.13.post_attention_layernorm.weight
model.model.layers.14.input_layernorm.weight
model.model.layers.14.post_attention_layernorm.weight
model.model.layers.1

In [3]:
# For weights that were not compressed, compute the delta (target - base)
# after compression. A parameter is handled here when its name contains
# "bias" or matches none of the compressed module names.
if base_model != "" and delta != "":
    # `inside_layer_modules` is iterated as a nested collection; flatten it.
    compressed_modules = [
        module for group in base_model.inside_layer_modules for module in group
    ]
    # Build the base state dict once instead of on every loop iteration.
    base_state_dict = base_model.state_dict()
    skipped_meta = []
    with torch.no_grad():
        for name, param in target_model.named_parameters():
            matches_compressed = any(module in name for module in compressed_modules)
            if "bias" not in name and matches_compressed:
                continue
            base_weight = base_state_dict[name]
            # A meta tensor has no data, so there is nothing to subtract.
            if param.is_meta or base_weight.is_meta:
                skipped_meta.append(name)
                continue
            if base_weight.device != param.device:
                base_weight = base_weight.to(param.device)
            # Subtract in place. state_dict() builds a new dict on each call,
            # so assigning a tensor to one of its keys never reaches the model.
            param.sub_(base_weight)
    if skipped_meta:
        print(
            f"[warn] {len(skipped_meta)} meta parameters were left untouched "
            "(no data to subtract from)"
        )
del base_model
print(target_model.device)

cuda:0


In [32]:
# Fill in the tensors that are absent from `tensors` (the snapshot taken in an
# earlier cell) using a freshly loaded copy of the fine-tuned checkpoint, then
# write the compressed model to `outdir`.
# NOTE(review): the restored tensors are the fine-tuned model's own weights,
# not deltas against the base model -- confirm the loader expects that.
target_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
target_model_ref = AutoDeltaZipModelForCausalLM.from_pretrained(
    target_model_name,
    compress_config=compress_config,
    torch_dtype=torch.float16,
)
missing_state_dict = {
    key: value
    for key, value in target_model_ref.state_dict().items()
    if key not in tensors
}
# strict=False: only a subset of the keys is supplied.
# assign=True: the supplied tensors replace the existing ones instead of being
# copied into them, which matters for meta parameters that have no storage.
target_model.load_state_dict(missing_state_dict, strict=False, assign=True)

# Report anything that is still on the meta device after the restore.
still_meta = [
    param_name
    for param_name, tensor in target_model.named_parameters()
    if tensor.is_meta
]
for name in still_meta:
    print(name)
if still_meta:
    # A meta parameter has no data to serialise; stop here rather than write
    # an incomplete checkpoint.
    raise RuntimeError(
        f"{len(still_meta)} parameters are still on the meta device; "
        "refusing to save an incomplete model"
    )
target_model.save_compressed(outdir)

2024-04-06 17:08:22.646 | INFO     | deltazip.modeling._base:from_pretrained:699 - Using [None] to load model.
The cos_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
The sin_cached attribute will be removed in 4.39. Bear in mind that its contents changed in v4.38. Use the forward method of RoPE from now on instead. It is not used in the `LlamaAttention` class
2024-04-06 17:08:23.350 | INFO     | deltazip.modeling._utils:pack_model:168 - model.layers.0.self_attn.k_proj
2024-04-06 17:08:23.351 | INFO     | deltazip.modeling._utils:pack_model:179 - g_idx: torch.Size([2048])
2024-04-06 17:08:23.406 | INFO     | deltazip.modeling._utils:pack_model:168 - model.layers.0.self_attn.o_proj
2024-04-06 17:08:23.407 | INFO     | deltazip.modeling._utils:pack_model:179 - g_idx: torch.Size([2048])
2024-04-06 17:08:23.539 | INFO     | deltazip.modeling._utils:pac