In [1]:
%env CUDA_VISIBLE_DEVICES=0

import torch
from transformers import LlamaForCausalLM
from rotation_utils import fuse_layer_norms, rotate_model

MODEL = "meta-llama/Meta-Llama-3.1-8B"
DEVICE = "cuda"

# Load the checkpoint on the CPU; no torch_dtype is passed, so the weights
# keep the library's default load precision (presumably fp32 — confirm).
model = LlamaForCausalLM.from_pretrained(MODEL, device_map="cpu")

# Both helpers are called only for their effect on `model`; their return
# values are ignored. DEVICE is handed to them as a separate argument.
fuse_layer_norms(model, DEVICE)
rotate_model(model, DEVICE)

# Cast to fp16 *before* moving to the GPU: the resulting fp16 weights are the
# same, but the host-to-device copy and the peak GPU footprint are those of
# the fp16 model instead of the full-precision one.
model = model.to(torch.float16)
model = model.to(DEVICE)

env: CUDA_VISIBLE_DEVICES=0


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Fusing layer norms:   0%|          | 0/32 [00:00<?, ?it/s]

Rotating:   0%|          | 0/32 [00:00<?, ?layer/s]

In [2]:
from transformers import AutoTokenizer

# Reuse the MODEL constant so the tokenizer and the weights always come from
# the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [3]:
# Smoke test: generate a continuation of "Hi!" with the default generation
# settings and decode the first returned sequence back to text.
prompt_inputs = tokenizer("Hi!", return_tensors='pt').to("cuda")
generated_ids = model.generate(**prompt_inputs)
tokenizer.decode(generated_ids[0].cpu())

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


"<|begin_of_text|>Hi! I'm new to the forum. I'm an amateur photographer, and I'm currently"

In [4]:
import torch
from torch import nn

from tqdm.auto import trange

@torch.no_grad()
def llama_eval(model, dataloader, dev):
    """Compute the perplexity of ``model`` on ``dataloader``, one decoder layer at a time.

    The inputs reaching the first decoder layer are captured for every batch.
    Each decoder layer is then moved to ``dev`` in turn, applied to all captured
    activations, and moved back to the CPU. Finally ``model.model.norm`` (when
    not ``None``) and ``model.lm_head`` are moved to ``dev`` and the shifted
    next-token cross-entropy is accumulated.

    Args:
        model: causal LM exposing ``config.use_cache``, ``model.embed_tokens``,
            ``model.layers``, ``model.norm`` and ``lm_head``. ``model.rotary_emb``
            is moved to ``dev`` as well when present.
        dataloader: sized, indexable sequence of token-id tensors. Every item is
            moved to ``dev`` and passed to ``model``; labels are taken as
            ``item[:, 1:]``, so items need at least two dimensions.
        dev: device on which the computation runs.

    Returns:
        float: ``exp`` of the token-weighted mean negative log-likelihood.

    Raises:
        ValueError: if ``dataloader`` is empty.
        RuntimeError: if the number of captured first-layer inputs differs from
            ``len(dataloader)``.

    Side effects:
        Prints progress and the final perplexity. On success the decoder layers
        and ``embed_tokens`` are left on the CPU, while ``norm``, ``lm_head``
        and ``rotary_emb`` are left on ``dev``. ``config.use_cache`` is always
        restored to its previous value.
    """
    print('Evaluating ...')

    nsamples = len(dataloader)
    if nsamples == 0:
        raise ValueError("llama_eval requires a non-empty dataloader")

    class _LayerInputCaptured(Exception):
        """Private sentinel: aborts the forward pass once layer-0 inputs are stored."""

    inps = []
    layer_kwargs = []

    class Catcher(nn.Module):
        """Stands in for the first decoder layer and records what it is called with."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps.append(inp)
            # Keep every keyword argument (mask, position ids, and whatever else
            # the installed transformers version passes) so later layers are
            # called exactly as the model itself would call them.
            layer_kwargs.append(kwargs)
            raise _LayerInputCaptured

    use_cache = model.config.use_cache
    model.config.use_cache = False
    try:
        layers = model.model.layers

        model.model.embed_tokens = model.model.embed_tokens.to(dev)
        if getattr(model.model, "rotary_emb", None) is not None:
            model.model.rotary_emb = model.model.rotary_emb.to(dev)
        layers[0] = layers[0].to(dev)

        layers[0] = Catcher(layers[0])
        try:
            for batch in dataloader:
                try:
                    model(batch.to(dev))
                except _LayerInputCaptured:
                    # Expected: only the sentinel is swallowed, so genuine
                    # errors raised by the model still propagate.
                    pass
        finally:
            # Always unwrap, even if capturing failed, so the model stays usable.
            layers[0] = layers[0].module

        if len(inps) != nsamples:
            raise RuntimeError(
                f"captured {len(inps)} first-layer inputs for {nsamples} batches"
            )

        layers[0] = layers[0].cpu()
        model.model.embed_tokens = model.model.embed_tokens.cpu()
        torch.cuda.empty_cache()

        for i in trange(len(layers), desc="Evaluating layer-by-layer..."):
            layer = layers[i].to(dev)
            for j in range(nsamples):
                # The layer output is indexed with [0] to get the hidden states.
                inps[j] = layer(inps[j], **layer_kwargs[j])[0]
            layers[i] = layer.cpu()
            del layer
            torch.cuda.empty_cache()

        if model.model.norm is not None:
            model.model.norm = model.model.norm.to(dev)
        model.lm_head = model.lm_head.to(dev)

        loss_fct = nn.CrossEntropyLoss()
        nlls = []
        total_tokens = 0
        for i in range(nsamples):
            hidden_states = inps[i]
            if model.model.norm is not None:
                hidden_states = model.model.norm(hidden_states)
            lm_logits = model.lm_head(hidden_states)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = dataloader[i].to(dev)[:, 1:]
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),  # reshape: the slice may be non-contiguous
            )
            # The loss is a per-token mean, so weight it by the number of tokens
            # actually scored instead of a hard-coded sequence length.
            n_tokens = shift_labels.numel()
            nlls.append(loss.float() * n_tokens)
            total_tokens += n_tokens

        ppl_value = torch.exp(torch.stack(nlls).sum() / total_tokens).item()
        print(ppl_value)
    finally:
        model.config.use_cache = use_cache

    return ppl_value

In [5]:
from higgs import quantize_layer_higgs


@torch.inference_mode()
def quantize_model_higgs(model, higgs_d: int, higgs_n: int):
    """Overwrite every attention/MLP projection weight with ``quantize_layer_higgs`` output.

    For each entry of ``model.model.layers`` the seven projections are processed
    in the order q, k, v, o, gate, up, down; each ``weight.data`` is replaced by
    ``quantize_layer_higgs(weight.data, flag, higgs_d, higgs_n)``.

    Args:
        model: object whose ``model.layers`` items expose ``self_attn`` and ``mlp``
            sub-modules with the projections listed below.
        higgs_d: forwarded unchanged as the third argument of ``quantize_layer_higgs``.
        higgs_n: forwarded unchanged as the fourth argument of ``quantize_layer_higgs``.

    Returns:
        None. The model is modified in place.
    """
    # (parent attribute, projection attribute, second positional argument).
    # The flag is 0 for o_proj/down_proj and 1 for the rest; presumably it
    # selects the axis the helper works along — confirm against `higgs`.
    projection_plan = (
        ("self_attn", "q_proj", 1),
        ("self_attn", "k_proj", 1),
        ("self_attn", "v_proj", 1),
        ("self_attn", "o_proj", 0),
        ("mlp", "gate_proj", 1),
        ("mlp", "up_proj", 1),
        ("mlp", "down_proj", 0),
    )
    for block in model.model.layers:
        for parent_name, proj_name, flag in projection_plan:
            weight = getattr(getattr(block, parent_name), proj_name).weight
            weight.data = quantize_layer_higgs(weight.data, flag, higgs_d, higgs_n)
        
# Quantize `model` in place with higgs_d=2 and higgs_n=256 (the "HIGGS 2d256"
# configuration listed in the results table below).
quantize_model_higgs(model, 2, 256)

  GRIDS[dim][size] = torch.load(file)


In [6]:
import sys
sys.path.append("..")
from gptq.datautils import get_loaders

datasets = ['wikitext2']
# Keep every dataset's perplexity: `ppl` alone is overwritten on each iteration,
# so with more than one dataset only the last result would survive.
ppl_by_dataset = {}
for dataset in datasets:
    # get_loaders returns a pair; only the second element is evaluated here
    # (presumably the held-out test split — confirm against gptq.datautils).
    dataloader, testloader = get_loaders(
        dataset, seed=0, model=MODEL, seqlen=8192
    )
    ppl = llama_eval(model, testloader, DEVICE)
    ppl_by_dataset[dataset] = ppl

Token indices sequence length is longer than the specified maximum sequence length for this model (2436214 > 131072). Running this sequence through the model will result in indexing errors


Evaluating ...


Evaluating layer-by-layer...:   0%|          | 0/32 [00:00<?, ?it/s]

The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


6.358399868011475


### WikiText-2 perplexity (PPL)

FP32: `5.60677433013916`

FP16: `5.606886386871338`

MR -> FP32: `5.607339382171631`

MR -> FP16: `5.60782527923584`

MR -> HIGGS 2d256: `6.358399868011475`

OR -> HIGGS 2d256: `6.015`


Legend: MR = Merged Rotations, OR = Online Rotations.

In [7]:
# Load a second copy of the same checkpoint on the CPU. It is not passed through
# the fuse/rotate/quantize steps and serves as the reference for the weight
# comparisons in the cells below.
ref_model = LlamaForCausalLM.from_pretrained(MODEL, device_map="cpu")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
from hadamard import random_hadamard_matrix

# Size Q from the model config rather than a hard-coded 4096, and reuse DEVICE,
# so the cell keeps working if MODEL or DEVICE is changed.
# NOTE(review): the comparisons below are only meaningful if this Q equals the
# matrix applied inside rotate_model — presumably random_hadamard_matrix is
# deterministic or seeded; confirm in `hadamard`.
Q = random_hadamard_matrix(model.config.hidden_size, DEVICE)

In [40]:
# Relative squared error between layer 0's processed q_proj weight and a
# reference rebuilt from the untouched checkpoint.
ref_block = ref_model.model.layers[0]

quantized = model.model.layers[0].self_attn.q_proj.weight.data.clone().detach().cuda()

# Reference: q_proj weight scaled by the input layernorm weight (broadcast
# along the first axis), promoted to float64, then right-multiplied by Q.
scaled_q_proj = ref_block.self_attn.q_proj.weight.data * ref_block.input_layernorm.weight.data[None, :]
ref = scaled_q_proj.double().cuda() @ Q

squared_error = (quantized - ref).pow(2).sum()
float(squared_error / ref.pow(2).sum())

0.007672099746861981

In [41]:
# Relative squared error between layer 0's processed o_proj weight and a
# reference rebuilt from the untouched checkpoint.
quantized = model.model.layers[0].self_attn.o_proj.weight.data.clone().detach().cuda()

# Reference: the original o_proj weight in float64, left-multiplied by Q
# transposed.
o_proj_weight = ref_model.model.layers[0].self_attn.o_proj.weight.data
ref = Q.T @ o_proj_weight.double().cuda()

squared_error = (quantized - ref).pow(2).sum()
float(squared_error / ref.pow(2).sum())

0.007760077692277988

In [42]:
# Relative squared error between layer 0's processed gate_proj weight and a
# reference rebuilt from the untouched checkpoint.
ref_block = ref_model.model.layers[0]

quantized = model.model.layers[0].mlp.gate_proj.weight.data.clone().detach().cuda()

# Reference: gate_proj weight scaled by the post-attention layernorm weight
# (broadcast along the first axis), promoted to float64, then right-multiplied
# by Q.
scaled_gate_proj = ref_block.mlp.gate_proj.weight.data * ref_block.post_attention_layernorm.weight.data[None, :]
ref = scaled_gate_proj.double().cuda() @ Q

squared_error = (quantized - ref).pow(2).sum()
float(squared_error / ref.pow(2).sum())

0.00774175502160158

In [43]:
# Relative squared error between layer 0's processed down_proj weight and a
# reference rebuilt from the untouched checkpoint.
quantized = model.model.layers[0].mlp.down_proj.weight.data.clone().detach().cuda()

# Reference: the original down_proj weight in float64, left-multiplied by Q
# transposed.
down_proj_weight = ref_model.model.layers[0].mlp.down_proj.weight.data
ref = Q.T @ down_proj_weight.double().cuda()

squared_error = (quantized - ref).pow(2).sum()
float(squared_error / ref.pow(2).sum())

0.007750276103880045