In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Add the parent directory (relative to the working directory) to the import search path.
sys.path.append("../")

In [3]:
import os
import torch

# Order CUDA devices by PCI bus ID and restrict this process to device index 6.
# Both variables are written before the first CUDA query below.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "6",
    }
)

# Use CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
def model_size_in_bytes(model):
    """
    Report how many bytes the parameters of a model occupy.

    Only the tensors yielded by ``model.parameters()`` are counted; each one
    contributes its element count multiplied by its per-element byte width.

    Args:
    model: The PyTorch model.

    Returns:
    int: Total parameter storage in bytes (0 for a model without parameters).
    """
    return sum(p.numel() * p.element_size() for p in model.parameters())

In [38]:
import numpy as np
import torch
import torch.nn as nn
import repitl.matrix_itl as itl

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig


# Model identifier passed to `from_pretrained` and to the tokenizer loader in this notebook.
pretrained_model_dir = "facebook/opt-125m"
# Directory the quantized weights are saved to and later reloaded from.
quantized_model_dir = "facebook_opt-125m-4bit-128g"


# os.makedirs(quantized_model_dir, exist_ok=True)
def get_wikitext2(nsamples, seed, seqlen, model):
    """
    Build calibration samples and the tokenized test split from WikiText-2.

    The train and test splits of ``wikitext-2-raw-v1`` are each joined with
    blank lines and tokenized as one long sequence. ``nsamples`` windows of
    ``seqlen`` consecutive tokens are then drawn at random start offsets from
    the tokenized train split.

    Args:
        nsamples (int): Number of calibration windows to draw.
        seed (int): Seed applied to ``random``, NumPy and torch so the draw is
            reproducible. Only ``random`` is used for the sampling itself.
        seqlen (int): Number of tokens per calibration window.
        model (str): Name or path handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        tuple: ``(traindataset, testenc)`` where ``traindataset`` is a list of
        ``nsamples`` dicts with keys ``"input_ids"`` (a ``[:, i:i + seqlen]``
        slice of the train token ids) and ``"attention_mask"`` (all ones, same
        shape), and ``testenc`` is the tokenizer output for the whole test split.

    Raises:
        ValueError: If samples are requested but the tokenized train split is
            too short to contain a window of ``seqlen`` tokens.
    """
    import random

    from datasets import load_dataset
    from transformers import AutoTokenizer

    traindata = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    testdata = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    # Prefer the slow tokenizer; fall back to the fast one when it cannot be loaded.
    try:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
    except Exception:
        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
    trainenc = tokenizer("\n\n".join(traindata["text"]), return_tensors="pt")
    testenc = tokenizer("\n\n".join(testdata["text"]), return_tensors="pt")

    # Largest start offset that still leaves room for a full window.
    max_start = trainenc.input_ids.shape[1] - seqlen - 1
    if nsamples > 0 and max_start < 0:
        raise ValueError(
            f"Tokenized train split has {trainenc.input_ids.shape[1]} tokens, "
            f"which is too short for windows of seqlen={seqlen}."
        )

    # Seed every RNG from the caller's seed (previously NumPy and torch were
    # always seeded with 0, silently ignoring the argument).
    random.seed(seed)
    np.random.seed(seed % (2**32))  # NumPy only accepts seeds in [0, 2**32)
    torch.random.manual_seed(seed)

    traindataset = []
    for _ in range(nsamples):
        start = random.randint(0, max_start)
        inp = trainenc.input_ids[:, start : start + seqlen]
        traindataset.append({"input_ids": inp, "attention_mask": torch.ones_like(inp)})
    return traindataset, testenc



def normalize(R):
    """
    Center the columns of a matrix and scale every row to unit L2 norm.

    The per-column mean (over ``dim=0``) is subtracted first, then each row is
    divided by its L2 norm. The computation runs under ``torch.no_grad()`` and
    the input tensor is not modified.
    From https://github.com/waltonfuture/Matrix-Entropy

    Args:
        R (torch.Tensor): 2-D floating-point matrix to be normalized.

    Returns:
        torch.Tensor: Matrix of the same shape. Rows that coincide with the
        column mean (zero norm after centering) are returned as all zeros
        instead of NaN.

    """
    with torch.no_grad():
        centered = R - R.mean(dim=0)
        norms = torch.norm(centered, p=2, dim=1, keepdim=True)
        # A zero-norm row would give 0/0 = NaN and poison every downstream
        # matrix product; clamp to the smallest normal value of the dtype so
        # the guard also works for float16 (where 1e-12 would round to 0).
        norms = norms.clamp_min(torch.finfo(norms.dtype).tiny)
        return centered / norms

@torch.no_grad()
def opt_eval(model, testenc, dev, seqlen=2048):
    """
    Evaluate a causal LM with an OPT-style decoder one layer at a time.

    The token ids are cut into ``nsamples = numel // seqlen`` consecutive
    windows of ``seqlen`` tokens. The inputs of the first decoder layer are
    captured by temporarily wrapping it, then every decoder layer is moved to
    ``dev`` in turn, applied to all windows and moved back to the CPU. For
    each window the final hidden states are used to compute:

    * the mean cross-entropy of next-token prediction, scaled by ``seqlen``;
      perplexity is ``exp(sum / (nsamples * seqlen))``;
    * ``itl.matrixAlphaEntropy(cov, alpha=1)`` where ``cov`` is the
      trace-normalized product of the ``normalize``-d hidden states with
      their transpose (the smaller of the two possible products).

    ``model.config.use_cache`` and the wrapped first layer are restored even
    when the evaluation raises.

    Args:
        model: Model exposing ``model.decoder`` (``layers``, ``embed_tokens``,
            ``embed_positions``, ``final_layer_norm`` and optionally
            ``project_in`` / ``project_out``), ``lm_head`` and ``config``
            (``use_cache``, ``hidden_size``).
        testenc: Object with an ``input_ids`` tensor indexed as
            ``[:, start:stop]`` (assumed to have a leading dimension of 1 —
            TODO confirm against the tokenizer output).
        dev: Device the active layer and the activations are placed on.
        seqlen (int): Window length in tokens.

    Returns:
        dict: ``"ppl"``, ``"entropy"`` (mean over windows),
        ``"logD_normalized_entropy"`` (entropy / log(seqlen)),
        ``"logN_normalized_entropy"`` (entropy / log(nsamples)) and
        ``"logNlogD_normalized_entropy"`` (entropy / (log(nsamples) *
        log(seqlen))). The last two are not finite when ``nsamples == 1``
        because ``log(1) == 0``.

    Raises:
        ValueError: If ``testenc`` holds fewer than ``seqlen`` tokens.
    """
    print("Evaluating ...")

    testenc = testenc.input_ids
    nsamples = testenc.numel() // seqlen
    if nsamples == 0:
        raise ValueError(
            f"Need at least seqlen={seqlen} tokens to evaluate, got {testenc.numel()}."
        )

    class _InputCaptured(Exception):
        """Stop signal raised by ``Catcher`` once the layer-0 input is stored."""

    decoder = model.model.decoder
    layers = decoder.layers
    has_project_in = getattr(decoder, "project_in", None) is not None
    has_project_out = getattr(decoder, "project_out", None) is not None

    use_cache = model.config.use_cache
    model.config.use_cache = False
    try:
        # Only the embedding side and the first layer are needed to capture
        # the inputs of the decoder stack.
        decoder.embed_tokens = decoder.embed_tokens.to(dev)
        decoder.embed_positions = decoder.embed_positions.to(dev)
        if has_project_out:
            decoder.project_out = decoder.project_out.to(dev)
        if has_project_in:
            decoder.project_in = decoder.project_in.to(dev)
        layers[0] = layers[0].to(dev)

        dtype = next(iter(model.parameters())).dtype
        inps = torch.zeros((nsamples, seqlen, model.config.hidden_size), dtype=dtype, device=dev)
        cache = {"i": 0, "attention_mask": None}

        class Catcher(nn.Module):
            """Stand-in for layer 0 that records its input and aborts the forward pass."""

            def __init__(self, module):
                super().__init__()
                self.module = module

            def forward(self, inp, **kwargs):
                inps[cache["i"]] = inp
                cache["i"] += 1
                cache["attention_mask"] = kwargs["attention_mask"]
                # A dedicated exception type keeps genuine ValueErrors raised
                # by the model from being swallowed by the loop below.
                raise _InputCaptured

        layers[0] = Catcher(layers[0])
        try:
            for i in range(nsamples):
                batch = testenc[:, (i * seqlen) : ((i + 1) * seqlen)].to(dev)
                try:
                    model(batch)
                except _InputCaptured:
                    pass
        finally:
            # Always unwrap, otherwise a failure would leave the model broken.
            layers[0] = layers[0].module

        layers[0] = layers[0].cpu()
        decoder.embed_tokens = decoder.embed_tokens.cpu()
        decoder.embed_positions = decoder.embed_positions.cpu()
        if has_project_out:
            decoder.project_out = decoder.project_out.cpu()
        if has_project_in:
            decoder.project_in = decoder.project_in.cpu()
        torch.cuda.empty_cache()

        outs = torch.zeros_like(inps)
        attention_mask = cache["attention_mask"]

        # Run the decoder stack layer by layer so only one layer is on `dev` at a time.
        for i in range(len(layers)):
            layer = layers[i].to(dev)

            for j in range(nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask)[0]
            layers[i] = layer.cpu()
            del layer
            torch.cuda.empty_cache()
            # The outputs of this layer become the inputs of the next one.
            inps, outs = outs, inps

        has_final_norm = getattr(decoder, "final_layer_norm", None) is not None
        if has_final_norm:
            decoder.final_layer_norm = decoder.final_layer_norm.to(dev)
        if has_project_out:
            decoder.project_out = decoder.project_out.to(dev)
        model.lm_head = model.lm_head.to(dev)

        testenc = testenc.to(dev)
        loss_fct = nn.CrossEntropyLoss()
        nlls = []
        ents = []

        for i in range(nsamples):
            hidden_states = inps[i].unsqueeze(0)
            if has_final_norm:
                hidden_states = decoder.final_layer_norm(hidden_states)
            if has_project_out:
                hidden_states = decoder.project_out(hidden_states)

            # perplexity
            lm_logits = model.lm_head(hidden_states)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = testenc[:, (i * seqlen) : ((i + 1) * seqlen)][:, 1:]
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            nlls.append(loss.float() * seqlen)

            # entropy
            n_rows, n_cols = hidden_states.shape[1:]
            # Drop only the batch dimension; a bare squeeze() would also
            # collapse a sequence or feature dimension of size 1.
            hidden_states = normalize(hidden_states.squeeze(0))
            # Build the smaller of the two possible square products.
            if n_rows > n_cols:
                cov = hidden_states.T @ hidden_states
            else:
                cov = hidden_states @ hidden_states.T
            cov /= torch.trace(cov)
            ents.append(itl.matrixAlphaEntropy(cov.float(), alpha=1))

        ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * seqlen))
        print(ppl.item())

        ents = torch.stack(ents).cpu()
        logD_normalized_entropy = ents / np.log(seqlen)
        logN_normalized_entropy = ents / np.log(nsamples)
        logNlogD_normalized_entropy = ents / (np.log(nsamples) * np.log(seqlen))
    finally:
        model.config.use_cache = use_cache

    return {
        "ppl": ppl.item(),
        "entropy": ents.mean().item(),
        "logD_normalized_entropy": logD_normalized_entropy.mean().item(),
        "logN_normalized_entropy": logN_normalized_entropy.mean().item(),
        "logNlogD_normalized_entropy": logNlogD_normalized_entropy.mean().item(),
    }

In [33]:
# Draw 128 calibration windows of 2048 tokens (seed 0) and tokenize the test split.
traindataset, testenc = get_wikitext2(
    nsamples=128,
    seed=0,
    seqlen=2048,
    model=pretrained_model_dir,
)



In [34]:
# Display the token ids of the tokenized test split.
testenc.input_ids

tensor([[    2, 50140,  5457,  ...,  1437, 50140, 50118]])

In [45]:
# from transformers import AutoModelForCausalLM


# for pretrained_model_dir in [
#     # "facebook_opt-1.3b",
#     # "facebook_opt-2.7b",
#     # "facebook_opt-6.7b",
#     "facebook_opt-13b"
#     ]:
#     model = AutoModelForCausalLM.from_pretrained(pretrained_model_dir.replace("_", "/"), device_map="auto", torch_dtype=torch.float16)
#     # model.save_pretrained(pretrained_model_dir)
#     # model.save_pretrained(pretrained_model_dir, use_safetensors=True)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [49]:
# !cp -r ~/.cache/huggingface/hub/models--facebook--opt-13b .

In [58]:


# Quantize the checkpoint named by `pretrained_model_dir` to 4 bits, calibrating on
# `traindataset`, and write the result to `quantized_model_dir`.
quantize_config = BaseQuantizeConfig(
    bits=4,  # quantize model to 4-bit
    group_size=128,  # it is recommended to set the value to 128
    desc_act=False,  # desc_act and group size only works on triton
)

# load un-quantized model, the model will always be force loaded into cpu
model = AutoGPTQForCausalLM.from_pretrained(pretrained_model_dir, quantize_config)

# quantize model, the examples should be a list of dicts whose keys can only be "input_ids" and "attention_mask"
# with values of type torch.LongTensor.
model.quantize(traindataset, use_triton=False)

# save quantized model
model.save_quantized(quantized_model_dir)

# save quantized model using safetensors (written to the same directory as the save above)
model.save_quantized(quantized_model_dir, use_safetensors=True)

INFO - Start quantizing layer 1/12
INFO:auto_gptq.modeling._base:Start quantizing layer 1/12
INFO - Quantizing self_attn.k_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing self_attn.k_proj in layer 1/12...
INFO - Quantizing self_attn.v_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing self_attn.v_proj in layer 1/12...
INFO - Quantizing self_attn.q_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing self_attn.q_proj in layer 1/12...
INFO - Quantizing self_attn.out_proj in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing self_attn.out_proj in layer 1/12...
INFO - Quantizing fc1 in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing fc1 in layer 1/12...
INFO - Quantizing fc2 in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing fc2 in layer 1/12...
INFO - Start quantizing layer 2/12
INFO:auto_gptq.modeling._base:Start quantizing layer 2/12
INFO - Quantizing self_attn.k_proj in layer 2/12...
INFO:auto_gptq.modeling._base:Quantizing self_at

In [9]:
# load quantized model, currently only support cpu or single gpu
# NOTE: this rebinds `model` to the wrapper reloaded from `quantized_model_dir`.
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir, device_map="auto", use_triton=False)

1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.


In [10]:
# Display the full tokenizer output for the test split (token ids and attention mask).
testenc

{'input_ids': tensor([[    2, 50140,  5457,  ...,  1437, 50140, 50118]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [39]:
# Evaluate the quantized model. `opt_eval` reads `.model.decoder`, `.lm_head` and
# `.config` from its argument, so the inner `model.model` is passed here
# (presumably the wrapped Hugging Face model — TODO confirm).
out = opt_eval(model.model, testenc, dev="cuda:0")

Evaluating ...
29.53990936279297


In [41]:
# Show the metrics with a "model" label merged in; the merged dict is not stored.
# The dict union operator `|` requires Python 3.9+.
out | {"model": "model"}

{'ppl': 29.53990936279297,
 'entropy': 5.074114799499512,
 'logD_normalized_entropy': 0.6654909253120422,
 'logN_normalized_entropy': 1.026807427406311,
 'logNlogD_normalized_entropy': 0.13467001914978027,
 'model': 'model'}

In [36]:
# Load the un-quantized fp16 baseline and evaluate it with the same routine
# used for the quantized model.
# The class is imported here because its only other import in this notebook
# is commented out, which makes this cell fail on a fresh kernel.
from transformers import AutoModelForCausalLM

model_ = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", device_map="auto", torch_dtype=torch.float16)

opt_eval(model_, testenc, dev="cuda:0")



Evaluating ...
0
1
2
3
4
5
6
7
8
9
10
11
27.655488967895508


In [37]:
# Parameter storage of the fp16 baseline model, in bytes.
model_size_in_bytes(model_)


250478592