In [1]:
%env CUDA_VISIBLE_DEVICES=0

import torch
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# A single GPTQ setting (bits=4, group_size=64) is shared by every linear layer.
quant_config = BaseQuantizeConfig(bits=4, group_size=64)

# Only load the checkpoint here; the quantization pass itself is started
# separately through `model.quantize(...)`.
model = AutoGPTQForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3.1-8B",
    quant_config,
    device_map="cuda",
    torch_dtype="auto",
)

env: CUDA_VISIBLE_DEVICES=0


  def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
  def backward(ctx, grad_output):
  @custom_fwd(cast_inputs=torch.float16)
CUDA extension not installed.
CUDA extension not installed.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
from gptq.datautils import get_loaders

# Loaders for the "red" dataset (presumably RedPajama -- TODO confirm against
# gptq.datautils); `dataloader` is what the quantization cell consumes.
dataloader, testloader = get_loaders(
    "red",
    model="meta-llama/Meta-Llama-3.1-8B",
    seqlen=8192,
    seed=0,
)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# Quantize the model, feeding every item of `dataloader` as `input_ids`
# together with an all-ones attention mask of the same shape.
model.quantize([{"input_ids": data, "attention_mask": torch.ones_like(data)} for data in dataloader])

quantization script so the packing stage is optimized for speed. Using too many cores may reduce packing performance.
----
import os
import math
max_threads = str(min(8, os.cpu_count()))
os.environ['OMP_NUM_THREADS'] = max_threads
os.environ['OPENBLAS_NUM_THREADS'] = max_threads
os.environ['MKL_NUM_THREADS'] = max_threads
os.environ['VECLIB_MAXIMUM_THREADS'] = max_threads
os.environ['NUMEXPR_NUM_THREADS'] = max_threads
os.environ['NUMEXPR_MAX_THREADS'] = max_threads
----

INFO - Start quantizing layer 1/32
INFO - Quantizing self_attn.k_proj in layer 1/32...
INFO - Quantizing self_attn.v_proj in layer 1/32...
INFO - Quantizing self_attn.q_proj in layer 1/32...
INFO - Quantizing self_attn.o_proj in layer 1/32...
INFO - Quantizing mlp.up_proj in layer 1/32...
INFO - Quantizing mlp.gate_proj in layer 1/32...
INFO - Quantizing mlp.down_proj in layer 1/32...
INFO - Start quantizing layer 2/32
INFO - Quantizing self_attn.k_proj in layer 2/32...
INFO - Quantizing self_attn.v_proj in layer 2/32

In [4]:
# Move the model to the CUDA device and rebind `model` to whatever `.to` returns.
model = model.to("cuda")

In [5]:
def get_zero_shots(model, task_list = ('arc_easy',), num_fewshots=1):
    """Score `model` on lm-eval tasks and collect accuracy and its standard error.

    Args:
        model: Forwarded as `pretrained` to `lm_eval.models.huggingface.HFLM`.
        task_list: Task names handed to `lm_eval.tasks.get_task_dict`.
        num_fewshots: Any value other than 1 is written into each task's
            `config.num_fewshot` and appended to every result key as
            `@<num_fewshots>`. The value 1 acts as a sentinel: task configs
            are left untouched and keys carry no suffix.

    Returns:
        dict: `{task_name: acc}` entries followed by `{task_name + '_err': acc_stderr}`
        entries, read from the `'acc,none'` / `'acc_stderr,none'` metrics.
    """
    import lm_eval

    override_shots = num_fewshots != 1

    harness_lm = lm_eval.models.huggingface.HFLM(pretrained=model)
    task_dict = lm_eval.tasks.get_task_dict(task_list)

    if override_shots:
        # TODO: make fewshots properly
        for entry in task_dict.values():
            # Some entries are tuples whose second element is the task object.
            task_obj = entry[1] if isinstance(entry, tuple) else entry
            if task_obj is not None:
                task_obj.config.num_fewshot = num_fewshots

    outcome = lm_eval.evaluator.evaluate(lm=harness_lm, task_dict=task_dict)
    per_task = outcome['results']

    # Accuracies first, then the matching standard errors, preserving key order.
    scores = {}
    for name, metrics in per_task.items():
        scores[name] = metrics['acc,none']
    for name, metrics in per_task.items():
        scores[f'{name}_err'] = metrics['acc_stderr,none']

    if override_shots:
        scores = {f'{key}@{num_fewshots}': value for key, value in scores.items()}

    return scores

In [6]:
from torch import nn
from torch.nn import functional as F

from tqdm.auto import trange, tqdm

@torch.no_grad()
def llama_eval(model, dataloader):
    """Measure perplexity by pushing every sample through the decoder one layer at a time.

    The inputs that reach the first decoder layer are captured with a temporary
    wrapper module. Each decoder layer is then applied to all captured samples
    in turn, and finally the norm and LM head produce logits for a next-token
    cross-entropy loss.

    Args:
        model: Causal LM exposing ``config.use_cache``, ``model.embed_tokens``,
            ``model.layers``, ``model.norm`` and ``lm_head`` (presumably a
            Hugging Face ``LlamaForCausalLM`` -- confirm against the caller).
        dataloader: Collection of token-id tensors supporting ``len()``,
            iteration and integer indexing. Items are assumed to be shaped
            ``(batch, seq_len)`` because labels are sliced as ``[:, 1:]``.

    Returns:
        float: ``exp`` of the token-weighted mean next-token loss.

    Raises:
        ValueError: If ``dataloader`` is empty.
        RuntimeError: If the first decoder layer was not reached exactly once
            per sample while capturing inputs.
    """
    print('Evaluating ...')

    nsamples = len(dataloader)
    if nsamples == 0:
        raise ValueError("dataloader is empty; nothing to evaluate")

    layers = model.model.layers
    # Feed inputs on whatever device the embedding lives on instead of
    # assuming a literal "cuda".
    device = model.model.embed_tokens.weight.device

    inps = []
    attention_masks = []
    position_ids = []
    position_embeddings = []

    class _CaptureDone(Exception):
        """Private signal: layer-0 inputs are recorded, abort the forward pass."""

    class Catcher(nn.Module):
        """Stand-in for the first decoder layer that records its inputs."""

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps.append(inp)
            attention_masks.append(kwargs['attention_mask'])
            position_ids.append(kwargs['position_ids'])
            # Present only when the surrounding model precomputes rotary
            # embeddings; None otherwise.
            position_embeddings.append(kwargs.get('position_embeddings'))
            raise _CaptureDone

    use_cache = model.config.use_cache
    model.config.use_cache = False
    try:
        first_layer = layers[0]
        layers[0] = Catcher(first_layer)
        try:
            for batch in dataloader:
                try:
                    model(batch.to(device))
                except _CaptureDone:
                    # Expected: only the dedicated signal is swallowed, so a
                    # genuine ValueError from the model still propagates.
                    pass
        finally:
            # Always put the real layer back, even if a forward pass failed.
            layers[0] = first_layer

        if len(inps) != nsamples:
            raise RuntimeError(
                f"captured {len(inps)} first-layer inputs for {nsamples} samples"
            )
        torch.cuda.empty_cache()

        for i in trange(len(layers), desc="Evaluating layer-by-layer..."):
            layer = layers[i]
            for j in range(nsamples):
                layer_kwargs = {
                    "attention_mask": attention_masks[j],
                    "position_ids": position_ids[j],
                }
                if position_embeddings[j] is not None:
                    # Forward the captured rotary embeddings so the layer does
                    # not have to rebuild them from position_ids.
                    layer_kwargs["position_embeddings"] = position_embeddings[j]
                out = layer(inps[j], **layer_kwargs)
                # Layers may return a tuple (hidden_states, ...) or a bare tensor.
                inps[j] = out[0] if isinstance(out, tuple) else out
            torch.cuda.empty_cache()

        norm = model.model.norm
        loss_fct = nn.CrossEntropyLoss()
        nlls = []
        total_tokens = 0
        for i in range(nsamples):
            hidden_states = inps[i]
            if norm is not None:
                hidden_states = norm(hidden_states)
            lm_logits = model.lm_head(hidden_states)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = dataloader[i].to(shift_logits.device)[:, 1:]
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
            # Weight the mean loss by the number of predicted tokens of this
            # sample rather than by a fixed sequence length.
            n_tokens = shift_labels.numel()
            nlls.append(loss.float() * n_tokens)
            total_tokens += n_tokens

        ppl = torch.exp(torch.stack(nlls).sum() / total_tokens)
        print(ppl.item())
        return ppl.item()
    finally:
        # Restore the caller's cache setting on success and on failure alike.
        model.config.use_cache = use_cache

In [7]:
from gptq.datautils import get_loaders

# Perplexity evaluation: for each listed dataset, build its loaders and score
# the wrapped model (`model.model`) on the test split.
datasets = ["wikitext2"]
for dataset in datasets:
    dataloader, testloader = get_loaders(
        dataset,
        model="meta-llama/Meta-Llama-3.1-8B",
        seqlen=8192,
        seed=0,
    )
    ppl = llama_eval(model.model, testloader)

Token indices sequence length is longer than the specified maximum sequence length for this model (2436214 > 131072). Running this sequence through the model will result in indexing errors


Evaluating ...


Evaluating layer-by-layer...:   0%|          | 0/32 [00:00<?, ?it/s]

The attention layers in this model are transitioning from computing the RoPE embeddings internally through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed `position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be removed and `position_embeddings` will be mandatory.


6.0052490234375


In [8]:
# Benchmarks to score, each listed exactly once.
# num_fewshots=1 is the value for which get_zero_shots leaves every task's own
# few-shot setting untouched and adds no "@n" suffix to the result keys.
results = get_zero_shots(
    model,
    task_list=("winogrande", "arc_easy", "piqa", "hellaswag", "arc_challenge"),
    num_fewshots=1,
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
2024-10-14:13:04:13,406 INFO     [task.py:386] Building contexts for arc_challenge on rank 0...
100%|██████████| 1172/1172 [00:00<00:00, 1421.84it/s]
2024-10-14:13:04:14,284 INFO     [task.py:386] Building contexts for winogrande on rank 0...
100%|██████████| 1267/1267 [00:00<00:00, 91620.69it/s]
2024-10-14:13:04:14,334 INFO     [task.py:386] Building contexts for hellaswag on rank 0...
100%

In [1]:
# Show the benchmark scores; this needs `results` to have been assigned by the
# get_zero_shots cell in the same kernel session, otherwise it raises NameError.
results

NameError: name 'results' is not defined

HQQ, 4-bit, group size 64 — MMLU 5-shot results:
```python
{'mmlu@5': 0.6378008830650904,
 'mmlu_humanities@5': 0.5874601487778959,
 'mmlu_formal_logic@5': 0.42857142857142855,
 'mmlu_high_school_european_history@5': 0.7575757575757576,
 'mmlu_high_school_us_history@5': 0.8235294117647058,
 'mmlu_high_school_world_history@5': 0.810126582278481,
 'mmlu_international_law@5': 0.8099173553719008,
 'mmlu_jurisprudence@5': 0.7407407407407407,
 'mmlu_logical_fallacies@5': 0.7484662576687117,
 'mmlu_moral_disputes@5': 0.7109826589595376,
 'mmlu_moral_scenarios@5': 0.37988826815642457,
 'mmlu_philosophy@5': 0.7234726688102894,
 'mmlu_prehistory@5': 0.7314814814814815,
 'mmlu_professional_law@5': 0.4804432855280313,
 'mmlu_world_religions@5': 0.8187134502923976,
 'mmlu_other@5': 0.7103315094946894,
 'mmlu_business_ethics@5': 0.61,
 'mmlu_clinical_knowledge@5': 0.7358490566037735,
 'mmlu_college_medicine@5': 0.6473988439306358,
 'mmlu_global_facts@5': 0.37,
 'mmlu_human_aging@5': 0.6771300448430493,
 'mmlu_management@5': 0.7961165048543689,
 'mmlu_marketing@5': 0.8846153846153846,
 'mmlu_medical_genetics@5': 0.75,
 'mmlu_miscellaneous@5': 0.8212005108556832,
 'mmlu_nutrition@5': 0.7287581699346405,
 'mmlu_professional_accounting@5': 0.4858156028368794,
 'mmlu_professional_medicine@5': 0.7022058823529411,
 'mmlu_virology@5': 0.5602409638554217,
 'mmlu_social_sciences@5': 0.7419564510887228,
 'mmlu_econometrics@5': 0.5,
 'mmlu_high_school_geography@5': 0.8131313131313131,
 'mmlu_high_school_government_and_politics@5': 0.8290155440414507,
 'mmlu_high_school_macroeconomics@5': 0.6307692307692307,
 'mmlu_high_school_microeconomics@5': 0.6974789915966386,
 'mmlu_high_school_psychology@5': 0.8422018348623853,
 'mmlu_human_sexuality@5': 0.7557251908396947,
 'mmlu_professional_psychology@5': 0.6911764705882353,
 'mmlu_public_relations@5': 0.7181818181818181,
 'mmlu_security_studies@5': 0.726530612244898,
 'mmlu_sociology@5': 0.8507462686567164,
 'mmlu_us_foreign_policy@5': 0.84,
 'mmlu_stem@5': 0.5398033618775769,
 'mmlu_abstract_algebra@5': 0.31,
 'mmlu_anatomy@5': 0.5925925925925926,
 'mmlu_astronomy@5': 0.6842105263157895,
 'mmlu_college_biology@5': 0.7638888888888888,
 'mmlu_college_chemistry@5': 0.44,
 'mmlu_college_computer_science@5': 0.53,
 'mmlu_college_mathematics@5': 0.33,
 'mmlu_college_physics@5': 0.45098039215686275,
 'mmlu_computer_security@5': 0.77,
 'mmlu_conceptual_physics@5': 0.5829787234042553,
 'mmlu_electrical_engineering@5': 0.6344827586206897,
 'mmlu_elementary_mathematics@5': 0.3968253968253968,
 'mmlu_high_school_biology@5': 0.7645161290322581,
 'mmlu_high_school_chemistry@5': 0.5369458128078818,
 'mmlu_high_school_computer_science@5': 0.62,
 'mmlu_high_school_mathematics@5': 0.40370370370370373,
 'mmlu_high_school_physics@5': 0.4304635761589404,
 'mmlu_high_school_statistics@5': 0.5462962962962963,
 'mmlu_machine_learning@5': 0.4017857142857143,
 'mmlu_err@5': 0.003835221836153209,
 'mmlu_humanities_err@5': 0.006775368801783291,
 'mmlu_formal_logic_err@5': 0.0442626668137991,
 'mmlu_high_school_european_history_err@5': 0.03346409881055953,
 'mmlu_high_school_us_history_err@5': 0.02675640153807897,
 'mmlu_high_school_world_history_err@5': 0.02553010046023351,
 'mmlu_international_law_err@5': 0.03581796951709282,
 'mmlu_jurisprudence_err@5': 0.04236511258094633,
 'mmlu_logical_fallacies_err@5': 0.03408997886857529,
 'mmlu_moral_disputes_err@5': 0.024405173935783234,
 'mmlu_moral_scenarios_err@5': 0.016232826818678495,
 'mmlu_philosophy_err@5': 0.02540383297817961,
 'mmlu_prehistory_err@5': 0.024659685185967284,
 'mmlu_professional_law_err@5': 0.012760464028289299,
 'mmlu_world_religions_err@5': 0.029547741687640038,
 'mmlu_other_err@5': 0.007833424946677572,
 'mmlu_business_ethics_err@5': 0.04902071300001974,
 'mmlu_clinical_knowledge_err@5': 0.027134291628741727,
 'mmlu_college_medicine_err@5': 0.036430371689585475,
 'mmlu_global_facts_err@5': 0.04852365870939098,
 'mmlu_human_aging_err@5': 0.031381476375754995,
 'mmlu_management_err@5': 0.03989139859531769,
 'mmlu_marketing_err@5': 0.02093019318517933,
 'mmlu_medical_genetics_err@5': 0.04351941398892446,
 'mmlu_miscellaneous_err@5': 0.013702643715368983,
 'mmlu_nutrition_err@5': 0.025457756696667864,
 'mmlu_professional_accounting_err@5': 0.02981549448368206,
 'mmlu_professional_medicine_err@5': 0.027778298701545447,
 'mmlu_virology_err@5': 0.03864139923699121,
 'mmlu_social_sciences_err@5': 0.007734190961291264,
 'mmlu_econometrics_err@5': 0.047036043419179864,
 'mmlu_high_school_geography_err@5': 0.02777253333421898,
 'mmlu_high_school_government_and_politics_err@5': 0.027171213683164545,
 'mmlu_high_school_macroeconomics_err@5': 0.024468615241478916,
 'mmlu_high_school_microeconomics_err@5': 0.029837962388291932,
 'mmlu_high_school_psychology_err@5': 0.015630022970092455,
 'mmlu_human_sexuality_err@5': 0.037683359597287434,
 'mmlu_professional_psychology_err@5': 0.018690850273595284,
 'mmlu_public_relations_err@5': 0.04309118709946459,
 'mmlu_security_studies_err@5': 0.028535560337128448,
 'mmlu_sociology_err@5': 0.025196929874827072,
 'mmlu_us_foreign_policy_err@5': 0.03684529491774707,
 'mmlu_stem_err@5': 0.008541772519633976,
 'mmlu_abstract_algebra_err@5': 0.04648231987117316,
 'mmlu_anatomy_err@5': 0.04244633238353228,
 'mmlu_astronomy_err@5': 0.03782728980865469,
 'mmlu_college_biology_err@5': 0.03551446610810826,
 'mmlu_college_chemistry_err@5': 0.04988876515698589,
 'mmlu_college_computer_science_err@5': 0.050161355804659205,
 'mmlu_college_mathematics_err@5': 0.04725815626252606,
 'mmlu_college_physics_err@5': 0.04951218252396262,
 'mmlu_computer_security_err@5': 0.04229525846816505,
 'mmlu_conceptual_physics_err@5': 0.03223276266711712,
 'mmlu_electrical_engineering_err@5': 0.04013124195424385,
 'mmlu_elementary_mathematics_err@5': 0.025197101074246494,
 'mmlu_high_school_biology_err@5': 0.024137632429337703,
 'mmlu_high_school_chemistry_err@5': 0.035083705204426656,
 'mmlu_high_school_computer_science_err@5': 0.048783173121456316,
 'mmlu_high_school_mathematics_err@5': 0.02991481234222763,
 'mmlu_high_school_physics_err@5': 0.04042809961395634,
 'mmlu_high_school_statistics_err@5': 0.033953227263757976,
 'mmlu_machine_learning_err@5': 0.04653333146973647}
```

HQQ, 8-bit, group size 64 — MMLU 5-shot results:
```python
{'mmlu@5': 0.654037886340977, 'mmlu_humanities@5': 0.6006376195536663, 'mmlu_formal_logic@5': 0.47619047619047616, 'mmlu_high_school_european_history@5': 0.7818181818181819, 'mmlu_high_school_us_history@5': 0.8235294117647058, 'mmlu_high_school_world_history@5': 0.8227848101265823, 'mmlu_international_law@5': 0.8264462809917356, 'mmlu_jurisprudence@5': 0.7407407407407407, 'mmlu_logical_fallacies@5': 0.7423312883435583, 'mmlu_moral_disputes@5': 0.7225433526011561, 'mmlu_moral_scenarios@5': 0.4122905027932961, 'mmlu_philosophy@5': 0.7266881028938906, 'mmlu_prehistory@5': 0.7253086419753086, 'mmlu_professional_law@5': 0.4921773142112125, 'mmlu_world_religions@5': 0.8070175438596491, 'mmlu_other@5': 0.7206308336015449, 'mmlu_business_ethics@5': 0.65, 'mmlu_clinical_knowledge@5': 0.7584905660377359, 'mmlu_college_medicine@5': 0.6473988439306358, 'mmlu_global_facts@5': 0.33, 'mmlu_human_aging@5': 0.695067264573991, 'mmlu_management@5': 0.8446601941747572, 'mmlu_marketing@5': 0.8589743589743589, 'mmlu_medical_genetics@5': 0.83, 'mmlu_miscellaneous@5': 0.80970625798212, 'mmlu_nutrition@5': 0.7973856209150327, 'mmlu_professional_accounting@5': 0.5, 'mmlu_professional_medicine@5': 0.6911764705882353, 'mmlu_virology@5': 0.572289156626506, 'mmlu_social_sciences@5': 0.7630809229769255, 'mmlu_econometrics@5': 0.49122807017543857, 'mmlu_high_school_geography@5': 0.8080808080808081, 'mmlu_high_school_government_and_politics@5': 0.8963730569948186, 'mmlu_high_school_macroeconomics@5': 0.6487179487179487, 'mmlu_high_school_microeconomics@5': 0.7352941176470589, 'mmlu_high_school_psychology@5': 0.8495412844036697, 'mmlu_human_sexuality@5': 0.7709923664122137, 'mmlu_professional_psychology@5': 0.7238562091503268, 'mmlu_public_relations@5': 0.7090909090909091, 'mmlu_security_studies@5': 0.7346938775510204, 'mmlu_sociology@5': 0.8805970149253731, 'mmlu_us_foreign_policy@5': 0.89, 'mmlu_stem@5': 0.5616872819536949, 'mmlu_abstract_algebra@5': 0.29, 'mmlu_anatomy@5': 0.6148148148148148, 
'mmlu_astronomy@5': 0.7236842105263158, 'mmlu_college_biology@5': 0.7847222222222222, 'mmlu_college_chemistry@5': 0.46, 'mmlu_college_computer_science@5': 0.49, 'mmlu_college_mathematics@5': 0.33, 'mmlu_college_physics@5': 0.5, 'mmlu_computer_security@5': 0.85, 'mmlu_conceptual_physics@5': 0.6127659574468085, 'mmlu_electrical_engineering@5': 0.6413793103448275, 'mmlu_elementary_mathematics@5': 0.42857142857142855, 'mmlu_high_school_biology@5': 0.7870967741935484, 'mmlu_high_school_chemistry@5': 0.5320197044334976, 'mmlu_high_school_computer_science@5': 0.68, 'mmlu_high_school_mathematics@5': 0.42962962962962964, 'mmlu_high_school_physics@5': 0.44370860927152317, 'mmlu_high_school_statistics@5': 0.5555555555555556, 'mmlu_machine_learning@5': 0.44642857142857145, 'mmlu_err@5': 0.0037947049308790343, 'mmlu_humanities_err@5': 0.00678283136290411, 'mmlu_formal_logic_err@5': 0.04467062628403273, 'mmlu_high_school_european_history_err@5': 0.03225078108306289, 'mmlu_high_school_us_history_err@5': 0.026756401538078955, 'mmlu_high_school_world_history_err@5': 0.02485636418450323, 'mmlu_international_law_err@5': 0.0345727283691767, 'mmlu_jurisprudence_err@5': 0.042365112580946336, 'mmlu_logical_fallacies_err@5': 0.03436150827846917, 'mmlu_moral_disputes_err@5': 0.024105712607754307, 'mmlu_moral_scenarios_err@5': 0.01646320023811451, 'mmlu_philosophy_err@5': 0.02531176597542611, 'mmlu_prehistory_err@5': 0.024836057868294677, 'mmlu_professional_law_err@5': 0.0127686730761119, 'mmlu_world_religions_err@5': 0.030267457554898458, 'mmlu_other_err@5': 0.007736708380444025, 'mmlu_business_ethics_err@5': 0.047937248544110196, 'mmlu_clinical_knowledge_err@5': 0.026341480371118362, 'mmlu_college_medicine_err@5': 0.036430371689585475, 'mmlu_global_facts_err@5': 0.047258156262526045, 'mmlu_human_aging_err@5': 0.030898610882477515, 'mmlu_management_err@5': 0.03586594738573974, 'mmlu_marketing_err@5': 0.022801382534597542, 'mmlu_medical_genetics_err@5': 0.0377525168068637, 
'mmlu_miscellaneous_err@5': 0.014036945850381385, 'mmlu_nutrition_err@5': 0.023015446877985655, 'mmlu_professional_accounting_err@5': 0.029827499313594685, 'mmlu_professional_medicine_err@5': 0.028064998167040094, 'mmlu_virology_err@5': 0.038515976837185335, 'mmlu_social_sciences_err@5': 0.007488141850953062, 'mmlu_econometrics_err@5': 0.04702880432049615, 'mmlu_high_school_geography_err@5': 0.028057791672989017, 'mmlu_high_school_government_and_politics_err@5': 0.021995311963644237, 'mmlu_high_school_macroeconomics_err@5': 0.024203665177902803, 'mmlu_high_school_microeconomics_err@5': 0.028657491285071977, 'mmlu_high_school_psychology_err@5': 0.01532856
```