## This notebook demonstrates how to run GPTQ on a quantized model

In [None]:
from dmx.compressor.modeling.hf import pipeline
# Arguments for the text-generation pipeline: the OPT-125m revision of the
# d-matrix/opt checkpoint, starting from the "BASELINE" dmx configuration.
pipeline_kwargs = {
    "task": "text-generation",
    "model": "d-matrix/opt",
    "revision": "opt-125m",
    "dmx_config": "BASELINE",
    "trust_remote_code": True,
    # "auto" enables model parallelism on multi-GPU nodes
    "device_map": "auto",
}
pipe = pipeline(**pipeline_kwargs)

The next block configures the model to the right format.

`xxx_format` (for example `weight_format`) takes a single value.

`input_formats` takes a list or a dictionary. When a list is passed, the formats are assigned in the order of the castTos within `input_casts`.

In [None]:
from dmx.compressor.modeling import DmxConfigRule,nn
# Numeric formats applied to every nn.Linear module.
# The activation format is named `input_format` rather than `format` so that
# the Python builtin `format()` is not shadowed for the rest of the notebook.
input_format = "MXFP8[E4M3]{64}"
weight_format = "MXINT4{64}"
rules = (
    DmxConfigRule(
        module_types=(nn.Linear,),
        module_config=dict(
            input_formats=[input_format],  # option 1: list form
            # input_formats={"input_cast": input_format}  # option 2: dict form
            weight_format=weight_format,
        ),
    ),
)
# Configure the model based on the rules above.
pipe.model.configure(None, *rules)

### Note: if you are using INT formats that require calibration, please refer to calibration.ipynb for how to perform quantization calibration

Evaluate before gptq

In [None]:
# Baseline measurement: score the configured model with the dmx perplexity
# metric on WikiText-2 (raw) before any GPTQ step is run.
wikitext_eval_args = {
    "dataset": "wikitext",
    "dataset_version": "wikitext-2-raw-v1",
}
metric = pipe.evaluate("d-matrix/dmx_perplexity", **wikitext_eval_args)
print("before gptq:", metric)

### GPTQ

Define layers to run gptq on. 

Note that at least one forward pass of the model must be run before this point so that the dmxModules exist.

In [None]:
# Name -> module mapping of every dmx module in the model; all of them are
# selected for GPTQ.
layers_to_gptq = dict(pipe.model.named_dmx_modules())

Define hyperparameters for gptq

In [None]:
# Hyperparameters handed to the GPTQ run.
GPTQ_HYPERPARAMS = {
    "block_size": 128,
    # If the weight format is blocked, microblock_size needs to be the same
    # as the block size.
    # NOTE(review): presumably this means the weight format's own block size
    # (64 in "MXINT4{64}") -- confirm.
    "microblock_size": 64,
}

Run gptq

In [None]:
import torch
# Run GPTQ: inside the optimal_brain_compressing context, push 10 samples of
# the WikiText-2 (raw) train split through the model. Gradients are not
# needed, so the forward passes run under torch.no_grad().
# NOTE(review): presumably the context gathers statistics from these forward
# passes and updates the selected layers' weights -- confirm against dmx docs.
with pipe.model.optimal_brain_compressing(
    layers_to_gptq.items(),
    microblock_size=GPTQ_HYPERPARAMS["microblock_size"],
    block_size=GPTQ_HYPERPARAMS["block_size"],
), torch.no_grad():
    pipe.do_forward_on(
        dataset="wikitext",
        dataset_version="wikitext-2-raw-v1",
        column_name="text",
        dataset_split="train",
        num_samples=10,
    )
# No evaluation here: the post-GPTQ perplexity is computed and printed by the
# "Evaluate after gptq" cell, so evaluating in this cell as well would only
# repeat that expensive pass and discard the result.

Evaluate after gptq

In [None]:
# Same perplexity measurement as before GPTQ (dmx perplexity metric on
# WikiText-2 raw), now taken after the GPTQ run for comparison.
wikitext_eval_args = {
    "dataset": "wikitext",
    "dataset_version": "wikitext-2-raw-v1",
}
metric = pipe.evaluate("d-matrix/dmx_perplexity", **wikitext_eval_args)
print("after gptq:", metric)