Requires a PyTorch nightly build and transformers==4.52.4.

In [None]:
# Shared imports used by the test cells below.
from transformers import AutoModelForCausalLM, AutoTokenizer
from dmx.compressor.modeling import DmxModel, nn
import torch
# Seed torch's global RNG so the random inputs drawn in the tests are reproducible.
torch.manual_seed(42)
import warnings
# Suppress every UserWarning-category warning for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

### OPT forward

In [None]:
def test_opt():
    """Compare OPT-125m outputs before and after ``DmxModel`` wrapping.

    Two checkpoints are loaded on CUDA: ``d-matrix/opt-125m`` (``model1``) and
    ``facebook/opt-125m`` (``model2``).  The unwrapped ``model1`` provides the
    baseline logits and greedy generations.  The test then asserts that:

    * wrapping with ``DmxModel.from_torch`` (with and without ``export=True``)
      reproduces the baseline logits and generated ids;
    * after ``to_basic_mode()`` both wrapped models agree with each other but
      no longer match the baseline.

    Requires a CUDA device and network access to the Hugging Face Hub.
    """
    model_name = "d-matrix/opt-125m"
    model1 = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="cuda", trust_remote_code=True
    )
    model2 = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",
        torch_dtype="auto",
        device_map="cuda",
        trust_remote_code=True,
    )
    model1.eval()
    model2.eval()

    model_inputs = {k: v.to("cuda:0") for k, v in model1.dummy_inputs.items()}
    input_ids = torch.randint(0, 100, (3, 8)).to("cuda:0")

    def greedy_generate(model):
        # Deterministic decoding so generations are comparable across models.
        return model.generate(input_ids, max_new_tokens=5, do_sample=False)

    with torch.no_grad():
        # Baseline: plain Hugging Face model, no wrapping.
        output0 = model1(**model_inputs)
        generated_ids_0 = greedy_generate(model1)

        # Exported wrapper around the second checkpoint.
        model2 = DmxModel.from_torch(model2, export=True)
        output2 = model2(**model_inputs)
        assert torch.allclose(output0.logits, output2.logits)

        # NOTE(review): extra forward with the generation-shaped input before
        # generate(); presumably lets the wrapper handle this input signature
        # ahead of generation -- confirm.
        model2(input_ids)
        generated_ids2 = greedy_generate(model2)
        # Token ids are integers: compare exactly.  torch.equal also returns
        # False (rather than raising) when the sequence lengths differ.
        assert torch.equal(generated_ids_0, generated_ids2)

        model2.to_basic_mode()
        quant_output2 = model2(**model_inputs)
        quant_generated_ids2 = greedy_generate(model2)

        # Non-exported wrapper around the first checkpoint.
        model1 = DmxModel.from_torch(model1)
        output1 = model1(**model_inputs)
        generated_ids1 = greedy_generate(model1)

        model1.to_basic_mode()
        quant_output1 = model1(**model_inputs)
        quant_generated_ids1 = greedy_generate(model1)

    # Wrapping alone must not change results.
    assert torch.allclose(output0.logits, output1.logits)
    assert torch.allclose(output2.logits, output1.logits)
    assert torch.equal(generated_ids_0, generated_ids1)
    assert torch.equal(generated_ids2, generated_ids1)
    # After to_basic_mode() results must move away from the baseline, while
    # both wrappers still agree with each other.
    assert not torch.allclose(quant_output1.logits, output0.logits)
    assert torch.allclose(quant_output1.logits, quant_output2.logits)
    assert not torch.equal(generated_ids_0, quant_generated_ids1)
    assert torch.equal(quant_generated_ids2, quant_generated_ids1)


test_opt()

### OPT quantized submodules

In [None]:
def test_opt_submod():
    """Check that OPT submodules stay callable and consistent after wrapping.

    Besides the full-model logits, the decoder (``model.model.decoder``) and
    its first layer (``decoder.layers[0]``) are invoked directly on the
    baseline model and on both ``DmxModel`` wrappers (exported and not).
    Outputs must match the baseline right after wrapping; after
    ``to_basic_mode()`` they must differ from the baseline while the two
    wrappers agree with each other.

    Requires a CUDA device and network access to the Hugging Face Hub.
    """
    model_name = "d-matrix/opt-125m"
    model1 = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="cuda", trust_remote_code=True
    )
    model2 = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",
        torch_dtype="auto",
        device_map="cuda",
        trust_remote_code=True,
    )
    model1.eval()
    model2.eval()

    input_ids = torch.randint(0, 100, (3, 8)).to("cuda:0")
    # Inputs for calling a decoder layer directly; cast to float16 here,
    # presumably to match the dtype the checkpoints load in -- confirm.
    hidden_states = torch.rand(3, 8, 768).to("cuda").to(torch.float16)
    attention_mask = torch.ones(3, 1, 8, 8).to("cuda").to(torch.float16)

    def run_decoder(model):
        # Final hidden state of the whole decoder stack.
        return model.model.decoder(input_ids).last_hidden_state

    def run_first_layer(model):
        # First element of the output of decoder layer 0.
        return model.model.decoder.layers[0](
            hidden_states, attention_mask=attention_mask
        )[0]

    with torch.no_grad():
        # Baseline: unwrapped model.
        output0 = model1(input_ids)
        decoder_output0 = run_decoder(model1)
        decoder_layer_output0 = run_first_layer(model1)

        # Exported wrapper.
        model2 = DmxModel.from_torch(model2, export=True)
        output2 = model2(input_ids)
        decoder_output2 = run_decoder(model2)
        decoder_layer_output2 = run_first_layer(model2)

        assert torch.allclose(output0.logits, output2.logits)
        assert torch.allclose(decoder_output0, decoder_output2)
        assert torch.allclose(decoder_layer_output0, decoder_layer_output2)

        model2.to_basic_mode()
        quant_decoder_output2 = run_decoder(model2)
        quant_decoder_layer_output2 = run_first_layer(model2)

        # Non-exported wrapper.
        model1 = DmxModel.from_torch(model1)
        output1 = model1(input_ids)
        decoder_output1 = run_decoder(model1)
        decoder_layer_output1 = run_first_layer(model1)

        model1.to_basic_mode()
        quant_decoder_output1 = run_decoder(model1)
        quant_decoder_layer_output1 = run_first_layer(model1)

    # Wrapping alone must not change any output.
    assert torch.allclose(output0.logits, output1.logits)
    assert torch.allclose(output2.logits, output1.logits)
    assert torch.allclose(decoder_output0, decoder_output1)
    assert torch.allclose(decoder_output2, decoder_output1)
    assert torch.allclose(decoder_layer_output0, decoder_layer_output1)
    assert torch.allclose(decoder_layer_output2, decoder_layer_output1)

    # After to_basic_mode(): different from baseline, identical across wrappers.
    assert not torch.allclose(quant_decoder_output1, decoder_output0)
    assert torch.allclose(quant_decoder_output1, quant_decoder_output2)
    assert not torch.allclose(quant_decoder_layer_output1, decoder_layer_output0)
    assert torch.allclose(quant_decoder_layer_output1, quant_decoder_layer_output2)


test_opt_submod()

### OPT pipeline

In [None]:
def test_pipe():
    """Check that a text-generation pipeline is unchanged by ``DmxModel`` export.

    A ``facebook/opt-125m`` pipeline generates greedy continuations for two
    prompts; its model is then replaced by the exported ``DmxModel`` wrapper
    and the same prompts must yield identical text.

    Requires a CUDA device and network access to the Hugging Face Hub.
    """
    from transformers import pipeline
    from dmx.compressor import DmxModel

    model_id = "facebook/opt-125m"
    task = "text-generation"
    task_cases = [
        dict(text_inputs="Once upon a time,"),
        dict(text_inputs="To be honest,"),
    ]

    pipe = pipeline(
        task=task,
        model=model_id,
        trust_remote_code=True,
        device_map="cuda",
    )

    # Reference generations from the unwrapped model.
    out0 = [pipe(**_tc, do_sample=False) for _tc in task_cases]

    pipe.model = DmxModel.from_torch(pipe.model, export=True)

    torch.manual_seed(42)
    out = [pipe(**_tc, do_sample=False) for _tc in task_cases]
    for reference, wrapped in zip(out0, out):
        assert wrapped[0]["generated_text"] == reference[0]["generated_text"]


test_pipe()

### GPT2

In [None]:
def test_gpt2():
    """Compare GPT-2 outputs before and after ``DmxModel`` wrapping.

    ``d-matrix/gpt2`` (``model1``) gives the baseline logits and greedy
    generation; ``openai-community/gpt2`` (``model2``) is wrapped with
    ``export=True``.  Wrapped models must reproduce the baseline, and after
    ``to_basic_mode()`` they must differ from it while agreeing with each
    other on logits.

    Requires a CUDA device and network access to the Hugging Face Hub.
    """
    model_name = "d-matrix/gpt2"
    model1 = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="cuda", trust_remote_code=True
    )
    model2 = AutoModelForCausalLM.from_pretrained(
        "openai-community/gpt2",
        torch_dtype="auto",
        device_map="cuda",
        trust_remote_code=True,
    )
    model1.eval()
    model2.eval()

    model_inputs = {k: v.to("cuda:0") for k, v in model1.dummy_inputs.items()}
    input_ids = torch.randint(0, 100, (3, 8)).to("cuda:0")

    with torch.no_grad():
        # Baseline: unwrapped model.
        output0 = model1(**model_inputs)
        generated_ids_0 = model1.generate(input_ids, max_new_tokens=5, do_sample=False)

        # Exported wrapper around the second checkpoint.
        model2 = DmxModel.from_torch(model2, export=True)
        output2 = model2(**model_inputs)
        assert torch.allclose(output0.logits, output2.logits)

        # NOTE(review): extra forward with the generation-shaped input before
        # generate(); presumably lets the wrapper handle this input signature
        # ahead of generation -- confirm.
        model2(input_ids)
        generated_ids2 = model2.generate(input_ids, max_new_tokens=5, do_sample=False)
        # Token ids are integers: compare exactly.  torch.equal also returns
        # False (rather than raising) when the sequence lengths differ.
        assert torch.equal(generated_ids_0, generated_ids2)

        model2.to_basic_mode()
        quant_output2 = model2(**model_inputs)
        quant_generated_ids2 = model2.generate(
            input_ids, max_new_tokens=5, do_sample=False
        )

        # Non-exported wrapper around the first checkpoint (logits only).
        model1 = DmxModel.from_torch(model1)
        output1 = model1(**model_inputs)

        model1.to_basic_mode()
        quant_output1 = model1(**model_inputs)

    # Wrapping alone must not change the logits.
    assert torch.allclose(output0.logits, output1.logits)
    assert torch.allclose(output2.logits, output1.logits)
    # After to_basic_mode(): different from baseline, identical across wrappers.
    assert not torch.allclose(quant_output1.logits, output0.logits)
    assert torch.allclose(quant_output1.logits, quant_output2.logits)
    assert not torch.equal(generated_ids_0, quant_generated_ids2)


test_gpt2()

### Whisper

In [None]:

def test_whisper():
    """Check that Whisper transcription is unchanged by ``DmxModel`` export.

    ``openai/whisper-tiny`` is loaded in float16 on CUDA and used in an
    automatic-speech-recognition pipeline on the local file ``audio.mp3``.
    The transcription is recorded, the model is wrapped with
    ``DmxModel.from_torch(..., export=True)``, and the transcription must be
    identical afterwards.  Direct forward and cache-free ``generate`` calls
    on the wrapped model act as smoke checks; their outputs are not compared.

    Requires a CUDA device, ``librosa``, ``audio.mp3`` in the working
    directory, and network access to the Hugging Face Hub.
    """
    import torch
    import librosa
    from transformers import AutoProcessor, WhisperForConditionalGeneration, pipeline

    model = WhisperForConditionalGeneration.from_pretrained(
        "openai/whisper-tiny",
        torch_dtype=torch.float16,
        device_map="cuda",
        attn_implementation="sdpa",
    ).to("cuda")
    processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
    task = "automatic-speech-recognition"
    pipe = pipeline(
        task=task,
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
    )

    # Features for calling the model directly, outside the pipeline.
    audio, sr = librosa.load("audio.mp3", sr=16000)
    input_features = processor(
        audio,
        sampling_rate=sr,
        return_tensors="pt",
    ).input_features
    input_features = input_features.to("cuda", dtype=torch.float16)
    decoder_input_ids = torch.randint(0, 100, (1, 2)).to("cuda")

    # Smoke check of the unwrapped forward; no autograd graph is needed.
    with torch.no_grad():
        model(input_features, decoder_input_ids=decoder_input_ids)
    generation0 = pipe("audio.mp3", return_timestamps=True)

    model = DmxModel.from_torch(model, export=True)
    # Make sure the pipeline runs the wrapped model.  This is a no-op if
    # from_torch wraps in place and returns the same object.
    pipe.model = model

    # Smoke checks of the wrapped forward and of generation without KV cache.
    with torch.no_grad():
        model(input_features, decoder_input_ids=decoder_input_ids)
        model.generate(
            input_features, decoder_input_ids=decoder_input_ids, use_cache=False
        )

    generation1 = pipe("audio.mp3", return_timestamps=True)
    assert generation0["text"] == generation1["text"]


test_whisper()

### Whisper smoothquant

In [None]:
def test_whisper_smoothquant():
    """Apply SmoothQuant recipes to two Whisper pipelines and compare transcripts.

    Two independently loaded ``openai/whisper-tiny`` models back two ASR
    pipelines.  Both are wrapped with ``DmxModel.from_torch(export=True)``.
    One receives a recipe with ``fuse_to_weight=True`` restricted to decoder
    linear layers, the other a recipe with ``fuse_to_weight=False`` over all
    linear layers (``proj_out`` is excluded in both).  Transcripts must be
    unchanged until ``to_basic_mode()`` is called; afterwards they must differ
    from the original while the two pipelines agree with each other.
    """
    import torch
    from transformers import (
        AutoModelForSpeechSeq2Seq,
        AutoProcessor,
        WhisperForConditionalGeneration,
        pipeline,
    )

    checkpoint = "openai/whisper-tiny"
    audio_path = "audio.mp3"

    def load_whisper():
        # Fresh float16 copy of the checkpoint on the GPU.
        return WhisperForConditionalGeneration.from_pretrained(
            checkpoint,
            torch_dtype=torch.float16,
            device_map="cuda",
            attn_implementation="sdpa",
        ).to("cuda")

    model = load_whisper()
    model0 = load_whisper()
    processor = AutoProcessor.from_pretrained(checkpoint)

    task = "automatic-speech-recognition"

    def build_pipeline(backing_model):
        return pipeline(
            task=task,
            model=backing_model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
        )

    def transcribe(asr):
        # Greedy decoding; a fresh kwargs dict is built for every call.
        return asr(audio_path, generate_kwargs={"do_sample": False})

    # Note the crossed pairing: `pipe` is backed by model0, `pipe0` by model.
    pipe = build_pipeline(model0)
    pipe0 = build_pipeline(model)

    # Reference transcript from the unwrapped model (pipeline defaults).
    out0 = pipe(audio_path)

    pipe.model = DmxModel.from_torch(pipe.model, export=True)
    pipe0.model = DmxModel.from_torch(pipe0.model, export=True)

    # One run through each wrapped pipeline before any recipe is applied.
    transcribe(pipe)
    transcribe(pipe0)

    from dmx.compressor import nn

    from dmx.compressor.advanced_recipe import (
        DmxModuleSmoothQuantHyperparams,
        DmxSmoothQuantRecipe,
    )

    def hp_gen(_model) -> dict:
        # Fused SmoothQuant on decoder linear layers, skipping proj_out.
        selected = {}
        for _n, _m in _model.named_dmx_modules():
            if "decoder" in _n and isinstance(_m, nn.Linear) and "proj_out" not in _n:
                selected[_m] = DmxModuleSmoothQuantHyperparams(
                    migration_strength=0.25,
                    fuse_to_weight=True,
                )
        return selected

    def hp_gen_no_fuse(_model) -> dict:
        # Unfused SmoothQuant on every linear layer, skipping proj_out.
        selected = {}
        for _n, _m in _model.named_dmx_modules():
            if isinstance(_m, nn.Linear) and "proj_out" not in _n:
                selected[_m] = DmxModuleSmoothQuantHyperparams(
                    migration_strength=0.25,
                    fuse_to_weight=False,
                )
        return selected

    # Run each pipeline once while its recipe is applied.
    with DmxSmoothQuantRecipe(hp_gen_no_fuse).applied_to(pipe0.model):
        transcribe(pipe0)
    out1 = transcribe(pipe)
    with DmxSmoothQuantRecipe(hp_gen).applied_to(pipe.model):
        transcribe(pipe)
    out2 = transcribe(pipe)

    # Transcripts are unchanged before to_basic_mode().
    assert out0["text"] == out1["text"]
    assert out1["text"] == out2["text"]

    pipe0.model.to_basic_mode()
    pipe.model.to_basic_mode()
    out3 = transcribe(pipe0)
    out4 = transcribe(pipe)

    # After to_basic_mode(): different from the original, same across recipes.
    assert out0["text"] != out3["text"]
    assert out3["text"] == out4["text"]


test_whisper_smoothquant()

### CLIP

In [None]:
def test_clip():
    """Compare CLIP image-text logits before and after ``DmxModel`` wrapping.

    Two copies of ``openai/clip-vit-base-patch32`` are loaded; one is wrapped
    with ``export=False`` and the other with ``export=True``.  Both must
    reproduce the unwrapped ``logits_per_image``.  After ``to_basic_mode()``
    they must differ from the unwrapped result while agreeing with each other.

    Requires network access (Hugging Face Hub and the COCO sample image).

    Raises:
        requests.HTTPError: if the sample image cannot be downloaded.
    """
    from PIL import Image
    import requests
    import torch
    from transformers import CLIPProcessor, CLIPModel
    from dmx.compressor.modeling import DmxModel

    model1 = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    model2 = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    # Bound the wait and fail loudly on an HTTP error, instead of hanging or
    # handing an error page to the image decoder.
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    image = Image.open(response.raw)

    inputs = processor(
        text=["a photo of a cat", "a photo of a dog"],
        images=image,
        return_tensors="pt",
        padding=True,
    )
    model1.eval()
    model2.eval()

    # Inference only: no autograd graph is needed for any of the comparisons.
    with torch.no_grad():
        output0 = model1(**inputs)

        model1 = DmxModel.from_torch(model1, export=False)
        model2 = DmxModel.from_torch(model2, export=True)

        output1 = model1(**inputs)
        output2 = model2(**inputs)
        assert torch.allclose(output0.logits_per_image, output1.logits_per_image)
        assert torch.allclose(output1.logits_per_image, output2.logits_per_image)

        model1.to_basic_mode()
        model2.to_basic_mode()
        output1_quant = model1(**inputs)
        output2_quant = model2(**inputs)

    # After to_basic_mode(): different from baseline, identical across wrappers.
    assert not torch.allclose(output0.logits_per_image, output1_quant.logits_per_image)
    assert torch.allclose(
        output2_quant.logits_per_image, output1_quant.logits_per_image
    )


test_clip()

### Qwen

In [None]:
def test_qwen():
    """Compare Qwen3-0.6B logits before and after ``DmxModel`` export.

    The float32 model's logits on random token ids are the reference.  The
    exported ``DmxModel`` wrapper must match them within ``atol=1e-4``; after
    ``to_basic_mode()`` the logits must no longer match.

    Requires a CUDA device and network access to the Hugging Face Hub.
    """
    model_name = "Qwen/Qwen3-0.6B"

    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float, device_map="cuda"
    )
    model.eval()
    input_ids = torch.randint(0, 100, (3, 8)).to("cuda:0")

    # Reference forward is inference-only too: skip building an autograd graph.
    with torch.no_grad():
        ref_output = model(input_ids)

    model = DmxModel.from_torch(model, export=True)
    with torch.no_grad():
        out = model(input_ids)
    assert torch.allclose(ref_output.logits, out.logits, atol=1e-4)

    model.to_basic_mode()
    with torch.no_grad():
        out = model(input_ids)
    assert not torch.allclose(ref_output.logits, out.logits, atol=1e-4)


test_qwen()

### Llama3.2

In [None]:
def test_llama3():
    """Compare Llama-3.2-1B logits before and after ``DmxModel`` export.

    The float32 model's logits on random token ids are the reference.  The
    exported ``DmxModel`` wrapper must match them within ``atol=1e-4``; after
    ``to_basic_mode()`` the logits must no longer match.

    Requires a CUDA device and network access to the Hugging Face Hub for the
    ``meta-llama/Llama-3.2-1B`` checkpoint.
    """
    model_name = "meta-llama/Llama-3.2-1B"
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float, device_map="cuda"
    )
    model.eval()
    input_ids = torch.randint(0, 100, (3, 8)).to("cuda:0")

    # Reference forward is inference-only too: skip building an autograd graph.
    with torch.no_grad():
        ref_output = model(input_ids)

    model = DmxModel.from_torch(model, export=True)
    with torch.no_grad():
        out = model(input_ids)
    assert torch.allclose(ref_output.logits, out.logits, atol=1e-4)

    model.to_basic_mode()
    with torch.no_grad():
        out = model(input_ids)
    assert not torch.allclose(ref_output.logits, out.logits, atol=1e-4)


test_llama3()