## Validation

In [1]:
import os

from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader
from datasets import load_from_disk
from transformers import (
    LlavaNextForConditionalGeneration,
    LlavaNextProcessor,
)
from peft import LoraConfig, get_peft_model

In [2]:
from config import (
    MODEL_ID,
    OUTPUT_DIR,
    TEST_BATCH_SIZE,
    LORA_R,
    LORA_ALPHA,
    LORA_DROPOUT,
    DEVICE,
    MAX_ANSWER_TOKENS,
    ORPO_LAMBDA,
)

In [3]:
processor = LlavaNextProcessor.from_pretrained(MODEL_ID, use_fast=True)
TOKENIZER = processor.tokenizer
EOS_ID = TOKENIZER.eos_token_id

base_model = LlavaNextForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(base_model, lora_config)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
from dataloader_helper import collate_fn
from functools import partial

collate_fn = partial(
    collate_fn,
    processor=processor,
    DEVICE=DEVICE,
    MAX_ANSWER_TOKENS=MAX_ANSWER_TOKENS,
    TOKENIZER=TOKENIZER,
    EOS_ID=EOS_ID,
)

In [5]:
test_dataset = load_from_disk(f"{OUTPUT_DIR}/test_dataset")
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [6]:
from orpo_helper import answer_logits, loss_orpo

In [7]:
def evaluate(model, test_loader):
    
    loss_orpo_arr = []
    loss_sft_arr = []
    loss_or_arr = []


    training = model.training
    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            prompt_inputs, chosen_ids, chosen_mask, rejected_ids, rejected_mask = batch
            chosen_logits, rejected_logits = answer_logits(model, prompt_inputs, chosen_ids, chosen_mask, rejected_ids, rejected_mask)

            loss_orpo_val, loss_sft_val, loss_or_val = loss_orpo(
                chosen_logits, 
                rejected_logits, 
                chosen_ids, 
                rejected_ids, 
                chosen_mask, 
                rejected_mask, 
                ORPO_LAMBDA
            )

            loss_orpo_arr.append(loss_orpo_val.item())
            loss_sft_arr.append(loss_sft_val.item())
            loss_or_arr.append(loss_or_val.item())

    if training:
        model.train()

    loss_orpo_mean = sum(loss_orpo_arr) / len(loss_orpo_arr)
    loss_sft_mean = sum(loss_sft_arr) / len(loss_sft_arr)
    loss_or_mean = sum(loss_or_arr) / len(loss_or_arr)

    return loss_orpo_mean, loss_sft_mean, loss_or_mean

In [8]:
evaluation_results = {}

In [10]:
evaluation_results["base_model"] = evaluate(base_model, test_loader)

Evaluating:   0%|          | 0/139 [00:00<?, ?it/s]

In [9]:
adapters = [
    {
        "name": "last",
        "path": f"{OUTPUT_DIR}/last",
    },
    {
        "name": "best_exp_1",
        "path": f"{OUTPUT_DIR}/step_2600",
    },
    {
        "name": "best_exp_2",
        "path": f"{OUTPUT_DIR}/step_1400",
    },
]

In [17]:
for adapter in adapters:
    model.load_adapter(adapter["path"], adapter_name=adapter["name"])
    model.set_adapter(adapter["name"])
    evaluation_results[adapter["name"]] = evaluate(model, test_loader)

Evaluating:   0%|          | 0/139 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/139 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/139 [00:00<?, ?it/s]

In [18]:
evaluation_results

{'base_model': (14.390681205035971, 7.301905350719425, 0.7088912854091727),
 'last': (13.170132643884893, 7.22701214028777, 0.5943498131182554),
 'best_exp_1': (13.185364208633093, 7.062106564748201, 0.6123169823516187),
 'best_exp_2': (13.597347122302159, 7.4391580485611515, 0.615810476618705)}

In [10]:
# model.load_adapter(adapters[0]["path"], adapter_name=adapters[0]["name"])
model.set_adapter("last")
merged = model.merge_and_unload()
merged.save_pretrained(f"{OUTPUT_DIR}/merged")
processor.save_pretrained(f"{OUTPUT_DIR}/merged")

['../logs//merged/processor_config.json']

## Quantization

In [17]:
import torch
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
import datasets
from datasets import load_dataset, Dataset
from PIL import Image
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor import oneshot
from config import OUTPUT_DIR
import itertools

In [31]:
# Config
merged_model_path = f"{OUTPUT_DIR}/merged"
quantized_model_path = f"{OUTPUT_DIR}/quantized"

max_seq_length = 4096 
num_calibration_samples = 256 

processor = LlavaNextProcessor.from_pretrained(merged_model_path, use_fast=True)

def cal_gen(num_samples):
    ds = test_dataset
    for item in itertools.islice(ds, num_samples):
        image, question = item["image"], item["question"]

        conv = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image"},
                ],
            }
        ]

        prompt  = processor.apply_chat_template(conv, add_generation_prompt=True)
        inputs  = processor(images=[image], text=[prompt],
                            padding=False, return_tensors="pt")

        yield {
            "input_ids":      inputs.input_ids[0].tolist(),
            "attention_mask": inputs.attention_mask[0].tolist(),
            "pixel_values":   inputs.pixel_values[0, 0].numpy().astype("float32"),
            # NEW  → pass the height/width pair the model expects
            "image_sizes":    list(inputs.image_sizes[0]),          # e.g. [336, 336]
        }

# include the new column in the schema  ────────────────────────────────────────
features = datasets.Features({
    "input_ids":      datasets.Sequence(datasets.Value("int32")),
    "attention_mask": datasets.Sequence(datasets.Value("int8")),
    "pixel_values":   datasets.Array3D(shape=(3, 336, 336), dtype="float32"),
    "image_sizes":    datasets.Sequence(datasets.Value("int32"), length=2),
})

calibration_dataset = Dataset.from_generator(
    cal_gen,
    gen_kwargs=dict(num_samples=num_calibration_samples),
    features=features,
)


# ---------------------------------------------
# 2. Now load the BIG model & run quantisation
# ---------------------------------------------
model = LlavaNextForConditionalGeneration.from_pretrained(
    merged_model_path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True
).eval()


Generating train split: 0 examples [00:00, ? examples/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [32]:
calibration_dataset.save_to_disk(f"{OUTPUT_DIR}/calibration_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/256 [00:00<?, ? examples/s]

In [33]:
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(scheme="W8A8", targets="Linear", ignore=["lm_head"]),
]

In [34]:
# Quantization
oneshot(
    model=model,
    dataset=calibration_dataset,
    recipe=recipe,
    output_dir=quantized_model_path,
    # The number of samples is now controlled by the generator and this arg
    num_calibration_samples=num_calibration_samples,
    max_seq_length=max_seq_length,
)

2025-07-17T23:09:31.237105+0000 | reset | INFO - Compression lifecycle reset
2025-07-17T23:09:31.244635+0000 | from_modifiers | INFO - Creating recipe from modifiers
2025-07-17T23:09:31.245550+0000 | _infer_mappings_from_model | INFO - No SmoothQuantModifier.mappings provided, inferring from model...
2025-07-17T23:09:31.246440+0000 | get_layer_mappings_from_architecture | INFO - Architecture LlavaNextForConditionalGeneration not found in mappings. Using default mappings: [LayerMap(balance_layers=['re:.*q_proj', 're:.*k_proj', 're:.*v_proj'], smooth_layers='re:.*input_layernorm'), LayerMap(balance_layers=['re:.*gate_proj', 're:.*up_proj'], smooth_layers='re:.*post_attention_layernorm')]
2025-07-17T23:09:33.539588+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2025-07-17T23:09:33.541270+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `SmoothQuantModifier`
These modules will not be traced, and any sequential target children will be execu

Preparing cache: 100%|██████████| 256/256 [00:22<00:00, 11.44it/s]
(1/1): Calibrating:   0%|          | 0/256 [00:00<?, ?it/s]


RuntimeError: Raised an exception during execution of the following code:
```
1 
2 torch.fx._symbolic_trace.wrap("transformers_models_llava_next_modeling_llava_next_wrapped_8749100517629")
3 
4 def forward(self, input_ids : torch.Tensor, pixel_values : torch.Tensor, image_sizes : torch.Tensor, attention_mask : torch.Tensor):
5     model = self.model(input_ids, pixel_values = pixel_values, image_sizes = image_sizes, vision_feature_layer = -2, vision_feature_select_strategy = 'default', attention_mask = attention_mask, position_ids = None, past_key_values = None, inputs_embeds = None, use_cache = False, output_attentions = False, output_hidden_states = False, return_dict = True, cache_position = None);  input_ids = pixel_values = image_sizes = attention_mask = None
6     getitem = model[0]
7     getattr_1 = model.past_key_values
8     getattr_2 = model.hidden_states
9     getattr_3 = model.attentions
10     getattr_4 = model.image_hidden_states;  model = None
11     getitem_1 = getitem[(slice(None, None, None), slice(0, None, None), slice(None, None, None))];  getitem = None
12     lm_head = self.lm_head(getitem_1);  getitem_1 = None
13     wrapped_8749100517629 = transformers_models_llava_next_modeling_llava_next_wrapped_8749100517629(lm_head, None, None)
14     getitem_2 = wrapped_8749100517629[0];  wrapped_8749100517629 = None
15     return {'loss': getitem_2, 'logits': lm_head, 'past_key_values': getattr_1, 'hidden_states': getattr_2, 'attentions': getattr_3, 'image_hidden_states': getattr_4}
16     
```
This is likely due to a violation of shape assumptions made when tracing