From 83f3004cfad3f31116afc6ea3d70ed544c2dc865 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Tue, 9 Dec 2025 18:05:10 +0100
Subject: [PATCH 1/3] convert: allow using quantized Mistral weight

---
 convert_hf_to_gguf.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2cc2a388236..d50fa197164 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2854,13 +2854,14 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_attn_temperature_scale(rope_params["llama_4_scaling_beta"])
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
-        # TODO: probably not worth supporting quantized weight, as official BF16 is also available
-        if name.endswith("weight_scale_inv"):
-            raise ValueError("This is a quantized weight, please use BF16 weight instead")
-
         name = name.replace("language_model.", "")
         if "multi_modal_projector" in name or "vision_tower" in name:
             return []
+
+        if name.endswith(".activation_scale") or name.endswith(".weight_scale_inv"):
+            assert data_torch.nelement() == 0  # unused by the model
+            return []
+
         return super().modify_tensors(data_torch, name, bid)
 
 
@@ -9955,6 +9956,12 @@ def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
         if "llama_4_scaling" in hparams:
             gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
 
+    def modify_tensors(self, data_torch, name, bid):
+        if name.endswith(".qscale_act") or name.endswith(".qscale_weight"):
+            assert data_torch.nelement() == 0  # unused by the model
+            return []
+        return super().modify_tensors(data_torch, name, bid)
+
 
 class MistralMoeModel(DeepseekV2Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2

From b246a574af0ec28de252a06d705bca3f2f504816 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Tue, 9 Dec 2025 18:26:09 +0100
Subject: [PATCH 2/3] data_torch.ndim

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d50fa197164..d9ff77be4d2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2859,7 +2859,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
             return []
 
         if name.endswith(".activation_scale") or name.endswith(".weight_scale_inv"):
-            assert data_torch.nelement() == 0  # unused by the model
+            assert data_torch.ndim == 0  # unused by the model
             return []
 
         return super().modify_tensors(data_torch, name, bid)
@@ -9958,7 +9958,7 @@ def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
 
     def modify_tensors(self, data_torch, name, bid):
         if name.endswith(".qscale_act") or name.endswith(".qscale_weight"):
-            assert data_torch.nelement() == 0  # unused by the model
+            assert data_torch.ndim == 0  # unused by the model
             return []
         return super().modify_tensors(data_torch, name, bid)
 

From e5644781c9521126235388576ee9896c20e53898 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Tue, 9 Dec 2025 19:25:59 +0100
Subject: [PATCH 3/3] update dequant fn

Co-authored-by: compilade
---
 convert_hf_to_gguf.py | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d9ff77be4d2..867bc90531c 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -383,6 +383,17 @@ def dequant_packed(w: Tensor, scale: Tensor, shape_tensor: Tensor, zero_point: T
                     s = self.model_tensors[name]
                     self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
                     tensors_to_remove.append(name)
+                if name.endswith(".activation_scale"):  # unused
+                    tensors_to_remove.append(name)
+                # mistral format
+                if name.endswith(".qscale_weight"):
+                    weight_name = name.removesuffix("qscale_weight") + "weight"
+                    w = self.model_tensors[weight_name]
+                    s = self.model_tensors[name]
+                    self.model_tensors[weight_name] = lambda w=w, s=s, bs=block_size: dequant_simple(w(), s(), bs)
+                    tensors_to_remove.append(name)
+                if name.endswith(".qscale_act"):
+                    tensors_to_remove.append(name)
         elif quant_method == "gptq":
             for name in self.model_tensors.keys():
                 if name.endswith(".qweight"):
@@ -2858,10 +2869,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None):
         if "multi_modal_projector" in name or "vision_tower" in name:
             return []
 
-        if name.endswith(".activation_scale") or name.endswith(".weight_scale_inv"):
-            assert data_torch.ndim == 0  # unused by the model
-            return []
-
         return super().modify_tensors(data_torch, name, bid)
 
 
@@ -9899,6 +9906,18 @@ def __init__(self, *args, **kwargs):
         self.gguf_writer.add_architecture()
         self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
 
+    def dequant_model(self):
+        # transform quantization config into HF format
+        quant_config = self.hparams.get("quantization")
+        if quant_config is not None:
+            assert quant_config["qformat_weight"] == "fp8_e4m3"
+            self.hparams["quantization_config"] = {
+                "activation_scheme": "static",
+                "quant_method": "fp8",
+                "weight_block_size": None,
+            }
+        return super().dequant_model()
+
     @staticmethod
     def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool):
         assert TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None, _mistral_import_error_msg
@@ -9956,12 +9975,6 @@ def set_mistral_config(gguf_writer: gguf.GGUFWriter, hparams: dict):
         if "llama_4_scaling" in hparams:
             gguf_writer.add_attn_temperature_scale(hparams["llama_4_scaling"]["beta"])
 
-    def modify_tensors(self, data_torch, name, bid):
-        if name.endswith(".qscale_act") or name.endswith(".qscale_weight"):
-            assert data_torch.ndim == 0  # unused by the model
-            return []
-        return super().modify_tensors(data_torch, name, bid)
-
 
 class MistralMoeModel(DeepseekV2Model):
     model_arch = gguf.MODEL_ARCH.DEEPSEEK2
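
Note on the dequant path above: dequant_simple is the converter's existing helper, and the lambdas registered in dequant_model fold the scale multiplication into the lazy weight loaders. The sketch below only illustrates what fp8 dequantization with a weight scale amounts to; the helper name dequant_fp8 and its exact shape handling are assumptions for illustration, not the actual implementation in convert_hf_to_gguf.py.

# Illustrative sketch only (assumed helper name dequant_fp8): multiply an
# fp8-stored weight by its scale, per-tensor when no block size is given (as
# with Mistral's qscale_weight, where weight_block_size is set to None above),
# or block-wise when scales are stored per block.
import torch

def dequant_fp8(weight: torch.Tensor, scale: torch.Tensor, block_size: int | None) -> torch.Tensor:
    w = weight.to(torch.float32)
    s = scale.to(torch.float32)
    if block_size is None or s.ndim == 0:
        # single per-tensor scale: plain elementwise multiply
        return w * s
    # per-block scales: repeat each scale over its block, trim ragged edges, multiply
    s = s.repeat_interleave(block_size, dim=0)[: w.shape[0]]
    s = s.repeat_interleave(block_size, dim=1)[:, : w.shape[1]]
    return w * s

Once the multiplication has been folded into the weight like this, the scale side-tensors (weight_scale_inv, activation_scale, qscale_weight, qscale_act) carry no further information, which is why the patch simply appends them to tensors_to_remove.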