From 0ecbb8c35c32ec85ceae03bc7050745b0452624a Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Dec 2025 11:15:16 +0100
Subject: [PATCH 1/2] llama : remove quantization sanity check

This commit removes the quantization sanity check for attention layers.

The motivation for this is that there are hybrid models that have
recurrent layers, expert layers, and attention layers. For these models
the current check fails, as the expert layers are not taken into
account.

After consideration, it was decided that this check is not strictly
necessary, and it can be removed to allow for more flexible model
architectures.
---
 src/llama-quant.cpp | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 764833749ec..5f538bd9949 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -723,24 +723,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        int32_t n_layer_all = model.hparams.n_layer;
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_all is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_all += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;

From 00c5a718960cd3c4f5934927ef23e5183c0fe55f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Dec 2025 14:39:17 +0100
Subject: [PATCH 2/2] llama : remove unused pruned_attention_w and is_clip_model vars

---
 src/llama-quant.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5f538bd9949..351dcb7baaa 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::map mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,8 +710,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
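
For reference, a minimal standalone sketch of the counting identity that the
removed GGML_ASSERT encoded. The layer breakdown below is invented for
illustration (the patch does not name a specific model); it assumes, per the
commit message, that a hybrid model can contain layers the formula never
subtracts, labelled here as expert-only layers, so the two sides of the
identity disagree and the old check would abort quantization.

    // sketch.cpp: the arithmetic behind the removed sanity check
    // build: g++ -std=c++17 sketch.cpp -o sketch && ./sketch
    #include <cstdio>

    int main() {
        // hypothetical hybrid model; all numbers are invented for illustration
        const int n_layer_all        = 48; // model.hparams.n_layer
        const int n_layer_recr       = 36; // recurrent / linear-attention layers
        const int n_layer_expert     =  4; // assumed layers with no attn_v/attn_qkv/attn_kv_b tensor
        const int pruned_attention_w =  0; // nothing pruned in this example

        // what the quantizer would actually count while walking the tensors
        const int n_attention_wv = n_layer_all - n_layer_recr - n_layer_expert;

        // what the removed GGML_ASSERT expected that count to be
        const int expected = n_layer_all - pruned_attention_w - n_layer_recr;

        std::printf("n_attention_wv = %d, expected = %d -> %s\n",
                    n_attention_wv, expected,
                    n_attention_wv == expected ? "check passes" : "check would abort");
        return 0;
    }

Running the sketch prints "n_attention_wv = 8, expected = 12 -> check would
abort", i.e. the situation the removed formula could not account for.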