From 0ecbb8c35c32ec85ceae03bc7050745b0452624a Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Dec 2025 11:15:16 +0100
Subject: [PATCH 1/2] llama : remove quantization sanity check

This commit removes the quantization sanity check for attention layers.

The motivation for this is that there are hybrid models that have
recurrent layers, expert layers, and attention layers. For these models
the current check fails, as the expert layers are not taken into
account.

After consideration, it was decided that this check is not strictly
necessary, and it can be removed to allow for more flexible model
architectures.
---
 src/llama-quant.cpp | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 764833749ec..5f538bd9949 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -723,24 +723,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        int32_t n_layer_all = model.hparams.n_layer;
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_all is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_all += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;

From 00c5a718960cd3c4f5934927ef23e5183c0fe55f Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Fri, 5 Dec 2025 14:39:17 +0100
Subject: [PATCH 2/2] llama : remove unused pruned_attention_w and is_clip_model vars

---
 src/llama-quant.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 5f538bd9949..351dcb7baaa 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -666,7 +666,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     std::map mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector tensors;
@@ -674,11 +673,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,8 +710,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
            qs.has_output = true;
        }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
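
For reference, a minimal standalone sketch of the counting identity that the
removed GGML_ASSERT encoded. The layer breakdown below is invented for
illustration (the patch does not name a specific model); it assumes, per the
commit message, that a hybrid model can contain layers the formula never
subtracts, labelled here as expert-only layers, so the two sides of the
identity disagree and the old check would abort quantization.

    // sketch.cpp: the arithmetic behind the removed sanity check
    // build: g++ -std=c++17 sketch.cpp -o sketch && ./sketch
    #include <cstdio>

    int main() {
        // hypothetical hybrid model; all numbers are invented for illustration
        const int n_layer_all        = 48; // model.hparams.n_layer
        const int n_layer_recr       = 36; // recurrent / linear-attention layers
        const int n_layer_expert     =  4; // assumed layers with no attn_v/attn_qkv/attn_kv_b tensor
        const int pruned_attention_w =  0; // nothing pruned in this example

        // what the quantizer would actually count while walking the tensors
        const int n_attention_wv = n_layer_all - n_layer_recr - n_layer_expert;

        // what the removed GGML_ASSERT expected that count to be
        const int expected = n_layer_all - pruned_attention_w - n_layer_recr;

        std::printf("n_attention_wv = %d, expected = %d -> %s\n",
                    n_attention_wv, expected,
                    n_attention_wv == expected ? "check passes" : "check would abort");
        return 0;
    }

Running the sketch prints "n_attention_wv = 8, expected = 12 -> check would
abort", i.e. the situation the removed formula could not account for.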