From 392061ca801c4332c3cb13abe4ccb398b6238651 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Wed, 30 Jul 2025 23:23:23 +0100 Subject: [PATCH 1/2] Skip overriding when tensor is in fallback mode --- src/llama-quant.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a00af7a1d1758..5016e4543505c 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -875,9 +875,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // get more optimal quantization type based on the tensor shape, layer, etc. if (!params->pure && ggml_is_quantized(default_type)) { + int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); // unless the user specifies a type - if (params->tensor_types) { + if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); const std::string tensor_name(tensor->name); for (const auto & [tname, qtype] : tensor_types) { From b679810167f18a37c1d386fde1e533364b301848 Mon Sep 17 00:00:00 2001 From: Ed Addario Date: Thu, 31 Jul 2025 13:04:51 +0100 Subject: [PATCH 2/2] Update comment --- src/llama-quant.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 5016e4543505c..0756bf09b8841 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -877,7 +877,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: if (!params->pure && ggml_is_quantized(default_type)) { int fallback = qs.n_fallback; new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); - // unless the user specifies a type + // unless the user specifies a type, and the tensor geometry will not require fallback quantisation if (params->tensor_types && qs.n_fallback - fallback == 0) { const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types); const std::string 
tensor_name(tensor->name); @@ -891,7 +891,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } } } - if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { new_type = params->token_embedding_type; }