From 0a60230caec724f427f48692b0a6fdf737962445 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Sat, 29 Nov 2025 02:14:51 +0100
Subject: [PATCH 1/4] Override SSM_A op for Qwen3 Next to reduce splits

---
 src/llama-model.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index c2a545531a9..acf87ac82ec 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -2437,6 +2437,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
         }
 
+        if (arch == LLM_ARCH_QWEN3NEXT && tn_tensor == LLM_TENSOR_SSM_A) {
+            info.op = GGML_OP_MUL; // override SSM_SCAN default
+        }
+
         // skip unused tensors
         if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
             const size_t nbytes = ggml_nbytes(t_meta);

From bac8dedec7e009f2d4b11ad88d71eb753ee86046 Mon Sep 17 00:00:00 2001
From: Piotr Wilkin
Date: Mon, 1 Dec 2025 15:20:21 +0100
Subject: [PATCH 2/4] New tensor mapping SSM_A_NOSCAN for SSM_A used outside of
 OP_SSM_SCAN context.

---
 src/llama-arch.cpp  | 3 ++-
 src/llama-arch.h    | 1 +
 src/llama-model.cpp | 7 ++-----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 8571a2e025a..a6d83babf88 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -853,7 +853,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
             { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
-            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
+            { LLM_TENSOR_SSM_A_NOSCAN, "blk.%d.ssm_a" },
             { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
             { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
             { LLM_TENSOR_SSM_BETA_ALPHA, "blk.%d.ssm_ba" },
@@ -2611,6 +2611,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_FFN_ACT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_DIV}},
     {LLM_TENSOR_SSM_CONV1D, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
     {LLM_TENSOR_SSM_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_SCAN}},
+    {LLM_TENSOR_SSM_A_NOSCAN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, // a version of SSM_A used for MUL instead of SSM_SCAN
     {LLM_TENSOR_SSM_DT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_B_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_SSM_C_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 150646478ae..58011bbb48c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -377,6 +377,7 @@ enum llm_tensor {
     LLM_TENSOR_SSM_DT,
     LLM_TENSOR_SSM_DT_NORM,
     LLM_TENSOR_SSM_A,
+    LLM_TENSOR_SSM_A_NOSCAN, // qwen3next special case with MUL instead of SSM_SCAN
     LLM_TENSOR_SSM_B_NORM,
     LLM_TENSOR_SSM_C_NORM,
     LLM_TENSOR_SSM_D,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index acf87ac82ec..db9be851403 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1,5 +1,6 @@
 #include "llama-model.h"
 
+#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-mmap.h"
 #include "llama-cparams.h"
@@ -2437,10 +2438,6 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             throw std::runtime_error(format("missing tensor info mapping for %s", tn.str().c_str()));
         }
 
-        if (arch == LLM_ARCH_QWEN3NEXT && tn_tensor == LLM_TENSOR_SSM_A) {
-            info.op = GGML_OP_MUL; // override SSM_SCAN default
-        }
-
         // skip unused tensors
         if (info.op == GGML_OP_NONE || flags & TENSOR_SKIP) {
             const size_t nbytes = ggml_nbytes(t_meta);
@@ -6491,7 +6488,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), { hparams.ssm_dt_rank }, 0);
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);

From e79bae0ec95e4aaf71830ab34c8e97c896c1576a Mon Sep 17 00:00:00 2001
From: "Piotr Wilkin (ilintar)"
Date: Mon, 1 Dec 2025 15:49:05 +0100
Subject: [PATCH 3/4] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index db9be851403..b401412e5a0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -6488,7 +6488,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), { n_embd, qkvz_dim }, 0);
                     layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), { hparams.ssm_d_conv, conv_dim }, 0);
                     layer.ssm_dt = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), { hparams.ssm_dt_rank }, 0);
-                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
+                    layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A_NOSCAN, i), { hparams.ssm_dt_rank }, 0);
                     layer.ssm_beta_alpha = create_tensor(tn(LLM_TENSOR_SSM_BETA_ALPHA, "weight", i), { n_embd, ba_dim }, 0);
                     layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), { head_v_dim }, 0);
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), { value_dim, n_embd }, 0);

From 5f93ef654893817639bf4ea1dc6777ff5a1799ea Mon Sep 17 00:00:00 2001
From: "Piotr Wilkin (ilintar)"
Date: Mon, 1 Dec 2025 15:49:13 +0100
Subject: [PATCH 4/4] Update src/llama-model.cpp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Sigbjørn Skjæret
---
 src/llama-model.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index b401412e5a0..f421fc9f14f 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1,6 +1,5 @@
 #include "llama-model.h"
 
-#include "llama-arch.h"
 #include "llama-impl.h"
 #include "llama-mmap.h"
 #include "llama-cparams.h"
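Note on the op mapping (a sketch under stated assumptions, not part of the patches): llama.cpp picks a buffer type for each weight by checking whether a backend supports the op recorded for that tensor in LLM_TENSOR_INFOS. Qwen3 Next consumes ssm_a through a plain elementwise multiply rather than through ggml_ssm_scan, so mapping it as LLM_TENSOR_SSM_A_NOSCAN with GGML_OP_MUL advertises the op that actually touches the weight; the weight can then be placed on a backend that runs that op, which is the intended split reduction, and the override stays declarative in llama-arch.cpp instead of being special-cased in load_tensors as in the first patch. The ggml snippet below is only a minimal, hypothetical illustration of a weight consumed by GGML_OP_MUL; the names, sizes, and the exp() gating are placeholders, not the actual Qwen3 Next graph code.

    /* mul_usage_sketch.c - hypothetical build-only example of a per-head
       vector consumed by GGML_OP_MUL (the op the NOSCAN mapping advertises) */
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024, // small scratch pool for the toy graph
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const int64_t n_head = 8; // placeholder head count

        // stand-ins for ssm_a (per-head decay) and ssm_dt (per-head dt bias)
        struct ggml_tensor * ssm_a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head);
        struct ggml_tensor * ssm_dt = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_head);

        // the weight is an operand of GGML_OP_MUL (elementwise), not GGML_OP_SSM_SCAN
        struct ggml_tensor * gate = ggml_exp(ctx, ggml_mul(ctx, ssm_a, ssm_dt));

        // build the graph only; this is the op a backend would be asked to support
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, gate);

        ggml_free(ctx);
        return 0;
    }

If the weight instead kept the default SSM_A -> GGML_OP_SSM_SCAN mapping, a backend without SSM_SCAN support would presumably decline it and the weight would fall back to host memory, adding a graph split around each use even though the backend could have run the multiply.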