From c0dfae723d07774b8057f7e238ece404870bb333 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sat, 18 Oct 2025 22:34:39 +0200
Subject: [PATCH 1/2] add missing norm topk bias

---
 src/llama-graph.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index f29a1e98c9103..4d86f505f536b 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -981,6 +981,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
+        weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
+        cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
 

From 39de13276f0223aa196b66e47d3fd8ac2d10afb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Mon, 20 Oct 2025 12:24:25 +0200
Subject: [PATCH 2/2] use clamping instead, update number and add comment

---
 src/llama-graph.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 4d86f505f536b..81521f569afba 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -981,8 +981,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
         cb(weights_sum, "ffn_moe_weights_sum", il);
 
-        weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
-        cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+        // Avoid division by zero, clamp to smallest number representable by F16
+        weights_sum = ggml_clamp(ctx0, weights_sum, 6.103515625e-5, INFINITY);
+        cb(weights_sum, "ffn_moe_weights_sum_clamped", il);
 
         weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
         cb(weights, "ffn_moe_weights_norm", il);
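
Note: the clamp bound 6.103515625e-5 is 2^-14, the smallest positive normal
F16 value (F16 subnormals go lower, so the in-code comment is shorthand).
Below is a minimal standalone C++ sketch of the same guard outside the ggml
graph, for illustration only; the helper name normalize_expert_weights is
hypothetical and not llama.cpp API:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Smallest positive normal F16 value, 2^-14 = 6.103515625e-5:
    // the same lower bound the patch passes to ggml_clamp.
    constexpr float kF16MinNormal = 6.103515625e-5f;

    // Normalize one token's top-k expert weights to sum to 1, clamping
    // the denominator so an all-zero row yields 0/eps instead of 0/0 = NaN.
    void normalize_expert_weights(std::vector<float> & w) {
        float sum = 0.0f;
        for (float v : w) sum += v;
        sum = std::fmax(sum, kF16MinNormal); // clamp(sum, 2^-14, +inf)
        for (float & v : w) v /= sum;
    }

    int main() {
        std::vector<float> w = {0.0f, 0.0f}; // degenerate router output
        normalize_expert_weights(w);
        printf("%g %g\n", w[0], w[1]);                 // finite, not NaN
        printf("2^-14 = %g\n", std::ldexp(1.0, -14));  // confirms the bound
        return 0;
    }

The clamp approach keeps the denominator exact whenever the sum is already
representable, whereas the first patch's additive 1e-20 bias would perturb
every sum and underflows to zero in F16 anyway.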