From 385b1f057d90b9200fecb2bb2d2f08564a8b8337 Mon Sep 17 00:00:00 2001 From: Aman Gupta Date: Mon, 1 Dec 2025 11:59:00 +0800 Subject: [PATCH] llama-graph: avoid expand_forward for fusion --- ggml/src/ggml-cuda/ggml-cuda.cu | 2 +- src/llama-graph.cpp | 6 ------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index fa7e1e13a71..eb2e273110d 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3274,7 +3274,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx GGML_LOG_DEBUG("Setting stream no to %d for node %s\n", cuda_ctx->curr_stream_no, node->name); } } - prev_i = i; #ifdef GGML_CUDA_DEBUG const int nodes_fused = i - prev_i - 1; @@ -3282,6 +3281,7 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx GGML_LOG_INFO("nodes_fused: %d\n", nodes_fused); } #endif + prev_i = i; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 1d012e09aba..452a956c43d 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -810,9 +810,6 @@ ggml_tensor * llm_graph_context::build_ffn( GGML_ABORT("fatal error"); } - //expand here so that we can fuse ffn gate - ggml_build_forward_expand(gf, cur); - if (gate && type_gate == LLM_FFN_PAR) { cur = ggml_mul(ctx0, cur, tmp); cb(cur, "ffn_gate_par", il); @@ -1093,9 +1090,6 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } - //expand here so that we can fuse ffn gate - ggml_build_forward_expand(gf, cur); - experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il);