From 7d51323240b8bf7dd82f2523759397aac08af97d Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Wed, 12 Nov 2025 10:58:00 +0800
Subject: [PATCH 01/14] hexagon: add buffer support checks for hexagon sessions

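---
Review note (below the "---" marker, so it is not part of the commit message):
  The parameter pack is named "_TBuffers". Identifiers that begin with an
  underscore followed by an uppercase letter are reserved for the C++
  implementation, so a plain name such as "TBuffers" would be safer.
  The overload that takes only "sess" ends the recursion and returns true.
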
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index cabd301ad35..4b9aafa9e1d 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1912,6 +1912,21 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
     return true;
 }
 
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess) {
+    return true;
+}
+
+template <typename... _TBuffers>
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess,
+                                        ggml_backend_buffer_t               buffer,
+                                        _TBuffers... buffers) {
+    if (buffer && (!ggml_backend_buffer_is_hexagon(buffer) || ggml_backend_hexagon_buffer_get_sess(buffer) != sess)) {
+        return false;
+    }
+
+    return hex_supported_buffer(sess, buffers...);
+}
+
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -1959,16 +1974,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
     }
 
     // src0 & src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
         return false;
     }
 

From fc68ce48aa806b587bc8d8e622bc6028ea1302f0 Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Wed, 12 Nov 2025 11:03:37 +0800
Subject: [PATCH 02/14] refactor: simplify buffer support checks in hexagon
 operations

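---
Review note (below the "---" marker, so it is not part of the commit message):
  The removed code treated some operands as optional: it tested
  "src1 && src1->buffer" in the activations and softmax checks and
  "src2 && src2->buffer" in the rope check. The replacement calls pass
  "src1->buffer" / "src2->buffer" unconditionally, so at this commit a NULL
  src1 (activations, softmax) or NULL src2 (rope) is dereferenced while the
  arguments of hex_supported_buffer() are being evaluated.
  PATCH 03/14 restores the guard by passing the tensors themselves and
  testing "t && t->buffer". Squash 02 and 03 if every commit in the series
  has to be safe to build and bisect.
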
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 85 +++-----------------------
 1 file changed, 7 insertions(+), 78 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 4b9aafa9e1d..99087ab890e 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2022,20 +2022,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
 
     // src0 (weights) must be repacked and mapped to the same session
     // src1 & sr2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) {
         return false;
     }
 
@@ -2069,16 +2056,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
         return false;
     }
 
@@ -2110,20 +2088,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) {
         return false;
     }
 
@@ -2150,12 +2115,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
     }
 
     // src0 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, dst->buffer)) {
         return false;
     }
 
@@ -2192,16 +2152,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1 && src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
         return false;
     }
 
@@ -2254,16 +2205,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1 && src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
         return false;
     }
 
@@ -2318,20 +2260,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
     }
 
     // src0, src1, src2 & dst must be mapped to the same session
-    if (src0->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src0->buffer) || ggml_backend_hexagon_buffer_get_sess(src0->buffer) != sess)) {
-        return false;
-    }
-    if (src1->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src1->buffer) || ggml_backend_hexagon_buffer_get_sess(src1->buffer) != sess)) {
-        return false;
-    }
-    if (src2 && src2->buffer &&
-        (!ggml_backend_buffer_is_hexagon(src2->buffer) || ggml_backend_hexagon_buffer_get_sess(src2->buffer) != sess)) {
-        return false;
-    }
-    if (dst->buffer &&
-        (!ggml_backend_buffer_is_hexagon(dst->buffer) || ggml_backend_hexagon_buffer_get_sess(dst->buffer) != sess)) {
+    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) {
         return false;
     }
 

From d1bfaddb34d848ef3f45d43491a217e9866a3888 Mon Sep 17 00:00:00 2001
From: Hongrui Chen <chraac@gmail.com>
Date: Wed, 12 Nov 2025 22:10:38 +0800
Subject: [PATCH 03/14] hexagon: update buffer support checks to use tensor
 structure

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 27 +++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 99087ab890e..f9d23bc92f3 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1916,15 +1916,16 @@ static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess
     return true;
 }
 
-template <typename... _TBuffers>
+template <typename... _TTensor>
 static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess,
-                                        ggml_backend_buffer_t               buffer,
-                                        _TBuffers... buffers) {
-    if (buffer && (!ggml_backend_buffer_is_hexagon(buffer) || ggml_backend_hexagon_buffer_get_sess(buffer) != sess)) {
+                                        const ggml_tensor *                 t,
+                                        _TTensor... tensors) {
+    if (t && t->buffer &&
+        (!ggml_backend_buffer_is_hexagon(t->buffer) || ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess)) {
         return false;
     }
 
-    return hex_supported_buffer(sess, buffers...);
+    return hex_supported_buffer(sess, tensors...);
 }
 
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {
@@ -1974,7 +1975,7 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s
     }
 
     // src0 & src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2022,7 +2023,7 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session
 
     // src0 (weights) must be repacked and mapped to the same session
     // src1 & sr2 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 
@@ -2056,7 +2057,7 @@ static bool ggml_hexagon_supported_binary(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2088,7 +2089,7 @@ static bool ggml_hexagon_supported_add_id(const struct ggml_hexagon_session * se
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 
@@ -2115,7 +2116,7 @@ static bool ggml_hexagon_supported_unary(const struct ggml_hexagon_session * ses
     }
 
     // src0 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, dst)) {
         return false;
     }
 
@@ -2152,7 +2153,7 @@ static bool ggml_hexagon_supported_activations(const struct ggml_hexagon_session
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2205,7 +2206,7 @@ static bool ggml_hexagon_supported_softmax(const struct ggml_hexagon_session * s
     }
 
     // src0, src1 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, dst)) {
         return false;
     }
 
@@ -2260,7 +2261,7 @@ static bool ggml_hexagon_supported_rope(const struct ggml_hexagon_session * sess
     }
 
     // src0, src1, src2 & dst must be mapped to the same session
-    if (!hex_supported_buffer(sess, src0->buffer, src1->buffer, src2->buffer, dst->buffer)) {
+    if (!hex_supported_buffer(sess, src0, src1, src2, dst)) {
         return false;
     }
 

From 02bb8fcd1cd21c870c95b3e34d56836df0119d07 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 20 Nov 2025 12:04:11 +0800
Subject: [PATCH 04/14] refactor: streamline buffer initialization for DSP
 queue in hexagon operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 103 +++++++++++--------------
 1 file changed, 47 insertions(+), 56 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index f9d23bc92f3..57f15e4ff13 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2282,6 +2282,45 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[3] = t->nb[3];
 }
 
+template <size_t _Cnt>
+static ggml_hexagon_session * dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt],
+                                                    const bool                                 is_src0_static,
+                                                    const ggml_tensor *                        dst,
+                                                    std::initializer_list<const ggml_tensor *> srcs) {
+    GGML_ASSERT(_Cnt == srcs.size() + 1);
+    GGML_ASSERT(srcs.size() > 0);
+
+    constexpr const auto buffer_init = [](dspqueue_buffer * buffer, const ggml_tensor * t, uint32_t flags) {
+        if (!t || !t->buffer) {
+            return;
+        }
+
+        auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+        buffer->fd      = tensor_buf->fd;
+        buffer->ptr     = t->data;
+        buffer->offset  = (uint8_t *) t->data - tensor_buf->base;
+        buffer->size    = ggml_nbytes(t);
+        buffer->flags   = flags;
+    };
+
+    memset(bufs, 0, sizeof(bufs));
+    buffer_init(&bufs[_Cnt - 1], dst,
+                DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);  // Output buffer: flush CPU caches
+
+    {
+        uint32_t src_flags = is_src0_static ? 0 :
+                                              (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
+                                               DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+        for (size_t i = 0; i < srcs.size(); i++) {
+            buffer_init(&bufs[i], srcs.begin()[i], src_flags);
+            src_flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
+                         DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+        }
+    }
+
+    return static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context)->sess;
+}
+
 static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
     auto buf  = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
     auto sess = buf->sess;
@@ -2296,10 +2335,6 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     const struct ggml_tensor * src1 = op->src[1];
     const struct ggml_tensor * dst  = op;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1, t2;
     t1 = ggml_time_us();
 
@@ -2320,40 +2355,20 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
-    dspqueue_buffer bufs[3];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = 0;
-
+    //
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    //
     // Third buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[2].fd     = dst_buf->fd;
-    bufs[2].ptr    = dst->data;
-    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[2].size   = ggml_nbytes(dst);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-
-    // Primary DSP session from the src0 (normally weight) tensor
-    auto sess = src0_buf->sess;
+    dspqueue_buffer bufs[3];
+    auto sess = dspqueue_buffers_init(bufs, true, dst, { src0, src1 });
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];
@@ -2517,9 +2532,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src1 = node->src[1];
     const struct ggml_tensor * dst  = node;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
 
     uint64_t t1 = 0;
     uint64_t t2 = 0;
@@ -2556,45 +2568,24 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.src1, src1);
     init_htp_tensor(&req.dst, dst);
 
-    dspqueue_buffer bufs[3];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = First Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-
+    //
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    //
     // Third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[2].fd     = dst_buf->fd;
-    bufs[2].ptr    = dst->data;
-    bufs[2].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[2].size   = ggml_nbytes(dst);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-
-    // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    dspqueue_buffer bufs[3];
+    auto            sess = dspqueue_buffers_init(bufs, false, dst, { src0, src1 });
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];

From 0d9bda3c2c56c3e3b3881e0ed81f1ea23e0240c9 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 20 Nov 2025 21:33:31 +0800
Subject: [PATCH 05/14] refactor: simplify buffer initialization in DSP queue
 for hexagon operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 242 +++++++------------------
 1 file changed, 64 insertions(+), 178 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 57f15e4ff13..b4f1ff42cf7 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2283,18 +2283,14 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
 }
 
 template <size_t _Cnt>
-static ggml_hexagon_session * dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt],
-                                                    const bool                                 is_src0_static,
-                                                    const ggml_tensor *                        dst,
-                                                    std::initializer_list<const ggml_tensor *> srcs) {
+static size_t dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt],
+                                    const bool                                 is_src0_static,
+                                    const ggml_tensor *                        dst,
+                                    std::initializer_list<const ggml_tensor *> srcs) {
     GGML_ASSERT(_Cnt == srcs.size() + 1);
     GGML_ASSERT(srcs.size() > 0);
 
     constexpr const auto buffer_init = [](dspqueue_buffer * buffer, const ggml_tensor * t, uint32_t flags) {
-        if (!t || !t->buffer) {
-            return;
-        }
-
         auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
         buffer->fd      = tensor_buf->fd;
         buffer->ptr     = t->data;
@@ -2304,21 +2300,34 @@ static ggml_hexagon_session * dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt
     };
 
     memset(bufs, 0, sizeof(bufs));
-    buffer_init(&bufs[_Cnt - 1], dst,
-                DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);  // Output buffer: flush CPU caches
 
-    {
-        uint32_t src_flags = is_src0_static ? 0 :
-                                              (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                                               DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        for (size_t i = 0; i < srcs.size(); i++) {
-            buffer_init(&bufs[i], srcs.begin()[i], src_flags);
-            src_flags = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                         DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    const uint32_t src0_flags = is_src0_static ? 0 :
+                                                 (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
+                                                  DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
+    buffer_init(&bufs[0], srcs.begin()[0], src0_flags);
+
+    size_t n_bufs = 1;
+    for (size_t i = 1; i < srcs.size(); i++) {
+        auto * src = srcs.begin()[i];
+        if (!src) {
+            continue;
         }
+
+        buffer_init(&bufs[n_bufs], srcs.begin()[i],
+                    DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |
+                        DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Flush CPU and Invalidate DSP
+        n_bufs++;
     }
 
-    return static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context)->sess;
+    buffer_init(&bufs[n_bufs], dst,
+                DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);  // Output buffer: flush CPU caches
+    n_bufs++;
+
+    return n_bufs;
+}
+
+static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
+    return static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context)->sess;
 }
 
 static void hex_dump_dspbuf(const struct ggml_tensor * t, const dspqueue_buffer * d) {
@@ -2368,7 +2377,9 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
     dspqueue_buffer bufs[3];
-    auto sess = dspqueue_buffers_init(bufs, true, dst, { src0, src1 });
+    dspqueue_buffers_init(bufs, true, dst, { src0, src1 });
+
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];
@@ -2414,11 +2425,6 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     const struct ggml_tensor * src2 = op->src[2];
     const struct ggml_tensor * dst  = op;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1, t2;
     t1 = ggml_time_us();
 
@@ -2440,51 +2446,27 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
-    dspqueue_buffer bufs[4];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = 0;
-
+    //
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    //
     // Third buffer expert IDs. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    bufs[2].fd     = src2_buf->fd;
-    bufs[2].ptr    = src2->data;
-    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
-    bufs[2].size   = ggml_nbytes(src2);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
+    //
     // Forth buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    bufs[3].fd     = dst_buf->fd;
-    bufs[3].ptr    = dst->data;
-    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[3].size   = ggml_nbytes(dst);
-    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffer bufs[4];
+    dspqueue_buffers_init(bufs, true, dst, { src0, src1, src2 });
 
-    // Primary DSP session from the src0 (normally weight) tensor
-    auto sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];
@@ -2532,7 +2514,6 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src1 = node->src[1];
     const struct ggml_tensor * dst  = node;
 
-
     uint64_t t1 = 0;
     uint64_t t2 = 0;
 
@@ -2585,7 +2566,9 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
     dspqueue_buffer bufs[3];
-    auto            sess = dspqueue_buffers_init(bufs, false, dst, { src0, src1 });
+    dspqueue_buffers_init(bufs, false, dst, { src0, src1 });
+
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];
@@ -2632,11 +2615,6 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     const struct ggml_tensor * src2 = node->src[2];
     const struct ggml_tensor * dst  = node;
 
-    auto src0_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    auto src1_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    auto src2_buf = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-    auto dst_buf  = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-
     uint64_t t1 = 0;
     uint64_t t2 = 0;
 
@@ -2667,42 +2645,14 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.src2, src2);
     init_htp_tensor(&req.dst, dst);
 
-    dspqueue_buffer bufs[4];
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = input activations
-    bufs[0].fd     = src0_buf->fd;
-    bufs[0].ptr    = src0->data;
-    bufs[0].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[0].size   = ggml_nbytes(src0);
-    bufs[0].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-
     // Second buffer = experts bias
-    bufs[1].fd     = src1_buf->fd;
-    bufs[1].ptr    = src1->data;
-    bufs[1].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[1].size   = ggml_nbytes(src1);
-    bufs[1].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
     // Third buffer = activated experts
-    bufs[2].fd     = src2_buf->fd;
-    bufs[2].ptr    = src2->data;
-    bufs[2].offset = (uint8_t *) src2->data - src2_buf->base;
-    bufs[2].size   = ggml_nbytes(src2);
-    bufs[2].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                     DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-
     // Forth buffer = output activations
-    bufs[3].fd     = dst_buf->fd;
-    bufs[3].ptr    = dst->data;
-    bufs[3].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[3].size   = ggml_nbytes(dst);
-    bufs[3].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
+    dspqueue_buffer bufs[4];
+    dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 });
 
-    // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];
@@ -2812,56 +2762,28 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
-    dspqueue_buffer bufs[3];
-    int             n_bufs = 0;
-
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer = Only Operand of Unary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src0_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    bufs[n_bufs].fd     = src0_buf->fd;
-    bufs[n_bufs].ptr    = src0->data;
-    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src0);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-    ++n_bufs;
-
-    if (src1) {
-        // Second buffer = Second Operand of Binary op
-        // This is a buffer that the CPU writes and the DSP reads, so we'll
-        // need to flush CPU caches and invalidate DSP ones. On platforms
-        // with I/O coherency support the framework will automatically skip
-        // cache operations where possible.
-        auto src1_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-        bufs[n_bufs].fd     = src1_buf->fd;
-        bufs[n_bufs].ptr    = src1->data;
-        bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
-        bufs[n_bufs].size   = ggml_nbytes(src1);
-        bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                              DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        ++n_bufs;
-    }
-
+    //
+    // Second buffer(nullable) = Second Operand of Binary op
+    // This is a buffer that the CPU writes and the DSP reads, so we'll
+    // need to flush CPU caches and invalidate DSP ones. On platforms
+    // with I/O coherency support the framework will automatically skip
+    // cache operations where possible.
+    //
     // Second or third buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    auto dst_buf        = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-    bufs[n_bufs].fd     = dst_buf->fd;
-    bufs[n_bufs].ptr    = dst->data;
-    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(dst);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-    ++n_bufs;
+    dspqueue_buffer bufs[3];
+    size_t          n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1 });
 
     // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];
@@ -2949,70 +2871,34 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
-    dspqueue_buffer bufs[4];
-    int             n_bufs = 0;
-
-    memset(bufs, 0, sizeof(bufs));
-
     // First buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src0_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src0->buffer->context);
-    bufs[n_bufs].fd     = src0_buf->fd;
-    bufs[n_bufs].ptr    = src0->data;
-    bufs[n_bufs].offset = (uint8_t *) src0->data - src0_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src0);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP;
-    ++n_bufs;
-
+    //
     // Second buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    auto src1_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src1->buffer->context);
-    bufs[n_bufs].fd     = src1_buf->fd;
-    bufs[n_bufs].ptr    = src1->data;
-    bufs[n_bufs].offset = (uint8_t *) src1->data - src1_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(src1);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                          DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-    ++n_bufs;
-
-    if (src2) {
-        // Third buffer
-        // This is a buffer that the CPU writes and the DSP reads, so we'll
-        // need to flush CPU caches and invalidate DSP ones. On platforms
-        // with I/O coherency support the framework will automatically skip
-        // cache operations where possible.
-        auto src2_buf       = static_cast<ggml_backend_hexagon_buffer_context *>(src2->buffer->context);
-        bufs[n_bufs].fd     = src2_buf->fd;
-        bufs[n_bufs].ptr    = src2->data;
-        bufs[n_bufs].offset = (uint8_t *) src2->data - src2_buf->base;
-        bufs[n_bufs].size   = ggml_nbytes(src2);
-        bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |         // Flush CPU
-                              DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-        ++n_bufs;
-    }
-
+    //
+    // Third buffer(nullable)
+    // This is a buffer that the CPU writes and the DSP reads, so we'll
+    // need to flush CPU caches and invalidate DSP ones. On platforms
+    // with I/O coherency support the framework will automatically skip
+    // cache operations where possible.
+    //
     // Final buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    auto dst_buf        = static_cast<ggml_backend_hexagon_buffer_context *>(dst->buffer->context);
-    bufs[n_bufs].fd     = dst_buf->fd;
-    bufs[n_bufs].ptr    = dst->data;
-    bufs[n_bufs].offset = (uint8_t *) dst->data - dst_buf->base;
-    bufs[n_bufs].size   = ggml_nbytes(dst);
-    bufs[n_bufs].flags  = (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);
-    ++n_bufs;
+    dspqueue_buffer bufs[4];
+    size_t          n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 });
 
     // Primary DSP session from the src0 tensor
-    ggml_hexagon_session * sess = src0_buf->sess;
+    auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
         char dims[64 * GGML_MAX_SRC];

From 0dfae43c18ef1d0de2b907aa00e1847ad6ba64e1 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Thu, 20 Nov 2025 22:08:55 +0800
Subject: [PATCH 06/14] refactor: optimize hex_supported_buffer function with a
 fold expression

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index b4f1ff42cf7..dd667f86653 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1912,20 +1912,13 @@ static bool hex_supported_dims(const struct ggml_tensor * x, const struct ggml_t
     return true;
 }
 
-static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess) {
-    return true;
-}
-
 template <typename... _TTensor>
-static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess,
-                                        const ggml_tensor *                 t,
-                                        _TTensor... tensors) {
-    if (t && t->buffer &&
-        (!ggml_backend_buffer_is_hexagon(t->buffer) || ggml_backend_hexagon_buffer_get_sess(t->buffer) != sess)) {
-        return false;
-    }
-
-    return hex_supported_buffer(sess, tensors...);
+static inline bool hex_supported_buffer(const struct ggml_hexagon_session * sess, _TTensor... tensors) {
+    return ([&]() -> bool {
+        return !tensors || !tensors->buffer ||
+               (ggml_backend_buffer_is_hexagon(tensors->buffer) &&
+                ggml_backend_hexagon_buffer_get_sess(tensors->buffer) == sess);
+    }() && ...);
 }
 
 static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) {

From b886fafe0107b0450e5de88189c6ad1816e61c41 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 21 Nov 2025 00:41:59 +0800
Subject: [PATCH 07/14] wip

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 108 ++++++-------------------
 1 file changed, 23 insertions(+), 85 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index dd667f86653..1f1e621a4cd 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -240,6 +240,23 @@ struct ggml_hexagon_session {
     uint32_t         prof_pkts;
 };
 
+static inline void hex_print_op_info(const ggml_tensor * op, ggml_hexagon_session * sess, const uint32_t req_flags) {
+    char dims[64 * GGML_MAX_SRC];
+    char strides[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+    char buffs[64 * GGML_MAX_SRC];
+    char names[64 * GGML_MAX_SRC];
+
+    hex_format_op_dims(dims, op);
+    hex_format_op_strides(strides, op);
+    hex_format_op_types(types, op);
+    hex_format_op_buffs(buffs, op);
+    hex_format_op_names(names, op);
+
+    HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
+                names, dims, types, strides, buffs, req_flags);
+}
+
 void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
     // Bump pending flag (cleared in the session::flush once we get the responce)
     this->op_pending++;  // atomic inc
@@ -2375,20 +2392,7 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2462,20 +2466,7 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s: %s : %s : %s : %s : %s: flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
-
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2564,20 +2555,7 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[16 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
-                    ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2648,21 +2626,7 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[16 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(),
-                    ggml_op_name(node->op), names, dims, types, strides, buffs, req.flags);
-
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             hex_dump_dspbuf(src1, &bufs[1]);
@@ -2779,20 +2743,7 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             if (src1) {
@@ -2894,20 +2845,7 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
     auto * sess = get_session_from_tensor(src0);
 
     if (opt_verbose) {
-        char dims[64 * GGML_MAX_SRC];
-        char strides[64 * GGML_MAX_SRC];
-        char types[16 * GGML_MAX_SRC];
-        char buffs[64 * GGML_MAX_SRC];
-        char names[64 * GGML_MAX_SRC];
-
-        hex_format_op_dims(dims, op);
-        hex_format_op_strides(strides, op);
-        hex_format_op_types(types, op);
-        hex_format_op_buffs(buffs, op);
-        hex_format_op_names(names, op);
-
-        HEX_VERBOSE("ggml-hex: %s %s : %s : %s : %s : %s : %s : flags 0x%x\n", sess->name.c_str(), ggml_op_name(op->op),
-                    names, dims, types, strides, buffs, req.flags);
+        hex_print_op_info(op, sess, req.flags);
         if (opt_verbose > 1) {
             hex_dump_dspbuf(src0, &bufs[0]);
             if (src1) {

From 6d330237631a0924f25c3b57d807b9849ea3cd6a Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Fri, 21 Nov 2025 23:40:38 +0800
Subject: [PATCH 08/14] refactor: simplify dspqueue_buffers_init function and
 its usage in hexagon operations

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 118 +++++++++++--------------
 1 file changed, 54 insertions(+), 64 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 1f1e621a4cd..0b4e2c3d4df 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -2292,48 +2292,20 @@ static void init_htp_tensor(htp_tensor * h, const ggml_tensor * t) {
     h->nb[3] = t->nb[3];
 }
 
-template <size_t _Cnt>
-static size_t dspqueue_buffers_init(dspqueue_buffer (&bufs)[_Cnt],
-                                    const bool                                 is_src0_static,
-                                    const ggml_tensor *                        dst,
-                                    std::initializer_list<const ggml_tensor *> srcs) {
-    GGML_ASSERT(_Cnt == srcs.size() + 1);
-    GGML_ASSERT(srcs.size() > 0);
-
-    constexpr const auto buffer_init = [](dspqueue_buffer * buffer, const ggml_tensor * t, uint32_t flags) {
-        auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
-        buffer->fd      = tensor_buf->fd;
-        buffer->ptr     = t->data;
-        buffer->offset  = (uint8_t *) t->data - tensor_buf->base;
-        buffer->size    = ggml_nbytes(t);
-        buffer->flags   = flags;
-    };
-
-    memset(bufs, 0, sizeof(bufs));
-
-    const uint32_t src0_flags = is_src0_static ? 0 :
-                                                 (DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |          // Flush CPU
-                                                  DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Invalidate DSP
-    buffer_init(&bufs[0], srcs.begin()[0], src0_flags);
-
-    size_t n_bufs = 1;
-    for (size_t i = 1; i < srcs.size(); i++) {
-        auto * src = srcs.begin()[i];
-        if (!src) {
-            continue;
-        }
-
-        buffer_init(&bufs[n_bufs], srcs.begin()[i],
-                    DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER |
-                        DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT);  // Flush CPU and Invalidate DSP
-        n_bufs++;
+static size_t dspqueue_buffers_init(dspqueue_buffer * buf, const ggml_tensor * t, bool flush_host, bool flush_htp) {
+    if (!t) {
+        return 0;
     }
 
-    buffer_init(&bufs[n_bufs], dst,
-                DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER);  // Output buffer: flush CPU caches
-    n_bufs++;
-
-    return n_bufs;
+    memset(buf, 0, sizeof(*buf));
+    auto tensor_buf = static_cast<ggml_backend_hexagon_buffer_context *>(t->buffer->context);
+    buf->fd      = tensor_buf->fd;
+    buf->ptr     = t->data;
+    buf->offset  = (uint8_t *) t->data - tensor_buf->base;
+    buf->size    = ggml_nbytes(t);
+    buf->flags   = (flush_host ? DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER : 0);        // Flush CPU
+    buf->flags |= (flush_htp ? DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT : 0);  // Invalidate DSP
+    return 1;
 }
 
 static ggml_hexagon_session * get_session_from_tensor(const ggml_tensor * t) {
@@ -2374,20 +2346,23 @@ static void ggml_hexagon_mul_mat(const struct ggml_tensor * op, uint32_t flags)
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
+    dspqueue_buffer bufs[3];
+
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    //
+    dspqueue_buffers_init(bufs, src0, false, false);
+
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    //
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
+
     // Third buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffer bufs[3];
-    dspqueue_buffers_init(bufs, true, dst, { src0, src1 });
+    dspqueue_buffers_init(&bufs[2], dst, true, false);
 
     auto * sess = get_session_from_tensor(src0);
 
@@ -2443,25 +2418,28 @@ static void ggml_hexagon_mul_mat_id(const struct ggml_tensor * op, uint32_t flag
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
+    dspqueue_buffer bufs[4];
     // First buffer Weights.
     // The content is static, there is no need to do any cache management
-    //
+    dspqueue_buffers_init(bufs, src0, false, false);
+
     // Second buffer Input Activations. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    //
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
+
     // Third buffer expert IDs. This is a buffer that the CPU
     // writes and the DSP reads, so we'll need to flush CPU caches and
     // invalidate DSP ones. On platforms with I/O coherency support the
     // framework will automatically skip cache operations where possible.
-    //
+    dspqueue_buffers_init(&bufs[2], src2, true, true);
+
     // Forth buffer Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffer bufs[4];
-    dspqueue_buffers_init(bufs, true, dst, { src0, src1, src2 });
+    dspqueue_buffers_init(&bufs[3], dst, true, false);
 
     auto * sess = get_session_from_tensor(src0);
 
@@ -2533,24 +2511,26 @@ static void ggml_hexagon_binary(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.src1, src1);
     init_htp_tensor(&req.dst, dst);
 
+    dspqueue_buffer bufs[3];
     // First buffer = First Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    dspqueue_buffers_init(bufs, src0, true, true);
+
     // Second buffer = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
+
     // Third buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffer bufs[3];
-    dspqueue_buffers_init(bufs, false, dst, { src0, src1 });
+    dspqueue_buffers_init(&bufs[2], dst, true, false);
 
     auto * sess = get_session_from_tensor(src0);
 
@@ -2616,12 +2596,15 @@ static void ggml_hexagon_add_id(const struct ggml_tensor * op, uint32_t flags) {
     init_htp_tensor(&req.src2, src2);
     init_htp_tensor(&req.dst, dst);
 
+    dspqueue_buffer bufs[4];
     // First buffer = input activations
+    dspqueue_buffers_init(bufs, src0, true, true);
     // Second buffer = experts bias
+    dspqueue_buffers_init(&bufs[1], src1, true, true);
     // Third buffer = activated experts
+    dspqueue_buffers_init(&bufs[2], src2, true, true);
     // Forth buffer = output activations
-    dspqueue_buffer bufs[4];
-    dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 });
+    dspqueue_buffers_init(&bufs[3], dst, true, false);  // output: flush CPU only, like the other ops
 
     auto * sess = get_session_from_tensor(src0);
 
@@ -2719,25 +2702,28 @@ static void ggml_hexagon_unary(const struct ggml_tensor * op, uint32_t flags) {
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
+    dspqueue_buffer bufs[3];
+
     // First buffer = Only Operand of Unary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
+
     // Second buffer(nullable) = Second Operand of Binary op
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
+
     // Second or third buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffer bufs[3];
-    size_t          n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1 });
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
 
     // Primary DSP session from the src0 tensor
     auto * sess = get_session_from_tensor(src0);
@@ -2815,31 +2801,35 @@ static void ggml_hexagon_rope(const struct ggml_tensor * op, uint32_t flags) {
         req.flags |= HTP_OPFLAGS_SKIP_COMPUTE;
     }
 
+    dspqueue_buffer bufs[4];
+
     // First buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    size_t n_bufs = dspqueue_buffers_init(bufs, src0, true, true);
+
     // Second buffer
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src1, true, true);
+
     // Third buffer(nullable)
     // This is a buffer that the CPU writes and the DSP reads, so we'll
     // need to flush CPU caches and invalidate DSP ones. On platforms
     // with I/O coherency support the framework will automatically skip
     // cache operations where possible.
-    //
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], src2, true, true);
+
     // Final buffer = Output Activations. We'll handle DSP
     // Second buffer = Output Activations. We'll handle DSP
     // cache maintenance in the response message but need to flush
     // CPU caches to ensure any previously written dirty lines are
     // written out before writes from the DSP start.
-    dspqueue_buffer bufs[4];
-    size_t          n_bufs = dspqueue_buffers_init(bufs, false, dst, { src0, src1, src2 });
+    n_bufs += dspqueue_buffers_init(&bufs[n_bufs], dst, true, false);
 
     // Primary DSP session from the src0 tensor
     auto * sess = get_session_from_tensor(src0);

From 2df16815717dd0483643c2dbd111a11af1cd5105 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 22 Nov 2025 12:20:30 +0800
Subject: [PATCH 09/14] fix: improve NaN handling in
 hvx_vec_fast_sigmoid_fp32_guard

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 5f94645cde3..cdefa7da389 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -773,7 +773,7 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) {
     HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1));
     HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4));
 
-    HVX_Vector res = hvx_vec_inverse_fp32(v5);
+    HVX_Vector res = hvx_vec_inverse_fp32_guard(v5);
     res            = Q6_Vqf32_vmpy_VsfVsf(v3, res);
 
     return Q6_Vsf_equals_Vqf32(res);
@@ -959,13 +959,19 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
 }
 
 static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
-    static const float kMaxExp = -88.02f;  // log(INF)
+    static const float kMinExp = -88.02f;  // 0
+    static const float kMaxExp = 88.02f;   // 1
 
-    const HVX_Vector     max_exp  = Q6_V_vsplat_R(*((uint32_t *) &kMaxExp));
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(v, max_exp);
+    const HVX_Vector     one      = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector     max_exp  = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector     min_exp  = hvx_vec_splat_fp32(kMinExp);
+
+    const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
+    const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
-    return Q6_V_vmux_QVV(pred_inf, out, Q6_V_vzero());
+    out = Q6_V_vmux_QVV(pred_max, out, one);
+    return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
 }
 
 static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {

From b9cf80a2761374f7b8910393a228c23c112c3411 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sat, 22 Nov 2025 23:27:03 +0800
Subject: [PATCH 10/14] refactor: optimize hvx_vec_inverse_fp32_guard for
 better nan handling

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index cdefa7da389..d6fc9bbfc87 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -727,21 +727,15 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
 }
 
 static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
-    static const float    kInf     = INFINITY;
-    static const uint32_t kNanMask = 0x7fffffff;
-    static const uint32_t kNanMin  = 0x7f800000;
-
-    const HVX_Vector     inf      = hvx_vec_splat_fp32(kInf);
-    const HVX_VectorPred pred_inf = Q6_Q_vcmp_gt_VsfVsf(inf, v_sf);
+    static const uint32_t kNanInfMask = 0x7f800000;
 
     HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
 
-    const HVX_Vector     nan_mask   = Q6_V_vsplat_R(kNanMask);
-    const HVX_Vector     nan_min    = Q6_V_vsplat_R(kNanMin);
-    HVX_Vector           masked_out = Q6_V_vand_VV(out, nan_mask);
-    const HVX_VectorPred pred       = Q6_Q_vcmp_gtand_QVuwVuw(pred_inf, nan_min, masked_out);
+    const HVX_Vector     nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
+    HVX_Vector           masked_out   = Q6_V_vand_VV(out, nan_inf_mask);
+    const HVX_VectorPred pred         = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
 
-    return Q6_V_vmux_QVV(pred, out, Q6_V_vzero());
+    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
 }
 
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
@@ -962,15 +956,15 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
     static const float kMinExp = -88.02f;  // 0
     static const float kMaxExp = 88.02f;   // 1
 
-    const HVX_Vector     one      = hvx_vec_splat_fp32(1.f);
-    const HVX_Vector     max_exp  = hvx_vec_splat_fp32(kMaxExp);
-    const HVX_Vector     min_exp  = hvx_vec_splat_fp32(kMinExp);
+    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
 
     const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
     const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
 
     HVX_Vector out = hvx_vec_fast_sigmoid_fp32(v);
-    out = Q6_V_vmux_QVV(pred_max, out, one);
+    out            = Q6_V_vmux_QVV(pred_max, out, one);
     return Q6_V_vmux_QVV(pred_min, out, Q6_V_vzero());
 }
 

From 784fa2d95b4a7f5a9d3aa334b3c660afa9ed6b04 Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 23 Nov 2025 00:12:43 +0800
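Subject: [PATCH 11/14] refactor: update hvx_vec_fast_sigmoid_fp32_guard to use
 adjusted exponent limits

Narrow kMinExp/kMaxExp from -88.02/88.02 to -87/87, and have
hvx_vec_fast_sigmoid_fp32 call hvx_vec_inverse_fp32 directly instead of
hvx_vec_inverse_fp32_guard.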

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index d6fc9bbfc87..3a832604aa9 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -767,7 +767,7 @@ static inline HVX_Vector hvx_vec_fast_sigmoid_fp32(HVX_Vector v) {
     HVX_Vector v4 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_Vqf32Vqf32(v2, v1));
     HVX_Vector v5 = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(v3, v4));
 
-    HVX_Vector res = hvx_vec_inverse_fp32_guard(v5);
+    HVX_Vector res = hvx_vec_inverse_fp32(v5);
     res            = Q6_Vqf32_vmpy_VsfVsf(v3, res);
 
     return Q6_Vsf_equals_Vqf32(res);
@@ -953,8 +953,8 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
 }
 
 static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
-    static const float kMinExp = -88.02f;  // 0
-    static const float kMaxExp = 88.02f;   // 1
+    static const float kMinExp = -87.f;  // 0
+    static const float kMaxExp = 87.f;   // 1
 
     const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
     const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);

From 2d8c6e26b1983b828c994ac79c772fed06c4289b Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 23 Nov 2025 00:19:42 +0800
Subject: [PATCH 12/14] refactor: modify hvx_vec_fast_sigmoid_fp32_guard to
 accept parameters for improved flexibility

---
 ggml/src/ggml-hexagon/htp/hvx-utils.h | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 3a832604aa9..125f08e6fdc 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -952,14 +952,10 @@ static inline HVX_Vector hvx_vec_rsqrt_fp32(HVX_Vector in_vec) {
     return Q6_Vsf_equals_Vqf32(temp);
 }
 
-static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v) {
-    static const float kMinExp = -87.f;  // 0
-    static const float kMaxExp = 87.f;   // 1
-
-    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
-    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
-    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
-
+static inline HVX_Vector hvx_vec_fast_sigmoid_fp32_guard(HVX_Vector v,
+                                                         HVX_Vector one,
+                                                         HVX_Vector max_exp,
+                                                         HVX_Vector min_exp) {
     const HVX_VectorPred pred_max = Q6_Q_vcmp_gt_VsfVsf(max_exp, v);
     const HVX_VectorPred pred_min = Q6_Q_vcmp_gt_VsfVsf(v, min_exp);
 
@@ -977,9 +973,16 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict v_dst       = (HVX_Vector *) dst;
 
+    static const float kMinExp = -87.f;  // 0
+    static const float kMaxExp = 87.f;   // 1
+
+    const HVX_Vector one     = hvx_vec_splat_fp32(1.f);
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector min_exp = hvx_vec_splat_fp32(kMinExp);
+
     #pragma unroll(4)
     for (int i = 0; i < step_of_1; i++) {
-        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i]);
+        v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
     }
 }
 

From 9686ef877f33551a693539801d2ee893a0d217ef Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 23 Nov 2025 00:30:19 +0800
Subject: [PATCH 13/14] refactor: update hvx_vec_exp_fp32_guard to accept
 max_exp and inf parameters to save some instructions

---
 ggml/src/ggml-hexagon/htp/hvx-exp.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-exp.c b/ggml/src/ggml-hexagon/htp/hvx-exp.c
index d0735e9325e..21bf46a542f 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-exp.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-exp.c
@@ -16,13 +16,8 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
-static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec) {
-    static const float kInf    = INFINITY;
-    static const float kMaxExp = 88.02f;  // log(INF)
-
-    const HVX_Vector     max_exp = hvx_vec_splat_fp32(kMaxExp);
-    const HVX_Vector     inf     = hvx_vec_splat_fp32(kInf);
-    const HVX_VectorPred pred0   = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
+static inline HVX_Vector hvx_vec_exp_fp32_guard(HVX_Vector in_vec, HVX_Vector max_exp, HVX_Vector inf) {
+    const HVX_VectorPred pred0 = Q6_Q_vcmp_gt_VsfVsf(in_vec, max_exp);
 
     HVX_Vector out = hvx_vec_exp_fp32(in_vec);
 
@@ -47,6 +42,12 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
     HVX_Vector vec_out = Q6_V_vzero();
 
+    static const float kInf    = INFINITY;
+    static const float kMaxExp = 88.02f;  // log(INF)
+
+    const HVX_Vector max_exp = hvx_vec_splat_fp32(kMaxExp);
+    const HVX_Vector inf     = hvx_vec_splat_fp32(kInf);
+
     if (0 == unaligned_loop) {
         HVX_Vector * p_vec_in1 = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
@@ -55,9 +56,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             if (true == negate) {
                 HVX_Vector neg_vec_in = hvx_vec_neg_fp32(*p_vec_in1++);
-                *p_vec_out++          = hvx_vec_exp_fp32_guard(neg_vec_in);
+                *p_vec_out++          = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
             } else {
-                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++);
+                *p_vec_out++ = hvx_vec_exp_fp32_guard(*p_vec_in1++, max_exp, inf);
             }
         }
     } else {
@@ -67,9 +68,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
 
             if (true == negate) {
                 HVX_Vector neg_vec_in                    = hvx_vec_neg_fp32(in);
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
             } else {
-                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in);
+                *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_exp_fp32_guard(in, max_exp, inf);
             }
         }
     }
@@ -83,9 +84,9 @@ void hvx_exp_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int
         if (true == negate) {
             HVX_Vector neg_vec_in = hvx_vec_neg_fp32(in);
 
-            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in);
+            vec_out = hvx_vec_exp_fp32_guard(neg_vec_in, max_exp, inf);
         } else {
-            vec_out = hvx_vec_exp_fp32_guard(in);
+            vec_out = hvx_vec_exp_fp32_guard(in, max_exp, inf);
         }
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, vec_out);

From 515a00ca6311f6a3f2431326271eb796fb6ebbdf Mon Sep 17 00:00:00 2001
From: chraac <chraac@gmail.com>
Date: Sun, 23 Nov 2025 00:40:17 +0800
Subject: [PATCH 14/14] refactor: move hvx_vec_inverse_fp32_guard
 implementation to hvx-inverse.c for better perf

---
 ggml/src/ggml-hexagon/htp/hvx-inverse.c | 18 +++++++++++++++---
 ggml/src/ggml-hexagon/htp/hvx-utils.h   | 12 ------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.c b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
index 953d3e6c167..4d70634fcd4 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.c
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.c
@@ -16,6 +16,15 @@
 #include "hvx-utils.h"
 #include "ops-utils.h"
 
+static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf, HVX_Vector nan_inf_mask) {  // guarded inverse
+    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
+    // Flag each 32-bit lane of the result in which every bit of nan_inf_mask is set (callers here pass 0x7f800000).
+    HVX_Vector           masked_out = Q6_V_vand_VV(out, nan_inf_mask);
+    const HVX_VectorPred pred       = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
+    // Flagged lanes are returned as zero; every other lane keeps the computed inverse.
+    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
+}
+
 void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const int num_elems) {
     int left_over       = num_elems & (VLEN_FP32 - 1);
     int num_elems_whole = num_elems - left_over;
@@ -32,19 +41,22 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         FARF(HIGH, "hvx_inverse_f32: unaligned loop in hvx op, possibly slower execution\n");
     }
 
+    static const uint32_t kNanInfMask  = 0x7f800000;
+    const HVX_Vector      nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
+
     if (0 == unaligned_loop) {
         HVX_Vector * p_vec_in  = (HVX_Vector *) src;
         HVX_Vector * p_vec_out = (HVX_Vector *) dst;
 
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
-            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++);
+            *p_vec_out++ = hvx_vec_inverse_fp32_guard(*p_vec_in++, nan_inf_mask);
         }
     } else {
         #pragma unroll(4)
         for (int i = 0; i < num_elems_whole; i += VLEN_FP32) {
             HVX_Vector in                            = *(HVX_UVector *) (src + i * SIZEOF_FP32);
-            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in);
+            *(HVX_UVector *) (dst + i * SIZEOF_FP32) = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
         }
     }
 
@@ -53,7 +65,7 @@ void hvx_inverse_f32(const uint8_t * restrict src, uint8_t * restrict dst, const
         float *       dstf = (float *) dst + num_elems_whole;
 
         HVX_Vector in  = *(HVX_UVector *) srcf;
-        HVX_Vector out = hvx_vec_inverse_fp32_guard(in);
+        HVX_Vector out = hvx_vec_inverse_fp32_guard(in, nan_inf_mask);
 
         hvx_vec_store_u((void *) dstf, left_over * SIZEOF_FP32, out);
     }
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 125f08e6fdc..28b0014fb5a 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -726,18 +726,6 @@ static inline HVX_Vector hvx_vec_inverse_fp32(HVX_Vector v_sf) {
     return Q6_Vsf_equals_Vqf32(r_qf);
 }
 
-static inline HVX_Vector hvx_vec_inverse_fp32_guard(HVX_Vector v_sf) {
-    static const uint32_t kNanInfMask = 0x7f800000;
-
-    HVX_Vector out = hvx_vec_inverse_fp32(v_sf);
-
-    const HVX_Vector     nan_inf_mask = Q6_V_vsplat_R(kNanInfMask);
-    HVX_Vector           masked_out   = Q6_V_vand_VV(out, nan_inf_mask);
-    const HVX_VectorPred pred         = Q6_Q_vcmp_eq_VwVw(nan_inf_mask, masked_out);
-
-    return Q6_V_vmux_QVV(pred, Q6_V_vzero(), out);
-}
-
 #define FAST_SIGMOID_LOG2F (0x3fb8aa3b)  // 1.442695022
 #define FAST_SIGMOID_C1    (0x3d009076)  // 0.03138777
 #define FAST_SIGMOID_C2    (0x3e8d74bd)  // 0.276281267